# parsers

> Stata-related helper functions with no Jupyter or pystata dependence

In [None]:
#| default_exp parsers
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import test_eq

In [None]:
#| export
import re

In [None]:
#| export
# Splits one Stata command into three optional, consecutive named groups:
#   code - the command itself (must not start with an `if`/`in` keyword)
#   if   - an `if` qualifier, including its leading whitespace
#   in   - an `in` qualifier, including its leading whitespace
# The groups are matched in that fixed order (code, then if, then in).
# The adjacent raw literals below concatenate into a single pattern string.
parse_code_if_in_regex = re.compile(
    r'\A'
    r'(?P<code>(?!if\s)(?!\sif)(?!in\s)(?!\sin).+?)?'
    r'(?P<if>\s*if\s+.+?)?'
    r'(?P<in>\s*in\s.+?)?'
    r'\Z',
    flags=re.DOTALL | re.MULTILINE,
)

In [None]:
#| export
def parse_code_if_in(code):
    """Split a line of Stata code into its command, `if` and `in` parts.

    Returns a dict with the keys 'code', 'if' and 'in'. A part that is
    absent from `code` is returned as an empty string. The `if` and `in`
    parts keep their keyword and leading whitespace.
    """
    found = parse_code_if_in_regex.match(code.strip())
    if not found:
        # Fallback: treat the whole (unstripped) input as the command
        return {'code': code, 'if': '', 'in': ''}
    # Groups that did not participate in the match come back as None
    return {part: (text if isinstance(text, str) else '')
            for part, text in found.groupdict().items()}

In [None]:
# The `if` and `in` qualifiers are split off the command; each keeps its leading whitespace
code = "list var1 if var1==0 in 1/10"
test_eq(parse_code_if_in(code), {'code': 'list var1', 'if': ' if var1==0', 'in': ' in 1/10'})

The parser is not robust to reversing the `if`/`in` order; when `in` comes first, the `if` qualifier ends up inside the `in` part:

In [None]:
# With `in` before `if`, the 'in' part swallows the trailing `if` qualifier
parse_code_if_in("list var1 in 1/10 if var1==0")

{'code': 'list var1', 'if': '', 'in': ' in 1/10 if var1==0'}

In [None]:
#| export
def in_range(stata_in_code, count=None):
    """Return in-statement range as a zero-based, end-exclusive `(start, end)` pair.

    Parameters
    ----------
    stata_in_code : str
        A Stata `in` qualifier such as ``" in 1/10"``. The leading ``in``
        keyword is optional and may be surrounded by any whitespace.
    count : int or callable, optional
        Total number of observations, or a zero-argument callable returning
        it. Only consulted when a bound is ``l`` (last observation).

    Returns
    -------
    tuple
        ``(start - 1, end)`` for a ``start/end`` range, so Stata's 1-based
        inclusive ``1/10`` becomes ``(0, 10)``. ``(None, None)`` when the
        code contains no ``/`` (e.g. empty input or ``1(2)10``).

    Raises
    ------
    ValueError
        If a bound is neither an integer nor ``f``/``l``, or if ``l`` is
        used without `count` being supplied.
    """
    # Remove only a *leading* `in` keyword, whatever whitespace surrounds it
    stata_range_code = re.sub(r'\A\s*in\s+', '', stata_in_code).strip()
    start, slash, end = stata_range_code.partition('/')
    if not slash:
        return (None, None)

    def _obs_number(token):
        # Translate one range bound into a 1-based observation number
        token = token.strip()
        if token in ('f', 'F'):
            return 1
        if token in ('l', 'L'):
            if count is None:
                raise ValueError(
                    "`count` (the number of observations) is required "
                    "to resolve 'l' in an `in` range"
                )
            return int(count() if callable(count) else count)
        return int(token)

    first = _obs_number(start)
    last = _obs_number(end)
    return (first - 1, last)

In [None]:
# Stata's 1-based range `1/10` maps to the pair (0, 10): the start is shifted down by one
test_eq(in_range(" in 1/10"), (0, 10))

Non-default increments are not supported:

In [None]:
# There is no '/' in this range, so no bounds are extracted
in_range(" in 1(2)10")

(None, None)

In [None]:
#| export
class Selectvar():
    """Class for generating Stata selectvar for getAsDict

    Given a Stata `if` qualifier, creates a temporary 0/1 indicator variable
    in the current Stata dataset and stores its name in `varname`. When the
    qualifier is empty, nothing is created and `varname` stays `None`.

    `pystata` and `sfi` are imported lazily, only when Stata is actually
    needed, so this module stays importable without a configured Stata.
    """

    # Name of the temporary Stata variable; None when no condition was given
    varname = None

    def __init__(self, stata_if_code):
        """Create the indicator variable for `stata_if_code` (e.g. ' if x==1')."""
        # Strip only a leading `if` keyword; an 'if ' occurring later inside
        # the condition (e.g. in a variable name ending in "if") is kept.
        condition = re.sub(r'\A\s*if\s+', '', stata_if_code).strip()
        if condition:
            from pystata import stata
            from sfi import Macro
            cmd = f"tempvar __selectionVar\ngenerate `__selectionVar' = cond({condition},1,0)"
            stata.run(cmd, quietly=True)
            # The local macro holds the actual name Stata gave the tempvar
            self.varname = Macro.getLocal("__selectionVar")

    def clear(self):
        """Remove temporary selectvar from Stata dataset"""
        if self.varname is not None:
            from pystata import stata
            stata.run(f"capture drop {self.varname}", quietly=True)

`Selectvar.varname` holds the name of a temporary Stata variable for use in `sfi.Data.getAsDict`; it is `None` when no `if` condition was given.

In [None]:
show_doc(Selectvar.clear)

---

[source](https://github.com/hugetim/nbstata/blob/main/nbstata/helpers.py#L38){target="_blank" style="float:right; font-size:smaller"}

### Selectvar.clear

>      Selectvar.clear ()

Remove temporary selectvar from Stata dataset

In [None]:
#| export
### Regex's for clean_code() ###
# Detect delimiter. This would detect valid delimiters plus macros:
# delimit_regex = re.compile(r'#delimit( |\t)+(;|cr|`.+\'|\$_.+|\$.+)')
# but it's unnecessary, since Stata's #delimit x interprets any x other 
# than 'cr' as switching the delimiter to ';'.
# The single group captures the rest of the `#delimit` line.
delimit_regex = re.compile(r'#delimit(.*$)', flags=re.MULTILINE)
# Detect comments spanning multiple lines: `///` up to and including the
# line break, or `/* ... */`.
# `[\s\S]` matches any character, exactly like the former `(.|\s)`, but
# without two overlapping alternatives: the old form backtracked
# exponentially on an unclosed `/*` followed by whitespace. The group
# numbering is unchanged.
comment_regex = re.compile(r'((\/\/\/)(.)*(\n|\r)|(\/\*)([\s\S])*?(\*\/))')
# Detect left Whitespace (spaces directly after a newline)
left_regex = re.compile(r'\n +')
# Detect Multiple whitespace (runs of spaces)
multi_regex = re.compile(r' +')

def clean_code(code, noisily=False):
    """
    Remove comments spanning multiple lines and replace custom delimiters

    Processing steps, in order:
      1. code governed by a `#delimit` other than `cr` is rewritten so that
         each `;`-terminated command sits on its own line, and the
         `#delimit` lines themselves are dropped;
      2. `///` continuations and `/* ... */` comments are replaced by a space;
      3. spaces at the start of a line are removed, and runs of spaces are
         collapsed to one;
      4. if `noisily` is true, every line is prefixed with 'noisily ',
         except lines starting with quietly/noisily/}/forv/foreach/while
         and lines inside a `program define` ... `end` block.

    Returns the cleaned code as a string.
    """
    # Line starts that must not receive a 'noisily ' prefix
    no_prefix_starts = ('quietly', 'noisily', '}', 'forv', 'foreach', 'while')

    def _replace_delimiter(code, delimiter=None):
        # Recursively replace custom delimiter with newline.
        # `delimiter` is the argument of the `#delimit` line that governs
        # `code`, up to the next `#delimit` (None before the first one).
        split = delimit_regex.split(code.strip(), maxsplit=1)

        if len(split) == 3:
            before = split[0]
            after = _replace_delimiter(split[2], split[1].strip())
        else:
            before = code
            after = ''

        if delimiter is not None and delimiter != 'cr':
            # Under `#delimit ;` a line break only separates tokens, so it
            # becomes a space (deleting it would fuse `reg y\nx` into
            # `reg yx`). Each `;` then ends a command.
            joined = before.replace('\r', '').replace('\n', ' ')
            before = '\n'.join(cmd.strip() for cmd in joined.split(';'))

        return before + after

    # Apply custom delimiter
    code = _replace_delimiter(code)

    # Delete comments spanning multiple lines
    code = comment_regex.sub(' ', code)

    # Delete whitespace at start of line
    code = left_regex.sub('\n', code)

    # Replace multiple whitespace with one
    code = multi_regex.sub(' ', code)

    # Add 'noisily' to each newline
    if noisily:
        prefixed = []
        in_program = False
        for line in code.splitlines():
            stripped = line.strip()

            # Are we starting a program definition?
            if 'program define' in stripped:
                in_program = True

            if not (in_program or stripped.startswith(no_prefix_starts)):
                line = 'noisily ' + line
            prefixed.append(line)

            # Are we ending a program definition? (checked after the prefix
            # decision, so the `end` line itself is left unprefixed)
            if stripped.startswith('end'):
                in_program = False

        code = '\n'.join(prefixed)

    return code

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()