# utils

> Stata-related helper functions with no Jupyter or pystata dependence

In [None]:
#| default_exp utils
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import test_eq

In [None]:
#| export
import re
import sys
import os

## Stata code parsers

In [None]:
#| export
# Splits a (stripped) Stata command into three optional named groups:
#   code - everything before the qualifiers (it may not itself start with if/in)
#   if   - the `if` qualifier, including the whitespace in front of it
#   in   - the `in` qualifier, including the whitespace in front of it
# The `\b` in front of each keyword stops a name that merely ends in "if" or
# "in" (e.g. "list origin in 1/5") from being split in the middle of the word.
# Only the `code if ... in ...` ordering is recognized.
parse_code_if_in_regex = re.compile(
    r'\A(?P<code>(?!if\s)(?!\sif)(?!in\s)(?!\sin).+?)?(?P<if>\s*\bif\s+.+?)?(?P<in>\s*\bin\s.+?)?\Z',
    flags=re.DOTALL | re.MULTILINE
)

In [None]:
#| export
def parse_code_if_in(code):
    """Split a line of Stata code into its `code`, `if`, and `in` parts.

    Returns a dict with the keys 'code', 'if' and 'in'. A part that is not
    present in the input is returned as an empty string.
    """
    matched = parse_code_if_in_regex.match(code.strip())
    if not matched:
        # Nothing could be parsed: return the input as-is with no qualifiers.
        return {'code': code, 'if': '', 'in': ''}
    # Groups that did not participate in the match come back as None.
    return {
        part: (text if isinstance(text, str) else '')
        for part, text in matched.groupdict().items()
    }

In [None]:
code = "list var1 if var1==0 in 1/10"
test_eq(parse_code_if_in(code), {'code': 'list var1', 'if': ' if var1==0', 'in': ' in 1/10'})

Not robust to reversing the if/in order:

In [None]:
parse_code_if_in("list var1 in 1/10 if var1==0")

{'code': 'list var1', 'if': '', 'in': ' in 1/10 if var1==0'}

In [None]:
#| export
def in_range(stata_in_code, count=None):
    """Return the (start, end) range described by a Stata `in` qualifier.

    `stata_in_code` is the `in` qualifier text, e.g. " in 1/10". The result is
    `(first - 1, last)`, so " in 1/10" gives `(0, 10)`. `f` as the first bound
    means observation 1 and `l` as the last bound means the final observation.

    `count` is the number of observations used to resolve `l`. When it is
    omitted, the total is requested from Stata through `sfi`, so resolving `l`
    without `count` only works inside a running Stata session.

    Qualifiers without a `/` (a single observation, or the unsupported
    `#(#)#` increment form) return `(None, None)`.
    Raises ValueError if a bound is neither an integer nor `f`/`l`.
    """
    range_code = stata_in_code.strip()
    # Drop the `in` keyword whether or not whitespace precedes it.
    if range_code[:2] == 'in' and range_code[2:3].isspace():
        range_code = range_code[2:].strip()
    start, slash, end = range_code.partition('/')
    if not slash:
        return (None, None)
    start, end = start.strip(), end.strip()
    if start == 'f':
        start = 1
    if end == 'l':
        if count is None:
            # Imported lazily so this module stays importable without Stata.
            import sfi
            count = sfi.Data.getObsTotal()
        end = count
    return (int(start) - 1, int(end))

In [None]:
test_eq(in_range(" in 1/10"), (0, 10))

Non-default increments are not supported:

In [None]:
in_range(" in 1(2)10")

(None, None)

In [None]:
#| export
class Selectvar():
    """Class for generating Stata selectvar for getAsDict

    Given the text of a Stata `if` qualifier, creates a temporary Stata
    variable equal to 1 where the condition holds and 0 elsewhere, and stores
    that variable's name in `varname`. With an empty condition nothing is
    created and `varname` stays None.

    `pystata` and `sfi` are imported only when Stata is actually needed, so
    the module can be imported (and an empty condition handled) without them.
    """

    varname = None

    def __init__(self, stata_if_code):
        # Strip only a leading `if` keyword; a plain str.replace('if ', ...)
        # could instead hit the first "if " inside the condition itself.
        condition = re.sub(r'\A\s*if\b\s*', '', stata_if_code).strip()
        if condition:
            from pystata import stata
            import sfi
            cmd = f"tempvar __selectionVar\ngenerate `__selectionVar' = cond({condition},1,0)"
            stata.run(cmd, quietly=True)
            self.varname = sfi.Macro.getLocal("__selectionVar")

    def clear(self):
        """Remove temporary selectvar from Stata dataset"""
        if self.varname is not None:
            from pystata import stata
            stata.run(f"capture drop {self.varname}", quietly=True)

`Selectvar.varname` holds the name of a temporary Stata variable, for use as the `selectvar` argument of `sfi.Data.getAsDict`.

In [None]:
show_doc(Selectvar.clear)

---

[source](https://github.com/hugetim/nbstata/blob/main/nbstata/utils.py#L59){target="_blank" style="float:right; font-size:smaller"}

### Selectvar.clear

>      Selectvar.clear ()

Remove temporary selectvar from Stata dataset

In [None]:
#| export

# Detect comments spanning multiple lines: a `///` line continuation (through
# the end of its line) or a `/* ... */` block comment.
# The block-comment body uses the single character class `[\s\S]` ("any
# character, newlines included"). The earlier `(.|\s)` alternation matched the
# same text, but both branches accept a space, so an unterminated `/*` followed
# by spaces sent the regex engine into exponential backtracking.
# The capture-group layout is unchanged.
comment_regex = re.compile(r'((\/\/\/)(.)*(\n|\r)|(\/\*)([\s\S])*?(\*\/))')

def _remove_multi_line_comments(code):
    """Replace each `///` continuation and each `/* */` comment with one space."""
    return comment_regex.sub(' ',code)

In [None]:
test_eq(
    _remove_multi_line_comments("""disp ///
1"""
                    ),
    "disp  1")

In [None]:
test_eq(
    _remove_multi_line_comments("""/*
blah
blah
*/
list var"""
                  ), 
    """ 
list var""")

Note: the following more-complicated regex would detect valid delimiters plus macros: 
```python
delimit_regex = re.compile(r'#delimit( |\t)+(;|cr|`.+\'|\$_.+|\$.+)')
```
but that's unnecessary, since Stata's `#delimit x` interprets any `x` other than 'cr' as switching the delimiter to ';'.

In [None]:
#| export
def is_cr_delimiter(delimiter):
    """Return True when `delimiter` is 'cr' or None, False otherwise."""
    cr_markers = {None, 'cr'}
    return delimiter in cr_markers

In [None]:
#| export
# Matches a `#delimit` directive and captures the rest of its line.
delimit_regex = re.compile(r'#delimit(.*$)', flags=re.MULTILINE)
def _replace_delimiter(code, starting_delimiter=None):
    """Rewrite `code` so that every command ends with a newline.

    `starting_delimiter` is the delimiter in effect at the start of `code`
    (None or 'cr' for newline-delimited, anything else for `;`). The code is
    processed recursively, one `#delimit` directive at a time; the directive
    lines themselves are dropped from the result.

    In a `;`-delimited segment a line break is only whitespace, so it is turned
    into a space (not deleted, which would glue "list\\nvar1;" into "listvar1")
    before each `;` becomes a newline.
    """
    split = delimit_regex.split(code.strip(),maxsplit=1)

    if len(split) == 3:
        # text before the directive, the directive's argument, the remainder
        before = split[0]
        after = _replace_delimiter(split[2],split[1].strip())
    else:
        before = code
        after = ''

    if not is_cr_delimiter(starting_delimiter):
        before = before.replace('\r\n', '\n').replace('\r', '\n').replace('\n', ' ')
        # One command per line, without the padding left by the line breaks.
        before = '\n'.join(command.strip() for command in before.split(';'))

    return before + after

In [None]:
test_eq(
    _replace_delimiter(
        """list var1
#delimit;
list var2;list var3"""
                  ), 
    """list var1
list var2
list var3""")

In [None]:
#| export
def ending_delimiter(code, starting_delimiter=None):
    """Return the delimiter in effect after `code` has run.

    Returns ';' or None (None meaning the default newline delimiter).
    `starting_delimiter` is the delimiter in effect before `code`; it decides
    the result when `code` contains no `#delimit` directive. Directives inside
    multi-line comments are ignored.
    """
    code = _remove_multi_line_comments(code)
    # Only the last directive matters, so a single scan replaces the previous
    # recursive split (whose two-element branch could never be reached:
    # splitting once on a one-group pattern yields either 1 or 3 parts).
    directives = delimit_regex.findall(code)
    delimiter = directives[-1].strip() if directives else starting_delimiter
    return None if is_cr_delimiter(delimiter) else ';'

In [None]:
test_eq(
    ending_delimiter(
        """list var1
#delimit;
list var2;list var3"""
                  ), 
    ';')

In [None]:
test_eq(
    ending_delimiter("""
/*
#delimit;
*/
disp 1
disp 2"""
                  ), 
    None)

In [None]:
#| export

# Detect runs of space characters
multi_regex = re.compile(r' +')

def standardize_code(code, starting_delimiter=None):
    """Remove comments spanning multiple lines and replace custom delimiters

    Also collapses each run of spaces into one space, trims every line, and
    drops lines that are empty after trimming.
    """
    # Comments go first: a commented-out "#delimit;" must not be acted upon.
    without_comments = _remove_multi_line_comments(code)
    newline_delimited = _replace_delimiter(without_comments, starting_delimiter)
    single_spaced = multi_regex.sub(' ', newline_delimited)
    trimmed_lines = (line.strip() for line in single_spaced.splitlines())
    return '\n'.join(line for line in trimmed_lines if line)

In [None]:
test_eq(
    standardize_code("""
list var1
#delimit;
list var2; list var3;
"""
                  ), 
    """list var1
list var2
list var3""")

In [None]:
test_eq(
    standardize_code("""
/*
blah
blah
*/
list var
"""
                  ), 
    "list var")

In [None]:
test_eq(
    standardize_code("""
/*
#delimit;
*/
disp 1
disp 2"""
                  ), 
    """disp 1
disp 2""")

In [None]:
test_eq(
    standardize_code("""
disp ///
1
"""
                    ),
    "disp 1")

In [None]:
test_eq(
    standardize_code("""
disp /// comment
1
"""
                    ),
    "disp 1")

In [None]:
test_eq(
    standardize_code("""
 list var
"""
                  ), 
    "list var")

In [None]:
test_eq(standardize_code("list    var"), "list var")

In [None]:
#| export
def _startswith_stata_abbrev(string, full_command, shortest_abbrev):
    for j in range(len(shortest_abbrev), len(full_command)+1):
        if string.startswith(full_command[0:j] + ' '):
            return True
    return False

In [None]:
test_eq(_startswith_stata_abbrev("q list var", "quietly", "q"), True)

In [None]:
test_eq(_startswith_stata_abbrev("qui list var", "quietly", "q"), True)

In [None]:
#| export
def _remove_prog_prefixes(cs):
    """Strip leading `quietly`, `capture` and `noisily` prefixes from a command.

    Each prefix may be abbreviated (down to `qui`, `cap` and `n`), and several
    prefixes may be stacked. Returns the rest of the command, or an empty
    string if nothing follows the prefixes.
    """
    while (_startswith_stata_abbrev(cs, 'quietly', 'qui')
           or _startswith_stata_abbrev(cs, 'capture', 'cap')
           or _startswith_stata_abbrev(cs, 'noisily', 'n')):
        parts = cs.split(None, maxsplit=1)
        if len(parts) < 2:
            # A prefix followed only by whitespace: no command is left.
            return ''
        cs = parts[1]
    return cs

In [None]:
test_eq(_remove_prog_prefixes("capture noisily program test_program"), "program test_program")

In [None]:
#| export
def is_start_of_program_block(std_code_line):
    """Return True if a standardized code line opens a multi-line block.

    That is the case for a `program` definition (after any
    quietly/capture/noisily prefixes, with `program` abbreviated down to `pr`)
    and for a bare `mata`, `mata:`, `python` or `python:` line.
    `program dir`, `program drop` and `program list` only manage existing
    programs, so they do not open a block, however `program` is abbreviated.
    """
    cs = _remove_prog_prefixes(std_code_line)
    if cs in {'mata', 'mata:', 'python', 'python:'}:
        return True
    if not _startswith_stata_abbrev(cs, 'program', 'pr'):
        return False
    words = cs.split()
    subcommand = words[1] if len(words) > 1 else ''
    # `list` may be shortened to `l`, `li` or `lis`; `dir` to `di`.
    is_list = bool(subcommand) and 'list'.startswith(subcommand)
    return not (is_list or subcommand in {'di', 'dir', 'drop'})

In [None]:
test_eq(is_start_of_program_block("capture noisily program test_program"), True)
test_eq(is_start_of_program_block("capture noisily list var"), False)

In [None]:
#| export
def break_out_prog_blocks(code, starting_delimiter=None):
    """Split standardized code into consecutive blocks.

    Returns a list of dicts with the keys "is_prog" and "std_code". The lines
    from the start of a program block through its `end` line form one block
    with "is_prog" True; the lines gathered elsewhere form blocks with
    "is_prog" False.
    """
    blocks = []
    pending = []

    def _flush(is_prog):
        # Emit the gathered lines as one block and start collecting afresh.
        blocks.append({"is_prog": is_prog, "std_code": '\n'.join(pending)})
        del pending[:]

    for line in standardize_code(code, starting_delimiter).splitlines():
        # A new program block closes off whatever ordinary lines came before it.
        if is_start_of_program_block(line) and pending:
            _flush(False)

        pending.append(line)

        # `end` closes the block it belongs to.
        if line == 'end':
            _flush(True)

    if pending:
        _flush(False)
    return blocks

In [None]:
break_out_prog_blocks('''
capture program drop ender
program define ender
    disp "ender output"
end
capture program drop display2
program define display2
    ender
end
display2
''')

[{'is_prog': False, 'std_code': 'capture program drop ender'},
 {'is_prog': True,
  'std_code': 'program define ender\ndisp "ender output"\nend'},
 {'is_prog': False, 'std_code': 'capture program drop display2'},
 {'is_prog': True, 'std_code': 'program define display2\nender\nend'},
 {'is_prog': False, 'std_code': 'display2'}]

## Other utils

[A way to block print statements](https://stackoverflow.com/a/45669280/10637373):

In [None]:
#| export
class HiddenPrints:
    """Context manager that discards `print` output inside the `with` block.

    On entry `sys.stdout` is pointed at the null device; on exit the previous
    `sys.stdout` is put back and the null-device handle is closed.
    """
    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference: the body may rebind sys.stdout, and only the
        # handle opened here should be closed on exit.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable even if closing the handle fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [None]:
with HiddenPrints():
    print("test print output")

In [None]:
#| export
def print_red(text):
    """Print `text` wrapped in the ANSI escape codes for red and for reset."""
    red_template = "\x1b[31m{}\x1b[0m"
    print(red_template.format(text))

print_red source: https://stackoverflow.com/a/16816874/10637373

In [None]:
print_red("test_red")

[31mtest_red[0m


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()