# code_utils

> Stata-related helper functions with no Jupyter or pystata dependence
- order: 4

These are mostly intended for use in the `noecho` module.

In [None]:
#| default_exp code_utils
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import re
from decimal import Decimal

In [None]:
#| export
from pygments import lexers
from pygments.token import Comment

In [None]:
from fastcore.test import test_eq, ExceptionExpected
from textwrap import dedent

## Handling Stata comments and `#delimit;`

In [None]:
#| hide
#| export
# Single shared pygments lexer instance for Stata code
stata_lexer = lexers.get_lexer_by_name('stata')

def _lex_tokens(code):
    """Lex `code` with the Stata lexer.

    Returns the lexer's iterable of `(index, token_type, text)` tuples.
    """
    token_stream = stata_lexer.get_tokens_unprocessed(code)
    return token_stream

#| hide
* [https://pygments.org/docs/api/#pygments.lexer.Lexer.get_tokens_unprocessed](https://pygments.org/docs/api/#pygments.lexer.Lexer.get_tokens_unprocessed)
* [https://github.com/pygments/pygments/blob/master/pygments/lexers/stata.py](https://github.com/pygments/pygments/blob/master/pygments/lexers/stata.py)
* [https://github.com/pygments/pygments/blob/master/pygments/token.py](https://github.com/pygments/pygments/blob/master/pygments/token.py)

In [None]:
#| export
def remove_comments(code):
    """Return `code` with every token the Stata lexer tags as a comment removed.

    All other token text is concatenated back together unchanged.
    """
    kept_text = []
    for _, token_type, text in _lex_tokens(code):
        if token_type in Comment:
            continue  # any Comment token type (or subtype) is dropped
        kept_text.append(text)
    return "".join(kept_text)

In [None]:
#| hide
list(_lex_tokens('tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19", "AG4B to AGB", "AG5B to AGC")  //"E20 to B20", ", sort'))

[(0, Token.Keyword, 'tab'),
 (3, Token.Text, ' '),
 (4, Token.Text, 's'),
 (5, Token.Text, 'i'),
 (6, Token.Text, 'z'),
 (7, Token.Text, 'e'),
 (8, Token.Keyword, ' if'),
 (11, Token.Text, ' '),
 (12, Token.Name.Function, 'inlist'),
 (18, Token.Text, '('),
 (19, Token.Text, 'r'),
 (20, Token.Text, 't'),
 (21, Token.Text, '_'),
 (22, Token.Text, 'f'),
 (23, Token.Text, 'r'),
 (24, Token.Text, 'o'),
 (25, Token.Text, 'm'),
 (26, Token.Text, '_'),
 (27, Token.Text, 't'),
 (28, Token.Text, 'o'),
 (29, Token.Text, ','),
 (30, Token.Text, ' '),
 (31, Token.Literal.String, '"'),
 (32, Token.Literal.String, 'A'),
 (33, Token.Literal.String, '1'),
 (34, Token.Literal.String, '0'),
 (35, Token.Literal.String, 'T'),
 (36, Token.Literal.String, 'O'),
 (37, Token.Literal.String, 'U'),
 (38, Token.Literal.String, ' '),
 (39, Token.Literal.String, 't'),
 (40, Token.Literal.String, 'o'),
 (41, Token.Literal.String, ' '),
 (42, Token.Literal.String, 'B'),
 (43, Token.Literal.String, '1'),
 (44, Token.L

In [None]:
remove_comments('tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  //"E20 to B20", ", sort')

'tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  '

In [None]:
#| hide
test_eq(remove_comments('*tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  //"E20 to B20", ", sort'), 
        '')
test_eq(remove_comments('tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  /*"E20 to B20", ", sort'),
        'tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  ')
test_eq(remove_comments('tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  /*"E20 to B20", "*/, sort'),
        'tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  , sort')
test_eq(remove_comments('tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")  ///"E20 to B20", "*/ \n , sort'),
        'tab size if inlist(rt_from_to, "A10TOU to B10", "E19 to B19")   , sort')

In [None]:
test_eq(
    remove_comments(dedent("""\
        disp ///
        1""")),
    "disp 1")

Leaves `///` in place when it is not preceded by whitespace, since it is then not treated as a line-continuation comment:

In [None]:
test_eq(
    remove_comments(dedent("""\
        disp///
        1
        """)),
    dedent("""\
        disp///
        1
        """)
)

In [None]:
test_eq(
    remove_comments(dedent("""\
        /*
        blah
        blah
        */
        list var
        """)),
    """\

list var
"""
)

In [None]:
#| export
def _end_block_followed_by_non_comment_block(code):
    """Return True if `code` contains a `*/` and the text after the last one
    does not itself end in an open comment block."""
    last_close_pos = code.rfind('*/')
    if last_close_pos == -1:
        return False
    remainder = code[last_close_pos+2:]
    return not ends_in_comment_block(remainder)

In [None]:
#| export
def ends_in_comment_block(code):
    """Return True if `code` ends inside a `/* ...` block comment left open.

    All three conditions must hold:
    - the last lexed token is a `Comment.Multiline` token,
    - the stripped code does not end with `*/`, and
    - there is no earlier `*/` whose following text is outside a comment block.

    Returns False when the lexer yields no tokens at all.
    """
    tokens = list(_lex_tokens(code))
    if not tokens:
        # Guard against indexing an empty token list (e.g. for an empty string)
        return False
    last_token_type = tokens[-1][1]
    return (
        last_token_type == Comment.Multiline
        and code.strip()[-2:] != "*/"
        and not _end_block_followed_by_non_comment_block(code)
    )

In [None]:
ends_in_comment_block('tab size /*if ')

True

In [None]:
code = 'tab size /*if */\n*'
code[code.rfind('*/')+2:]

'\n*'

In [None]:
test_eq(ends_in_comment_block('tab size /*if '), True)
test_eq(ends_in_comment_block('tab size /*if */'), False)
test_eq(ends_in_comment_block('tab size /*if */\n*'), False)
test_eq(ends_in_comment_block('tab size /*if */\n//'), False)

In [None]:
#| export
def _is_not_cr_delimiter(delimiter):
    return delimiter != 'cr'

In [None]:
#| export
delimit_regex = re.compile(r'^[ \t]*#delimit(.*$)', flags=re.MULTILINE)
def _replace_delimiter(code, sc_delimiter=False):
    # Recursively replace custom delimiter with newline

    split = delimit_regex.split(code.strip(), maxsplit=1)

    if len(split) == 3:
        before = split[0]
        after = _replace_delimiter(split[2], _is_not_cr_delimiter(split[1].strip()))
    else:
        before = code
        after = ''

    if sc_delimiter:
        before_last_sc_pos = before.rfind(';')
        if before_last_sc_pos < len(before.strip()) - 1:
            before = before[:before_last_sc_pos+1]
            if len(split) > 1:
                after = _replace_delimiter(before[before_last_sc_pos+1:]+" ".join(split[1:]), sc_delimiter=True)
        before = before.replace('\r', ' ').replace('\n', ' ')
        before = before.replace(';','\n')

    return before + after

In [None]:
delimit_regex.split(dedent("""\
disp 3
#delimit cr
disp 1
disp 2
"""), maxsplit=1)

['disp 3\n', ' cr', '\ndisp 1\ndisp 2\n']

In [None]:
show_doc(_replace_delimiter)

---

[source](https://github.com/hugetim/nbstata/blob/master/nbstata/code_utils.py#L56){target="_blank" style="float:right; font-size:smaller"}

### _replace_delimiter

>      _replace_delimiter (code, sc_delimiter=False)

Note: the following more-complicated regex would detect valid delimiters plus macros: 
```python
delimit_regex = re.compile(r'#delimit( |\t)+(;|cr|`.+\'|\$_.+|\$.+)')
```
but that's unnecessary, since Stata's `#delimit x` interprets any `x` other than 'cr' as switching the delimiter to ';'.

In [None]:
test_eq(
    _replace_delimiter(dedent("""\
        list var1
        #delimit;
        list var2;list var3;
        list
        var4;
        """)),
    dedent("""\
        list var1
         list var2
        list var3
         list var4
        """)
)

In [None]:
#| hide
test_eq(
    _replace_delimiter(dedent("""\
        list var1
        #delimit;
        list var2;list var3;
        list
        var4
        """)),
    dedent("""\
        list var1
         list var2
        list var3
        """)
)

In [None]:
test_eq(_replace_delimiter(dedent("""\
    disp "start"
    #delimit;
    disp "hello"; disp "hello2";
    disp 
        "hello2a";
    #delimit cr
    disp "hello3"
    disp "hello4"
    #delimit;""")), 
        dedent("""\
    disp "start"
    disp "hello"
     disp "hello2"
     disp      "hello2a"
     disp "hello3"
    disp "hello4"
    """))

In [None]:
#| hide
test_eq(_replace_delimiter(dedent("""\
    disp "start"
    #delimit;
    disp "hello"; disp "hello2";
    disp 
        "hello2a"
    #delimit cr
    disp "hello3"
    disp "hello4"
    #delimit;""")).strip(), 
        dedent("""\
    disp "start"
    disp "hello"
     disp "hello2"
     """).strip())

In [None]:
_replace_delimiter(dedent("""\
disp 3
#delimit cr
disp 1
disp 2
"""), sc_delimiter=True)

''

In [None]:
#| export
def valid_single_line_code(code):
    """Return `code` with comments removed, or '' if what remains starts
    with a `#delimit` directive."""
    uncommented = remove_comments(code)
    return "" if delimit_regex.match(uncommented) else uncommented

In [None]:
test_eq(valid_single_line_code('tab size if inlist(rt_from_to, "A10TOU to B10")  // E20'), 
        'tab size if inlist(rt_from_to, "A10TOU to B10")  ')
test_eq(valid_single_line_code('#delimit ;'), 
        '')
test_eq(valid_single_line_code('#delimit cr'), 
        '')

In [None]:
#| export
def ending_sc_delimiter(code, sc_delimiter=False):
    """Recursively determine whether `;` is the delimiter in effect at the end of `code`.

    Comments are removed first, so a `#delimit` inside a comment is ignored.
    The code is then examined one `#delimit` directive at a time.

    Args:
        code: Stata code to scan.
        sc_delimiter: whether `;` is the delimiter in effect at the start.

    Returns:
        True if `;` is the delimiter after the last effective `#delimit`
        directive, otherwise False. With no directive, `sc_delimiter` is
        returned unchanged.
    """
    code = remove_comments(code)
    # With one capture group and maxsplit=1, split() returns either [code]
    # (no directive found) or [before, directive_argument, rest].
    split = delimit_regex.split(code.strip(), maxsplit=1)
    if len(split) != 3:
        return sc_delimiter

    before, directive_arg, rest = split
    if sc_delimiter:
        before_last_sc_pos = before.rfind(';')
        if before_last_sc_pos < len(before.strip()) - 1:
            # The directive follows an unterminated `;`-mode command, so its
            # text is re-scanned as part of that command with `;` still in effect.
            return ending_sc_delimiter(
                before[before_last_sc_pos+1:] + " ".join(split[1:]),
                sc_delimiter=True,
            )

    return ending_sc_delimiter(rest, _is_not_cr_delimiter(directive_arg.strip()))

In [None]:
test_eq(
    ending_sc_delimiter(dedent("""\
        list var1
        #delimit;
        list var2;list var3;
        """)),
    True)

In [None]:
test_eq(
    ending_sc_delimiter(dedent("""\
        /*
        #delimit;
        */
        disp 1
        disp 2""")),
    False)

In [None]:
#| hide
test_eq(
    ending_sc_delimiter(dedent("""\
        #delimit;
        scalar
        list x""")),
    True)

In [None]:
#| hide
test_eq(
    ending_sc_delimiter(dedent("""\
        #delimit ;""")),
    True)

In [None]:
#| hide
test_eq(ending_sc_delimiter(dedent("""\
    disp "start"
    #delimit;
    disp "hello"; disp "hello2";
    disp 
        "hello2a";
    #delimit cr
    disp "hello3"
    disp "hello4"
    """)), 
    False)

In [None]:
#| hide
test_eq(ending_sc_delimiter(dedent("""\
    disp "start"
    #delimit;
    disp "hello"; disp "hello2";
    disp 
        "hello2a"
    #delimit cr
    disp "hello3"
    disp "hello4"
    """)), 
    True)

In [None]:
#| export

# Detect multiple whitespace: a non-space character followed by one or more spaces
multi_regex = re.compile(r'(?P<char>\S) +')

def standardize_code(code, sc_delimiter=False):
    """Remove comments spanning multiple lines and replace custom delimiters.

    Also collapses runs of interior spaces to a single space, strips
    trailing whitespace from each line, and drops blank lines. Leading
    indentation is kept.

    Args:
        code: raw Stata code.
        sc_delimiter: whether `;` is the delimiter in effect at the start.

    Returns:
        The standardized code as newline-joined lines, with no trailing newline.
    """
    code = remove_comments(code)

    # After removing multi-line comments, which could include "#delimit;"
    code = _replace_delimiter(code, sc_delimiter)

    # Replace multiple interior whitespace with one.
    # The replacement must be a raw string: '\g' is not a valid str escape
    # and a non-raw literal triggers an invalid-escape warning.
    code = multi_regex.sub(r'\g<char> ', code)

    # Delete blank lines and whitespace at end of lines
    stripped_lines = (code_line.rstrip() for code_line in code.splitlines())
    return '\n'.join(cs for cs in stripped_lines if cs)

In [None]:
test_eq(
    standardize_code(dedent("""\
        list var1
        #delimit;
        list var2; list var3;
        list
        var4;
        """)), 
    dedent("""\
        list var1
         list var2
         list var3
         list var4""")
)

In [None]:
#| hide
test_eq(
    standardize_code(dedent("""\
        list var1
        #delimit;
        list var2; list var3;
        list
        var4
        """)), 
    dedent("""\
        list var1
         list var2
         list var3""")
)

In [None]:
test_eq(
    standardize_code(dedent("""\
        /*
        blah
        blah
        */
        list var
        """)), 
    "list var")

In [None]:
#| hide
test_eq(
    standardize_code(dedent("""\
        /*
        #delimit;
        */
        disp 1
        disp 2
        """)), 
    dedent("""\
        disp 1
        disp 2""")
)

In [None]:
#| hide
test_eq(
    standardize_code(dedent("""\
        disp ///
        1
        """)),
    "disp 1")

In [None]:
test_eq(
    standardize_code(dedent("""\
        disp /// comment
        1
        """)),
    "disp 1")

In [None]:
#| hide
test_eq(
    standardize_code(dedent("""\
        list var
        """)), 
    "list var")

In [None]:
test_eq(standardize_code("list    var"), "list var")

In [None]:
standardize_code('''\
display "displayed1"
/*
display "displayed2"
*/
display "displayed3"''')

'display "displayed1"\ndisplay "displayed3"'

In [None]:
#| hide
standardize_code('''\
display "line continuation " /// commented out
    "comment"''')

'display "line continuation " "comment"'

In [None]:
#| hide
test_eq(standardize_code('''\
try:
    print("This works!")'''), '''\
try:
    print("This works!")''')

## Detect version command

In [None]:
#| export
def _startswith_stata_abbrev(string, full_command, shortest_abbrev):
    for j in range(len(shortest_abbrev), len(full_command)+1):
        if string.startswith(full_command[0:j] + ' '):
            return True
    return False

In [None]:
#| hide
test_eq(_startswith_stata_abbrev("q list var", "quietly", "q"), True)
test_eq(_startswith_stata_abbrev("qui list var", "quietly", "q"), True)

In [None]:
#| export
def _remove_prefixes(std_code_line):
    """Strip leading `quietly`/`capture`/`noisily` prefixes from a code line.

    Prefixes may be abbreviated (`qui`..`quietly`, `cap`..`capture`,
    `n`..`noisily`) and may be stacked; all of them are removed, along with
    leading whitespace.

    Args:
        std_code_line: a single line of standardized Stata code.

    Returns:
        The rest of the line after the prefixes, or '' if only whitespace
        follows a prefix.
    """
    std_code_line = std_code_line.lstrip()
    while (_startswith_stata_abbrev(std_code_line, 'quietly', 'qui')
           or _startswith_stata_abbrev(std_code_line, 'capture', 'cap')
           or _startswith_stata_abbrev(std_code_line, 'noisily', 'n')):
        # split() with maxsplit=1 also strips the whitespace before the remainder
        remainder = std_code_line.split(None, maxsplit=1)[1:]
        if not remainder:
            # A prefix followed only by whitespace: nothing is left
            return ''
        std_code_line = remainder[0]
    return std_code_line

In [None]:
#| hide
test_eq(_remove_prefixes("capture noisily program test_program"), "program test_program")

In [None]:
#| hide
"    noisily test_program".split(None, maxsplit=1)

['noisily', 'test_program']

In [None]:
#| hide
_remove_prefixes("    noisily test_program")

'test_program'

In [None]:
#| export
def ending_code_version(code, sc_delimiter=False, code_version=None, stata_version='17.0'):
    """Determine the Stata `version` setting in effect after running `code`.

    The standardized code is scanned from its last line backwards for a
    valid `version X` command, optionally preceded by
    quietly/capture/noisily prefixes. A version is valid if it has at most
    two decimal places and lies between 1 and `stata_version`, inclusive.

    Args:
        code: Stata code to scan.
        sc_delimiter: whether `;` is the delimiter in effect at the start.
        code_version: version in effect before `code`; returned unchanged
            if no valid `version` command is found.
        stata_version: the running Stata version, as a string.

    Returns:
        None if the last valid `version` command equals `stata_version`;
        otherwise that version as a plain decimal string without trailing
        zeros (e.g. '15', '15.1', '10'); or `code_version` if none is found.
    """
    if 'version' not in code:
        return code_version
    std_code = standardize_code(code, sc_delimiter)
    for std_code_line in reversed(std_code.splitlines()):
        if 'version ' not in std_code_line:
            continue
        m = re.match(r'\A\s*version ([0-9]+(?:\.[0-9][0-9]?)?)\Z', _remove_prefixes(std_code_line))
        if m:
            _version = Decimal(m.group(1))
            _stata_version = Decimal(stata_version)
            if Decimal('1') <= _version <= _stata_version:
                if _version == _stata_version:
                    code_version = None
                else:
                    # normalize() drops trailing zeros but can produce an
                    # exponent form (Decimal('10') -> Decimal('1E+1')), so
                    # format with 'f' to always get plain notation.
                    code_version = format(_version.normalize(), 'f')
                break
    return code_version

Based on my trial and error, it seems that Stata's `version` command (as of version 17.0) accepts any number between 1 and your Stata version (inclusive) with up to two decimal places.

In [None]:
test_eq(
    ending_code_version(dedent("#delimit ;")),
    None)
test_eq(
    ending_code_version(dedent(" version 15")),
    "15")
test_eq(
    ending_code_version(dedent("version 15.0")),
    "15")
test_eq(
    ending_code_version(dedent("version 15.1")),
    "15.1")
test_eq(
    ending_code_version(dedent("version 15.141")),
    None)
test_eq(
    ending_code_version(dedent("version 23")),
    None)
test_eq(
    ending_code_version(dedent("version 0.7")),
    None)
test_eq(
    ending_code_version(dedent("version 17")),
    None)
test_eq(
    ending_code_version(dedent("version 17.0")),
    None)
test_eq(
    ending_code_version(dedent("version 17.0"), stata_version="17.00"),
    None)
test_eq(
    ending_code_version(dedent("version 18.0"), stata_version="18.00"),
    None)
test_eq(
    ending_code_version(dedent("version 18.0"), stata_version="18.10"),
    "18")

## Check for specific commands in std_code

In [None]:
#| export
# Regex fragment matching one capture/quietly/noisily prefix, in any accepted abbreviation
pre = (
    r'(cap(t|tu|tur|ture)?'
    r'|qui(e|et|etl|etly)?'
    r'|n(o|oi|ois|oisi|oisil|oisily)?)')
kwargs = dict(flags=re.MULTILINE)
# Matches a line that starts (after optional prefixes) with a command that
# defines a local macro, or an `st_local(` call anywhere in the code.
_local_def_pattern = r"(^\s*({0} )*(loc(a|al)?|tempname|tempvar|tempfile|gettoken|token(i|iz|ize)?|levelsof)\s)|st_local\(".format(pre)
local_def_in = re.compile(_local_def_pattern, **kwargs).search

In [None]:
test_eq(bool(local_def_in(" sysuse auto")), False)
test_eq(bool(local_def_in(" loc auto=1")), True)
test_eq(bool(local_def_in("qui n cap local auto=1")), True)
test_eq(bool(local_def_in("list local auto")), False)
test_eq(bool(local_def_in("tempfile file1")), True)
test_eq(bool(local_def_in(" capture token file1")), True)
test_eq(bool(local_def_in("mata: st_local(test1, 2)")), True)
test_eq(bool(local_def_in("levelsof var1")), True)

In [None]:
#| export
# Matches a `preserve` or `restore` command, optionally behind
# capture/quietly/noisily prefixes, at the start of a line (leading
# whitespace allowed) or directly after a `;`.
# The command word must end the line or be followed by a comma or whitespace
# and then anything, so `restore, not` matches but `restore_x` does not.
preserve_restore_in = re.compile(
    r"(^\s*({0} )*(preserve|restore)([,\s].*)?$)|(;\s*({0} )*(preserve|restore)([,\s].*)?$)".format(pre),
    **kwargs,
).search

In [None]:
test_eq(bool(preserve_restore_in("sysuse auto")), False)
test_eq(bool(preserve_restore_in("preserve")), True)
test_eq(bool(preserve_restore_in("preserve\nkeep in 1")), True)
test_eq(bool(preserve_restore_in("restore,")), True)
test_eq(bool(preserve_restore_in("count\nrestore")), True)
test_eq(bool(preserve_restore_in("gen restore=1")), False)

## Separate out Stata program code
...because [such code](https://www.stata.com/manuals/pprogram.pdf) (as well as [python](https://www.stata.com/stata-news/news35-3/python-blogs/)/[mata](https://www.stata.com/manuals/m-1first.pdf) blocks) is unsuitable for `run_as_program`

In [None]:
#| export
def is_start_of_program_block(std_code_line):
    """Return True if `std_code_line` opens a program, mata, or python block.

    Leading quietly/capture/noisily prefixes are ignored. A line counts as
    a block start when, after prefix removal, it is:
    - exactly `mata`, `mata:`, `python`, or `python:`; or
    - a `program` command (abbreviated down to `pr`) whose next word is not
      a `dir`, `drop`, or `list` subcommand abbreviation.

    A `program` word followed only by whitespace is not a block start.
    """
    cs = _remove_prefixes(std_code_line)
    if cs in {'mata', 'mata:', 'python', 'python:'}:
        return True
    if not _startswith_stata_abbrev(cs, 'program', 'pr'):
        return False
    words = cs.split()
    # Guard the subcommand lookup: "program " has no second word
    return (len(words) > 1
            and words[1] not in ['di', 'dir', 'drop', 'l', 'li', 'lis', 'list'])

In [None]:
test_eq(is_start_of_program_block("capture noisily program test_program"), True)
test_eq(is_start_of_program_block(" capture noisily list var"), False)
test_eq(is_start_of_program_block("pr l display1"), False)

In [None]:
#| export
def _prog_blocks(std_code_lines):
    """Yield consecutive blocks of `std_code_lines`, split at program boundaries.

    A new block begins at each line that starts a program block, and a block
    closes after each line that is exactly 'end'. Every block is a dict
    built by `_block`. A block closed by 'end' is always flagged as a
    program; other blocks carry the in-program state at the time they are
    emitted.
    """
    pending_lines = []
    inside_program = False
    for line in std_code_lines:
        if is_start_of_program_block(line):
            if pending_lines:  # flush the lines gathered before this start
                yield _block(pending_lines, is_prog=inside_program)
                pending_lines = []
            inside_program = True
        pending_lines.append(line)
        if line != 'end':
            continue
        # 'end' closes a program block regardless of inside_program
        yield _block(pending_lines, is_prog=True)
        pending_lines = []
        inside_program = False
    if pending_lines:
        yield _block(pending_lines, inside_program)
        

def _block(block_lines, is_prog):
    return {"is_prog": is_prog, "std_code": '\n'.join(block_lines)}

In [None]:
#| export
def break_out_prog_blocks(code, sc_delimiter=False):
    """Standardize `code` and return it as a list of block dicts, each
    flagged as program or non-program code."""
    std_code = standardize_code(code, sc_delimiter)
    return [*_prog_blocks(std_code.splitlines())]

In [None]:
test_eq(
    break_out_prog_blocks(dedent('''\
        capture program drop ender
        program define ender
            disp "ender output"
        end
        capture program drop display2
        program define display2
            ender
        end
        display2
        ''')),
    [{'is_prog': False, 'std_code': 'capture program drop ender'},
     {'is_prog': True,
      'std_code': 'program define ender\n    disp "ender output"\nend'},
     {'is_prog': False, 'std_code': 'capture program drop display2'},
     {'is_prog': True, 'std_code': 'program define display2\n    ender\nend'},
     {'is_prog': False, 'std_code': 'display2'}]
)

In [None]:
#| hide
test_eq(
    break_out_prog_blocks(dedent('''\
        program define ender
            disp "ender output"
        ''')),
    [{'is_prog': True,
      'std_code': 'program define ender\n    disp "ender output"'}]
)

In [None]:
#| hide
break_out_prog_blocks('''\
display "line continuation " /// commented out
    "comment"''')

[{'is_prog': False, 'std_code': 'display "line continuation " "comment"'}]

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()