# main ideas
1. use txt files to store all necessary information
2. build a parser that reads my lines dynamically
3. build a cv builder that decides what elements are displayed

In [56]:
test_entry = """
    @RAship $3000CAD 201509-201806
        quantitative analysis
        completed 100% of the tasks on time.
        other_specifications:: I don't know
    PI:: Dr. Owen Lo
        #goodfornothing #quant
    
    
    @AnotherRAship $3000/hr 201509-201806
        quantitative analysis
        completed 100% of the tasks on time.
        other_specifications:: I don't know
    #goodfornothing #quant
    
    
"""

In [57]:
import re

def lex(text):    
    cleaned_text = re.sub("\s+", " ", text).split()
    for token in cleaned_text:
        t = token.strip()
        m_title = re.match(r"@([^\r\n]+)", t)
        m_money = re.match(r"\$(\d+)[^\r\n]{0,6}(\w\wD)?", t)
        m_yyyymm = re.match(r"(\d{4,6})[-~]?(\d{4,6})?", t)
        m_tags = re.match(r"\#([^\r\n]+)", t)
        # self-defined categories
        m_other_specs = re.match(r"([^\r\n]+)::", t)
        if m_title: yield "title", m_title.group(1)
        elif  m_money: 
            yield "money", m_money.group(1)
            if m_money.group(2) is not None: 
                yield "money", m_money.group(2)
        elif m_yyyymm:
            yield "start_ym", m_yyyymm.group(1)
            if m_yyyymm.group(2) is not None:
                yield "end_ym", m_yyyymm.group(2)
            else:
                yield "end_ym", "present"
        elif m_tags: yield "tag", m_tags.group(1)
        elif m_other_specs: yield m_other_specs.group(1), ""
        else:
            yield "content", t

list(lex(test_entry))

[('title', 'RAship'),
 ('money', '3000'),
 ('start_ym', '201509'),
 ('end_ym', '201806'),
 ('content', 'quantitative'),
 ('content', 'analysis'),
 ('content', 'completed'),
 ('content', '100%'),
 ('content', 'of'),
 ('content', 'the'),
 ('content', 'tasks'),
 ('content', 'on'),
 ('content', 'time.'),
 ('other_specifications', ''),
 ('content', 'I'),
 ('content', "don't"),
 ('content', 'know'),
 ('PI', ''),
 ('content', 'Dr.'),
 ('content', 'Owen'),
 ('content', 'Lo'),
 ('tag', 'goodfornothing'),
 ('tag', 'quant'),
 ('title', 'AnotherRAship'),
 ('money', '3000'),
 ('start_ym', '201509'),
 ('end_ym', '201806'),
 ('content', 'quantitative'),
 ('content', 'analysis'),
 ('content', 'completed'),
 ('content', '100%'),
 ('content', 'of'),
 ('content', 'the'),
 ('content', 'tasks'),
 ('content', 'on'),
 ('content', 'time.'),
 ('other_specifications', ''),
 ('content', 'I'),
 ('content', "don't"),
 ('content', 'know'),
 ('tag', 'goodfornothing'),
 ('tag', 'quant')]

In [59]:
def parse(lexemes):
    syntax_tree = {}
    current_title, current_subtitle, current_streak = "", "", ""
    predefined_labels = ["start_ym", "end_ym", "title", "salary", "tag", "content"]
    for k, v in lexemes:
        if k == 'title':
            syntax_tree[v] = {}
            current_title = v
            current_subtitle = ""
            content_streak = ""
        elif k not in predefined_labels: # user-specified subtitle
            current_subtitle = k
            content_streak = ""
        elif k == "content":
            content_streak += v + " "
            if current_subtitle == "":
                syntax_tree[current_title]['content'] = content_streak
            else:
                syntax_tree[current_title][current_subtitle] = content_streak
        elif k == "tag":
            if "tags" not in syntax_tree[current_title].keys():
                syntax_tree[current_title]['tags'] = [v]
            else:
                syntax_tree[current_title]['tags'].append(v)    
        else:
            syntax_tree[current_title][k] = v
    return syntax_tree

parse(lex(test_entry))

{'RAship': {'start_ym': '201509',
  'end_ym': '201806',
  'money': 'quantitative analysis completed 100% of the tasks on time. ',
  'other_specifications': "I don't know ",
  'PI': 'Dr. Owen Lo ',
  'tags': ['goodfornothing', 'quant']},
 'AnotherRAship': {'start_ym': '201509',
  'end_ym': '201806',
  'money': 'quantitative analysis completed 100% of the tasks on time. ',
  'other_specifications': "I don't know ",
  'tags': ['goodfornothing', 'quant']}}