In [None]:
from storage.cherrytree import CherryTree
from utility.config import load_config
from utility.strings import snake_case
from IPython.display import display, Markdown
from pathlib import Path
from functools import reduce
from document.pandoc import PandocArgs, interfile, write_pandoc, stream_pandoc
from document.document import Document 
import dateparser
import pandas as pd
import spacy
import attr
import re

In [None]:
        
nlp = spacy.load('en_core_web_md')

output_path = Path('output')
wsplit = re.compile('\s')
    

In [None]:
def load_outline():
    collections = ('Stories', 'Interviews')
    ct = CherryTree('content_index.ctd')
    dfs = pd.DataFrame([dict(node=n, 
                             identifier=n.id,
                             incident=n.name,
                             story=n.parent.name,
                             parent=next((a.name for a in n.ancestors if a.level==1), None),
                             notes=n.notes
                            ) 
                        for c in collections for n in ct.nodes(c)  if n.level > 2 ]) 
    

    dfs['story_index'] = dfs.index 
    dfs.story_index.astype(int, copy=False)
    
    dfe = pd.DataFrame([
        dict(identifier=l.href,
             sequence=s.name,
             episode=e.name) for e in ct.nodes('Synopsis') 
                              for s in e.children 
                              for l in s.links])  

    dfe['episode_index'] = dfe.index
    
    dfp = dfs.merge(dfe, on='identifier', how='left')
    dfp['episode_index'] = dfp.episode_index.fillna(-1).astype(int)
    dfp.episode.fillna('unplaced', inplace=True)
    dfp.sequence.fillna('unplaced', inplace=True)
    
    docs = [] 

    for node in [n for n in ct.nodes() if n.document]:
        try:
            doc = Document.read_file(node.document)
            docs.append((doc.metadata, doc.content, node.id)) 
        except Exception as e:
            print(e, node.document) 
            
    dfd = pd.DataFrame(docs, columns=['metadata', 'content', 'identifier'])
    dfm = dfd.metadata.apply(pd.Series).fillna('No Data') 
    dfm['timestamp'] = dfm.date.apply(lambda x: dateparser.parse(x))
    return dfp.merge(pd.concat([dfd, dfm], axis=1), on='identifier', how='left').drop(['node', 'metadata'], axis=1)
    


In [None]:
def make_linked_documents(section)
    ct = CherryTree('content_index.ctd')
    for node in [n for n in ct.nodes(section) if n.content if not n.filepath]:
        link = node.insert_document_link('synopsis')
        metadata=dict(title=node.name, date='No Date', status='check')
        write_pandoc(PandocArgs(input=interfile(node.content), 
                                output=link.href, 
                                metadata=metadata))
    ct.save()

In [118]:
def format_content(scn, pscn): 
    if re.match('[1-9]', scn.date):
        data = dict(date=scn.timestamp.strftime("%d %B %Y"))
        data['exact_date'] = True
    else:
        data = dict(date=scn.timestamp.strftime("%B %Y"))
        
        
    if not pscn:
        data['new'] = True 
    elif pscn.sequence != scn.sequence:
        data['new'] = True 
    elif scn.parent == 'Interviews':
        data['return'] = True 
    else:
        time_diff = scn.timestamp - pscn.timestamp
        days = time_diff.days
        if days < -600: 
            data['flashback'] = True
        elif days == 0:
            data['that_day'] == True
        elif days == 1:
            data['next_day'] == True
        elif tine_diff.days < 7:
            date['that_week'] = True
        else:
            data['continue'] = True
    return data, scn.content

def export_episode(episode, output_dir):
    output_path = Path(output_dir)
    df = load_outline()
    pandoc_args = [PandocArgs(input=interfile(),
                                output=interfile(),
                                metadata=dict(episode=episode),
                                template='synopsis')]

    prev_scene = None
    for scene in df[df.episode.str.contains(episode)].sort_values(['episode_index']).itertuples(index=True):
        variables, content = format_content(scene, prev_scene)
                    
        pandoc_args.append(PandocArgs(input=interfile(content),
                                        output=interfile(),
                                        variables=variables,
                                        template='synopsis'))   
            
        prev_scene = scene

    pandoc_args.append(PandocArgs(inputs=[a.output for a in pandoc_args],
                                  output=output_path.joinpath(snake_case(episode)).with_suffix('.md')))

    return write_pandoc(pandoc_args)

In [None]:
ct = CherryTree('content_index.ctd')
columns =['story','incident', 'episode', 'sequence']

df_outline = load_outline(ct)


stories = 'Muharto|Birth Of RI-002|Bob the Best'
query = f'(story.str.contains("{stories}", case=False))'

t = df_outline.query(query).sort_values('episode_index')
t[columns]

In [None]:
output = 'synopsis'
ct = CherryTree('content_index.ctd') 
[n.insert_document_link(output) for n in ct.nodes('Interviews') if n.level == 3]
ct.save()

In [None]:
ct = CherryTree('content_index.ctd') 
dfo = load_outline(ct)
dfo.head(50)

In [None]:
dfo[dfo.duplicated(subset='title', keep='last')][['title', 'sequence']]

In [117]:
ct = CherryTree('content_index.ctd')
export_episode('Episode 1', 'output')

In [77]:
dfo = load_outline()
for row in dfo.itertuples():
    print(re.match(r'[1-9]', row.date))


<re.Match object; span=(0, 1), match='5'>
<re.Match object; span=(0, 1), match='2'>
<re.Match object; span=(0, 1), match='7'>
None
None
None
None
None
None
None
None
None
None
None
<re.Match object; span=(0, 1), match='2'>
None
None
<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(0, 1), match='1'>
None
None
<re.Match object; span=(0, 1), match='8'>
<re.Match object; span=(0, 1), match='8'>
<re.Match object; span=(0, 1), match='3'>
<re.Match object; span=(0, 1), match='3'>
<re.Match object; span=(0, 1), match='3'>
<re.Match object; span=(0, 1), match='3'>
<re.Match object; span=(0, 1), match='7'>
<re.Match object; span=(0, 1), match='8'>
<re.Match object; span=(0, 1), match='8'>
None
None
None
None
None
None
<re.Match object; span=(0, 1), match='2'>
None
None
None
None
<re.Match object; span=(0, 1), match='1'>
None
None
None
None
None
<re.Match object; span=(0, 1), match='2'>
None
<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(0, 1), match='2'>
None


TypeError: expected string or bytes-like object

In [None]:
df[df.synopsis.str.contains('wiry')]