In [1]:
from storage.cherrytree import CherryTree
from utility.config import load_config
from utility.strings import snake_case
from IPython.display import display, Markdown
from pathlib import Path
from functools import reduce
from document.pandoc import PandocArgs, interfile, write_pandoc, stream_pandoc
from document.document import Document 
import dateparser
import pandas as pd
import spacy
import attr
import re

In [None]:
        
# Load the spaCy `en_core_web_md` pipeline once for the whole notebook.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook.
nlp = spacy.load('en_core_web_md')

# Default output directory, relative to the working directory.
# export_episode() builds its own local `output_path` from its argument.
output_path = Path('output')
# Matches a single whitespace character. A raw string is used so that `\s`
# reaches the regex engine verbatim instead of being parsed (and warned about)
# as an invalid Python string escape.
wsplit = re.compile(r'\s')
    

In [2]:
def load_outline(ct=None):
    """Build one outline DataFrame from the CherryTree index and its documents.

    Four tables are assembled and outer-merged on ``identifier``:

    * stories    -- nodes from ``ct.nodes('Stories')`` with ``level > 2``
                    (``incident``, ``story``, ``notes``, ``story_index``)
    * interviews -- nodes from ``ct.nodes('Interviews')`` with ``level > 2``
                    (``interview``, ``subject``, ``notes``)
    * episodes   -- one row per link found under
                    ``ct.nodes('Synopsis')`` -> children -> links
                    (``sequence``, ``episode``, ``episode_index``); the link's
                    ``href`` is used as the ``identifier``
    * documents  -- one row per node whose ``document`` could be read
                    (``metadata``, ``content``), with the metadata expanded
                    into additional columns

    Stories and interviews both carry a ``notes`` column, so after merging
    they appear as ``notes_x`` and ``notes_y`` (pandas' default suffixes).

    Parameters
    ----------
    ct : CherryTree, optional
        An already opened tree. When omitted, ``'content_index.ctd'`` is
        opened, which keeps zero-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The merged outline with every missing value replaced by ``'No Data'``.
    """
    if ct is None:
        ct = CherryTree('content_index.ctd')

    dfs = pd.DataFrame([dict(identifier=n.id,
                             incident=n.name,
                             story=n.parent.name,
                             notes=n.notes)
                        for n in ct.nodes('Stories') if n.level > 2])
    # Row position in the order ct.nodes() yielded the story nodes.
    dfs['story_index'] = dfs.index

    dfi = pd.DataFrame([dict(identifier=n.id,
                             interview=n.name,
                             subject=n.parent.name,
                             notes=n.notes)
                        for n in ct.nodes('Interviews') if n.level > 2])

    dfe = pd.DataFrame([dict(identifier=l.href,
                             sequence=s.name,
                             episode=e.name)
                        for e in ct.nodes('Synopsis')
                        for s in e.children
                        for l in s.links])
    # Row position in synopsis order; used by callers for sorting.
    dfe['episode_index'] = dfe.index

    docs = []
    # Materialise the node list first, exactly as before, so that reading
    # documents cannot interfere with the tree traversal.
    for node in [n for n in ct.nodes() if n.document]:
        try:
            doc = Document.read_file(node.document)
        except Exception as exc:
            # Best effort: report the unreadable document (with its cause)
            # and carry on with the remaining nodes.
            print('error loading document for', node.name, repr(exc))
            continue
        docs.append((doc.metadata, doc.content, node.id))

    dfd = pd.DataFrame(docs, columns=['metadata', 'content', 'identifier'])
    # Expand each metadata mapping into its own columns, aligned by row index.
    dfm = dfd.metadata.apply(pd.Series)
    dft = pd.merge(dfd, dfm, left_index=True, right_index=True)

    merged = reduce(
        lambda left, right: pd.merge(left, right, on='identifier', how='outer'),
        [dfs, dfi, dfe, dft],
    )
    return merged.fillna('No Data')


    


In [None]:
def make_linked_documents(section):
    """Write a pandoc document for each node in ``section`` that lacks one.

    Opens ``'content_index.ctd'``, selects the nodes returned by
    ``ct.nodes(section)`` that have ``content`` but no ``filepath``, inserts a
    ``'synopsis'`` document link on each, writes the node's content to the
    link's ``href`` through ``write_pandoc`` and finally saves the tree.

    Parameters
    ----------
    section : str
        Section name passed to ``ct.nodes``.
    """
    ct = CherryTree('content_index.ctd')
    # Collect the nodes up front: links are inserted while looping, so the
    # selection must not depend on the tree's state during the loop.
    pending = [n for n in ct.nodes(section) if n.content and not n.filepath]
    for node in pending:
        link = node.insert_document_link('synopsis')
        # Placeholder metadata; 'check' presumably marks the document for
        # manual review -- TODO confirm.
        metadata = dict(title=node.name, date='No Date', status='check')
        write_pandoc(PandocArgs(input=interfile(node.content),
                                output=link.href,
                                metadata=metadata))
    ct.save()

In [26]:
# def format_content(scn, pscn): 
    
#     data = {}
#     if scene.timestamp.__class__.__name__ == 'Timestamp'
#         if re.match('[1-9]', scn.date):
#             date = scn.timestamp.strftime("%d %B %Y")
#             data['exact_date'] = True 
#         else:
#             date = scn.timestamp.strftime("%B %Y")
#     else:
#          date = 'No Date'
            
#     data['date'] = date
        
        
#     if not pscn:
#         data['new'] = True 
#     elif pscn.sequence != scn.sequence:
#         data['new'] = True 
#     elif scn.subject:
#         data['return'] = True 
#     else:
#         time_diff = scn.timestamp - pscn.timestamp
#         days = time_diff.days
#         if days < -600: 
#             data['flashback'] = True
#         elif days == 0:
#             data['that_day'] = True
#         elif days == 1:
#             data['next_day'] = True
#         elif days < 7:
#             data['that_week'] = True
#         else:
#             data['continue'] = True
#     return data, scn.content

def export_episode(episode, output_dir):
    """Work-in-progress exporter for the scenes of one episode.

    At present the function only prepares data: it loads the outline, keeps
    the rows whose ``episode`` column contains ``episode``, derives a
    ``timestamp`` column from ``date`` and prints one diagnostic row. The
    pandoc export itself is still commented out below, so nothing is written
    and ``None`` is returned.

    Parameters
    ----------
    episode : str
        Pattern matched against the ``episode`` column with
        ``str.contains`` (a regular-expression substring match by default).
        NOTE(review): 'Episode 1' therefore also matches e.g. 'Episode 10'.
    output_dir : str or Path
        Directory intended for the exported file; not used yet.
    """
    output_path = Path(output_dir)

    # Header arguments for the (still disabled) pandoc pipeline.
    # NOTE(review): interfile() may allocate resources -- confirm before
    # removing this scaffolding.
    pandoc_args = [PandocArgs(input=interfile(),
                                output=interfile(),
                                metadata=dict(episode=episode),
                                template='synopsis')]

    prev_scene = None
    df = load_outline()
    scn = df[df.episode.str.contains(episode)].copy()
    # Only real strings are handed to dateparser; anything else has no timestamp.
    scn['timestamp'] = scn.date.apply(
        lambda x: dateparser.parse(x) if isinstance(x, str) else None)
    no_dates = scn[scn.timestamp.isna()]

    # Debugging aid: show one fixed outline row. Guarded so that a shorter
    # outline no longer raises IndexError.
    debug_row = 162
    if len(df) > debug_row:
        print(df.iloc[debug_row])
#     print(no_dates[['incident', 'date', 'timestamp', 'episode']])

#     if no_dates.any():
#         for scene in scn[any(scn.timestamp.isna())].itertuples():
#             print(scene.title, 'has not date') 
#         return False
    
    
    
    
#     .sort_values(['episode_index']).itertuples():
#         variables, content = format_content(scene, prev_scene)
        
                    
#         pandoc_args.append(PandocArgs(input=interfile(content),
#                                         output=interfile(),
#                                         variables=variables,
#                                         template='synopsis'))   
            
#         prev_scene = scene

#     pandoc_args.append(PandocArgs(inputs=[a.output for a in pandoc_args],
#                                   output=output_path.joinpath(snake_case(episode)).with_suffix('.md')))

#     return stream_pandoc(pandoc_args)

In [None]:
# Columns shown in the story overview at the end of this cell.
columns = ['story', 'incident', 'episode', 'sequence']

# Called without arguments, load_outline() opens 'content_index.ctd' itself,
# so no separate tree handle is needed in this cell.
df_outline = load_outline()

# Alternation of story names; str.contains treats it as a regular expression.
stories = 'Muharto|Birth Of RI-002|Bob the Best'
query = f'(story.str.contains("{stories}", case=False))'

# NOTE(review): load_outline() fills gaps with 'No Data', so sorting may fail
# if a selected row has no numeric episode_index -- confirm on real data.
t = df_outline.query(query).sort_values('episode_index')
t[columns]

In [None]:
# Attach a document link to every level-3 node under 'Interviews',
# then persist the modified tree.
output = 'synopsis'
ct = CherryTree('content_index.ctd')
for interview_node in ct.nodes('Interviews'):
    if interview_node.level == 3:
        interview_node.insert_document_link(output)
ct.save()

In [None]:
# Called without arguments, load_outline() opens 'content_index.ctd' itself.
dfo = load_outline()
dfo.head(50)

In [None]:
# Rows whose title occurs again later in the outline (keep='last' leaves the
# final occurrence unflagged), reduced to the two columns of interest.
dfo.loc[dfo.duplicated(subset='title', keep='last'), ['title', 'sequence']]

In [27]:
# NOTE(review): `ct` is not passed on; export_episode() calls load_outline(),
# which opens 'content_index.ctd' on its own.
ct = CherryTree('content_index.ctd')
# Run the export for 'Episode 1' with 'output' as the output directory argument.
export_episode('Episode 1', 'output')


identifier                                                    1001
incident                                                   No Data
story                                                      No Data
notes_x                                                    No Data
story_index                                                No Data
interview                                      Penniless in Manila
subject                                                    Muharto
notes_y          • Stuck in Manila for three months\n• VT-CLA s...
sequence                                     Cameron Meets Muharto
episode                                                  Episode 1
episode_index                                                   12
metadata                                                   No Data
content                                                    No Data
title                                                      No Data
date                                                       No 

In [71]:
# Reload the outline and add a parsed `timestamp` column next to `date`.
dfo = load_outline()
story = 'Shabby'
# dfo[(dfo.synopsis.isna()) & (dfo.story.str.contains(story))].incident
# dfo[(dfo.synopsis.isna())].story.unique()

# Values that are not plain strings get no timestamp; strings go to dateparser.
dfo['timestamp'] = dfo['date'].apply(
    lambda raw: None if type(raw) is not str else dateparser.parse(raw)
)
dfo.iloc[100]

identifier                                                     971
incident                                    Julia Learns of Emeria
story                                           Alternate Herstory
notes_x          Document \n☐ Explain  Victor finds shot of Eme...
story_index                                                      1
interview                                                      NaN
subject                                                        NaN
notes_y                                                        NaN
sequence                                                    Emeria
episode                                                  Episode 3
episode_index                                                   43
metadata         {'title': 'Julia Learns of Emeria', 'date': '2...
content          \n\n\nVictor searches for more shots of Batavi...
title                                       Julia Learns of Emeria
date                                                2 October 

In [76]:
# NOTE(review): this filter is neither assigned nor the last expression of the
# cell, so its result is discarded; only the row below is displayed.
dfo[dfo.subject.notna()]
dfo.iloc[154]

identifier                                                     982
incident                                                       NaN
story                                                          NaN
notes_x                                                        NaN
story_index                                                    NaN
interview                                 Cameron Meets Boediardjo
subject                                                 Boediardjo
notes_y          Document \n October 1988\n• First boedi meetin...
sequence                                   Reunion With Boediardjo
episode                                                  Episode 2
episode_index                                                   32
metadata         {'title': 'Cameron Meets Boediardjo', 'date': ...
content          \nMuharto takes Cameron to his long-delayed re...
title                                     Cameron Meets Boediardjo
date                                                5 October 

In [9]:
# List every node under 'Stories' whose linked document cannot be read.
ct = CherryTree('content_index.ctd')
for node in [n for n in ct.nodes('Stories') if n.document]:
    try:
        Document.read_file(node.document)
    except Exception as exc:
        # Catch only real errors (not KeyboardInterrupt/SystemExit) and show
        # the cause next to the node name.
        print(node.name, repr(exc))
        