In [1]:
"""
collate info for documents to speed things up downstream
"""
import argparse
import pandas as pd
from datetime import datetime as dt
import ujson as json
from glob import glob
from crisis_points import crisis_points
from frequency_utils import list_crisis_docs
import os

#%%
def time_index(docs, lang=None, verbose=False):
    """Build a table of publication dates keyed by article identifier.

    Each path in ``docs`` is opened as UTF-8 and parsed as a single JSON
    object. The article's ``publication_date`` is divided by 1e3 before being
    handed to ``datetime.fromtimestamp`` (i.e. it is treated as milliseconds),
    and the resulting timestamp is stored under the article's ``an`` value.

    Parameters
    ----------
    docs : sequence of str
        Paths of the JSON article files; ``len(docs)`` is used for the
        progress counter.
    lang : str, optional
        When truthy, articles whose ``language_code`` differs from ``lang``
        are skipped.
    verbose : bool, default False
        Print a running "i of N processed" counter on a single line.

    Returns
    -------
    pandas.DataFrame
        One row per retained article, indexed by ``an``, with a ``date``
        column. When nothing is retained the frame is empty and has no
        columns.

    Notes
    -----
    Articles that cannot be processed (missing key, unusable timestamp, ...)
    are reported on stdout and skipped; they do not abort the run.
    ``datetime.fromtimestamp`` is called without a time zone, so dates are in
    the local time of the machine running the code.
    """
    doc_details = {}
    tot = len(docs)
    for i, doc in enumerate(docs):
        if verbose:
            print('\r{} of {} processed'.format(i, tot), end='')
        # Only the read needs the file handle; close it before processing.
        with open(doc, 'r', encoding='utf-8') as f:
            art = json.loads(f.read())
        try:
            if lang and art['language_code'] != lang:
                continue
            date = pd.to_datetime(dt.fromtimestamp(art['publication_date'] / 1e3))
            doc_details[art['an']] = {'date': date}
        except Exception as e:
            # Report the failure without assuming anything about the exception
            # type or about which key was missing (it may be 'an' itself), so
            # the handler can never raise and kill the whole batch.
            doc_id = art.get('an', doc) if isinstance(art, dict) else doc
            print('{}: {!r}'.format(doc_id, e))
    data = pd.DataFrame(doc_details).T
    return data
#%%

def period_info(doc_deets):
    """Attach week, month and quarter period columns derived from ``date``.

    The frame is modified in place (columns ``week``, ``month`` and
    ``quarter`` are added or overwritten) and the same object is returned.
    """
    stamps = pd.DatetimeIndex(doc_deets['date'])
    # Column name -> pandas period frequency alias, in output column order.
    for column, freq in (('week', 'W'), ('month', 'M'), ('quarter', 'Q')):
        doc_deets[column] = stamps.to_period(freq)
    return doc_deets


def label_crisis(data, path, verbose=False, period='crisis'):
    """Add a 0/1 ``crisis`` column to ``data`` and return the same frame.

    For every key of ``crisis_points``, ``list_crisis_docs`` is asked for the
    matching document paths. Each path is reduced to its basename with
    ``".json"`` removed, and rows of ``data`` whose index equals one of those
    identifiers get ``crisis = 1``; all other rows keep ``crisis = 0``.

    ``data`` is modified in place. ``path`` and ``period`` are passed through
    to ``list_crisis_docs`` unchanged; ``verbose`` prints the country being
    processed.
    """
    data['crisis'] = 0
    flagged_ids = []
    for country in crisis_points.keys():
        if verbose:
            print("\nworking on {}...".format(country))
        country_docs = list_crisis_docs(country, path, doc_data=data, period=period)
        flagged_ids.extend(
            os.path.basename(doc_path).replace(".json", '')
            for doc_path in country_docs
        )
    is_crisis = data.index.isin(flagged_ids)
    data.loc[is_crisis, 'crisis'] = 1
    return data

class args_class(object):
    """Plain container for the run settings used by the notebook cells.

    Attributes: ``in_dir``, ``out_dir``, ``period`` (default ``'crisis'``)
    and ``verbose`` (default ``True``), each stored exactly as passed.
    """

    def __init__(self, in_dir, out_dir, period='crisis', verbose=True):
        self.in_dir, self.out_dir = in_dir, out_dir
        self.period, self.verbose = period, verbose
        
#%%



In [2]:
# Run settings for this notebook: input folder of article JSON files and
# output folder for the collated metadata.
args = args_class(in_dir='../cleaned_small', out_dir='../data/doc_meta', verbose=False)

# Index every JSON file in the input folder, keeping English articles only.
doc_paths = glob('{}/*.json'.format(args.in_dir))
deets = time_index(doc_paths, lang='en', verbose=args.verbose)

 

In [4]:
# Sanity check: display the input directory configured above.
args.in_dir


'../cleaned_small'

In [None]:
# Add week/month/quarter period columns, then flag the crisis documents.
deets = period_info(deets)
deets = label_crisis(deets, path=args.in_dir, verbose=args.verbose, period=args.period)

# to_pickle does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(args.out_dir, exist_ok=True)
deets.to_pickle(os.path.join(args.out_dir, 'doc_details_{}.pkl'.format(args.period)))