In [246]:
import pandas as pd
import json
import os
import re
from uuid import uuid4
from nltk.tokenize import MWETokenizer
import random

In [247]:
# Download NLTK's 'punkt' data package (original note: needed to use NLTK's tokenizer).
# NOTE(review): the only tokenizer used in the visible cells is MWETokenizer — confirm
# that 'punkt' is actually required before keeping this download.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/deallab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [248]:
# set dir name

# import settings: True -> work on the 'sample' data folder, False -> on the 'result' folder
sample = True

# set input and output dir
if sample:
    input_dir = '../data/sample/2_clean_json'
    output_dir = '../data/sample/3_structured_data'
else:
    input_dir = '../data/result/2_clean_json'
    output_dir = '../data/result/3_structured_data'

# create the output path if it does not exist yet; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(output_dir, exist_ok=True)

In [249]:
# configure the tokenizer: the token sequence '[', 'REF', ']' is merged into
# one token, joined with an empty separator (i.e. '[REF]')
tokenizer = MWETokenizer(mwes=[('[', 'REF', ']')], separator='')

In [250]:
# NOTE(review): scratch expression (evaluates to False); nothing in the visible
# cells uses its result — candidate for removal.
any([False])

False

In [251]:
# helper

#add doc to doc_df and / or return doc uuid
def get_uuid_of_doc(title, pub_year, authors):
    """Return the id of a document, registering it in ``doc_df`` if it is new.

    A document counts as already known when some row of the module-level
    ``doc_df`` has a title that contains ``title`` (literal substring match)
    and an equal ``pub_year``. Otherwise a new row with a fresh uuid is
    appended to ``doc_df``.

    Args:
        title: Document title; a falsy value is replaced by 'unknown'.
        pub_year: Publication year; a falsy value is replaced by 'unknown'.
        authors: Author string; a falsy value is replaced by 'unknown'.
            Authors are stored but not used for matching.

    Returns:
        The ``id`` value of the matching row, or the newly generated uuid string.

    Note:
        Missing values are normalised *before* the lookup, so they compare
        against what is actually stored. As a consequence, documents without
        a title that share a publication year resolve to the same row.
    """
    title = title if title else 'unknown'
    pub_year = pub_year if pub_year else 'unknown'
    authors = authors if authors else 'unknown'

    # regex=False makes this a plain substring test (same effect as escaping the title).
    title_hit = doc_df['title'].str.contains(title, regex=False, na=False)
    # Compare the year on every title match, not only on the first one.
    year_hit = doc_df['pub_year'] == pub_year
    known_ids = doc_df.loc[title_hit & year_hit, 'id']
    if not known_ids.empty:
        return known_ids.iloc[0]

    new_id = str(uuid4())
    doc_df.loc[len(doc_df)] = [new_id, title, pub_year, authors]
    return new_id

# concat_authors
def concat_auth(authors):
    """Flatten a list of author dicts into one string.

    The values of each dict are joined with a single space (``None`` values
    become 'unk'); the resulting per-author strings are joined with ' ,'.

    Args:
        authors: Iterable of dicts whose values are strings or ``None``.
            ``None`` or an empty iterable is accepted.

    Returns:
        The concatenated author string, or '' when there are no authors.
    """
    if not authors:
        return ''
    # NOTE(review): the ' ,' separator (space before the comma) looks like a typo
    # for ', ' but is kept because the string ends up in the exported data.
    return ' ,'.join(
        ' '.join('unk' if part is None else part for part in auth.values())
        for auth in authors
    )

def is_valid_ref(ref, targets):
    """Tell whether a ``<ref>`` tag points at one or more known targets.

    Args:
        ref: Text containing a ``<ref ... target="...">`` tag.
        targets: Mapping whose keys are the known reference ids.

    Returns:
        True if at least one id of the form b<digits> / n<digits> found in
        the ``target`` attribute is a key of ``targets``. False if the tag has
        no ``target`` attribute, if the id 'n999' is among the ids, or if
        none of the ids is known.
    """
    match = re.search(r'<ref.*?target="(.*?)">', ref)  # group 1: raw target attribute
    if match is None:
        # Without a target there is nothing to resolve; report the tag instead of
        # failing on match.group() below.
        print(ref)
        return False
    ref_ids = re.findall(r'(?:b|n)\d{1,3}', match.group(1))
    # NOTE(review): 'n999' is presumably a sentinel for an unresolved note — confirm.
    if 'n999' in ref_ids:
        return False
    return any(ref_id in targets for ref_id in ref_ids)

In [252]:
# Column layout of the three output tables.
ref_columns = ['id', 'ref_loc', 'type', 'cited_doc_id', 'par_id']
par_columns = ['id', 'text', 'section_title', 'doc_id',]
doc_columns = ['id', 'title', 'pub_year', 'authors']

# doc_df has to exist before the loop: get_uuid_of_doc() reads and appends to it.
doc_df = pd.DataFrame(columns=doc_columns)

# Placeholder document that citations with an unknown target id are attributed to.
unk_id = str(uuid4())
doc_df.loc[0] = [unk_id, 'unknown', 'unknown', 'unknown' ]

# Paragraph and reference rows are collected in lists and converted to DataFrames
# once after the loop; appending with df.loc[len(df)] copies the frame for every row.
ref_rows = []
par_rows = []

#load files (sorted so that the processing order does not depend on the file system)
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.json'):
        continue

    input_file_path = os.path.join(input_dir, filename)
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # add origin document
    doc_id = get_uuid_of_doc(data['title'], data['pub_year'], concat_auth(data['authors']))

    # add references
    ref_id_targets = {} # dictionary mapping citation id (b23) to uuid of reference
    for ref in data['references']:
        ref_id_targets[ref['id']] = get_uuid_of_doc(ref['title'], ref['pub_year'], concat_auth(ref['authors']))

    #paragraphs
    for section in data['sections']:
        for paragraph_text in section['paragraphs']:
            par_id = str(uuid4()) # set par id

            # pull the <ref> tags out of the text and leave a [REF] placeholder behind
            references = re.findall(r'(<ref.*?<\/ref>)', paragraph_text)
            cleaned_text = re.sub(r'(<ref.*?<\/ref>)', ' [REF] ', paragraph_text)

            # ';' is the token separator of the stored text, so it must not occur in a token
            cleaned_text = re.sub(r';', ',', cleaned_text)

            # tokenized paragraph text
            tokenized_text = tokenizer.tokenize(cleaned_text.split())

            # replace [REF] tokens with the <ref> tags (minus their target attribute)
            ref_locs = [i for i, token in enumerate(tokenized_text) if '[REF]' in token]
            if len(ref_locs) == len(references):
                for loc, ref in zip(ref_locs, references):
                    tag = re.sub(r' target=".*?"', '', ref)
                    # the tag text may itself contain ';', which would split the token
                    tokenized_text[loc] = tag.replace(';', ',')
            else:
                print(f'The references array is of length {len(references)} while the ref_locs array is of length {len(ref_locs)}')

            # add par to the paragraph rows
            par_rows.append([par_id, ';'.join(tokenized_text), section['section_name'], doc_id])

            # one reference row per <ref> tag, located by its token index
            for loc, ref in zip(ref_locs, references):
                match = re.search(r'type="(.*?)" target="(.*?)"', ref)
                if match is None:
                    # tag without type/target: report it and skip instead of
                    # crashing on match.group()
                    print(ref)
                    continue
                ref_type = match.group(1)
                ref_ids = [m.replace('#', '') for m in match.group(2).split(';')]
                # ids missing from this document's reference list map to the placeholder
                cited_docs = [ref_id_targets.get(ref_id, unk_id) for ref_id in ref_ids]
                ref_rows.append([str(uuid4()), loc, ref_type, ';'.join(str(cited) for cited in cited_docs), par_id])

# create dataframes
ref_df = pd.DataFrame(ref_rows, columns=ref_columns)
par_df = pd.DataFrame(par_rows, columns=par_columns)


In [253]:
# test structured data: every id referenced by ref_df / par_df must exist in its table
par_ids = par_df['id'].array
doc_ids = doc_df['id'].array

# Sets give constant-time membership tests; `in` on the arrays above compares
# against every element for each row.
known_par_ids = set(par_ids)
known_doc_ids = set(doc_ids)

# Dedicated loop names keep the global `doc_id` and the builtin `id` untouched.
for ref_row_id, ref_par_id, cited_doc_ids in zip(ref_df['id'], ref_df['par_id'], ref_df['cited_doc_id']):
    if ref_par_id not in known_par_ids:
        print (f'par id in refs {ref_row_id} not resolved')
    for cited_id in cited_doc_ids.split(';'):
        if cited_id == 'unknown':
            continue
        if cited_id not in known_doc_ids:
            print (f'doc id {cited_id} in refs {ref_row_id} not resolved')

for par_row_id, par_doc_id in zip(par_df['id'], par_df['doc_id']):
    if par_doc_id not in known_doc_ids:
        print(f'doc id in pars {par_row_id} not resolved')


In [254]:
# write each table to <output_dir>/<name>.csv without the DataFrame index
for table_name, table in (('ref', ref_df), ('par', par_df), ('doc', doc_df)):
    table.to_csv(output_dir + '/' + table_name + '.csv', index=False)

In [255]:
# number of distinct paragraphs that have one or more reference rows
ref_df['par_id'].unique().size

1137