In [2]:
import json
import pandas as pd
import random
import re
from sklearn.model_selection import train_test_split
import nltk
import math
import numpy as np
import os

In [3]:
with open('/raid/deallab/CCE_Data/raw_data/multicite/full_raw.json', 'r') as f:
    full_dataset = json.load(f)

In [4]:
sent_id_lookup = {}
for i , key in enumerate(full_dataset):
    doc = full_dataset[key]['x']
    staged_lookup = []
    for sent in doc:
        staged_lookup.append(sent['sent_id'])
    staged_lookup.sort(key=lambda x: int(x.split('-')[-1]))
    sent_id_lookup[key] = staged_lookup

In [5]:
#helper 
def clean_sentence(sent):
    return re.sub(r'<span style.*>', ' #AUTHOR_TAG ', sent)

In [6]:
sent_lookup = {}
sent_structure = {}
for i , key in enumerate(full_dataset):
    doc = {}
    for sent in full_dataset[key]['x']:
        doc[sent['sent_id']] = clean_sentence(sent['text'])
    ids = sent_id_lookup[key]
    staged_lookup = {}
    staged_structure = {}
    section_id = 0
    section = []
    for id in ids:
        text = doc[id]
        if text[:5] == '-----':
            continue
        elif text[:2] + text[-2:] == '****':
            staged_structure[section_id] = section
            section_id += 1
            section = []
        else:
            staged_lookup[id] = {'text': text, 'section_id': section_id}
            section.append(text)
    staged_structure[section_id] = section
    sent_lookup[key] = staged_lookup
    sent_structure[key] = staged_structure

In [7]:
train, test = train_test_split(list(full_dataset.keys()), test_size=0.2, random_state=99)

In [8]:
INTENT_TOKEN_MAPPING = {'@BACK@': 0, '@MOT@': 1, '@FUT@':2, '@SIM@':3, '@DIF@':4, '@USE@':5, '@EXT@':6, '@UNSURE@':7}
COLUMNS = ['unique_id','citing_id','citing_title','cited_title','cited_authors','section_title','cited_abstract_auto','citation_context','multicite_context','cite_context_paragraph','citation_class_label']

full_df = pd.DataFrame(columns=COLUMNS)

for dataset, name in [(train,'train'), (test,'test')]:
    df = pd.DataFrame(columns=COLUMNS)
    prefix = 'CC' if name == 'train' else 'CCT'
    counter = 0

    for i , key in enumerate(dataset):
        doc = full_dataset[key]['y']
        for intent in doc:
            intent_id = INTENT_TOKEN_MAPPING[intent]
            cite_sent = doc[intent]['cite_sentences']
            gold_cont = doc[intent]['gold_contexts']
            if(len(cite_sent) != len(gold_cont)): 
                print(f'there are not the same amount of gold contexts and citation sentences at document {key}, intent {intent}')
                continue
            for i, cit in enumerate(cite_sent):
                if cit not in sent_lookup[key].keys(): 
                    print(f'cant find sentence id {cit}')
                    continue
                text = sent_lookup[key][cit]['text']
                section_id = sent_lookup[key][cit]['section_id']
                multicite_cont = [sent_lookup[key][id]['text'] for id in gold_cont[i] if id in sent_lookup[key].keys()]
                df.loc[len(df)] = [f'{prefix}{counter}', cit, None, None, None, None, None, text, multicite_cont, sent_structure[key][section_id], intent_id ]
                counter += 1       
    df.to_csv(f'../data/multicite/{name}_raw.txt', sep='\t', index=False)   
    full_df = pd.concat([full_df, df])

there are not the same amount of gold contexts and citation sentences at document ABC_57e65909baf823ff00a9a10a64fffd_15, intent @BACK@
there are not the same amount of gold contexts and citation sentences at document ABC_57e65909baf823ff00a9a10a64fffd_15, intent @SIM@
there are not the same amount of gold contexts and citation sentences at document ABC_57e65909baf823ff00a9a10a64fffd_15, intent @DIF@
there are not the same amount of gold contexts and citation sentences at document ABC_57e65909baf823ff00a9a10a64fffd_15, intent @EXT@
there are not the same amount of gold contexts and citation sentences at document ABC_41de1ad534ca00b7a99260de7bb0b2_2, intent @BACK@
there are not the same amount of gold contexts and citation sentences at document ABC_518d8a8395e38d9971bd51344cf1b8_5, intent @DIF@
there are not the same amount of gold contexts and citation sentences at document ABC_920f2b94270c0711fcc19ad23dbb0d_6, intent @BACK@
there are not the same amount of gold contexts and citation se

OSError: Cannot save file into a non-existent directory: '../data/multicite'

In [34]:
# calculate weights
count =full_df['citation_class_label'].groupby(full_df['citation_class_label']).agg(['count'])['count'].to_list()
[len(full_df) / (len(count) * c) for c in count]

[0.39627130681818185,
 2.27178338762215,
 17.88301282051282,
 1.1386734693877552,
 0.9527834699453552,
 0.42179467795585124,
 2.15258487654321,
 4.603547854785479]