In [None]:
import json
import pandas as pd
import os
from names_dataset import NameDataset
import cleanup_utils

# pd.set_option('max_colwidth', None)
# Instantiate the name dataset once and keep module-level handles to its
# first-name and last-name collections; fix_tokens tests membership in them.
nd = NameDataset()
first_names, last_names = nd.first_names, nd.last_names

In [None]:
# Root folder of the GIDS dataset. The notebook reads from `<ROOT_PATH>/raw/`
# and writes to / reads from `<ROOT_PATH>/preprocessed/`.
# Set the GIDS_ROOT_PATH environment variable to point the notebook at another
# location without editing it; the default is the original hard-coded path.
ROOT_PATH = os.environ.get('GIDS_ROOT_PATH', '/data/NLP/GIDS (Google IISc Distant Supervision)')


In [None]:
# Keyword tables used by fix_tokens. Everything is stored lower-cased because
# the lookups are done against ``token.lower()``.
_PRONOUNS = frozenset({
    "i", "you", "he", "she", "me", "him", "her", "mine", "yours", "his",
    "hers", "myself", "yourself", "himself", "herself",
})
_SCHOOL_KEYWORDS = ('college', 'university', 'school')
_LOCATION_KEYWORDS = ('square', 'county', 'beach', 'chicago', 'city', 'island')


def fix_tokens(token: str, ner: str):
    """Correct the NER tag of a single token using simple heuristics.

    Rules, in order of precedence for underscore-joined tokens:

    * a bare ``'_'`` is tagged ``'O'``;
    * a token containing a school keyword is tagged ``'ORGANIZATION'``;
    * otherwise a token containing a location keyword is tagged ``'LOCATION'``;
    * otherwise, if it does not contain ``'agr'`` and its first two
      underscore-separated parts are found in the module-level ``first_names``
      and ``last_names`` collections, it is tagged ``'PERSON'``.

    Independently of the above, any token that is a personal pronoun
    (case-insensitive) is tagged ``'PERSON'``.

    Args:
        token: Token text; multi-word entities are joined with ``'_'``.
        ner: NER tag currently assigned to the token.

    Returns:
        A ``(token, ner)`` tuple: the token unchanged and the possibly
        corrected tag.
    """
    lowered = token.lower()

    if token == '_':
        ner = 'O'
    elif '_' in token:
        parts = token.split('_')
        # Keyword matching is a case-insensitive substring test on the whole token.
        is_school = any(keyword in lowered for keyword in _SCHOOL_KEYWORDS)
        is_location = any(keyword in lowered for keyword in _LOCATION_KEYWORDS)

        if is_school:
            # A school keyword wins over a location keyword.
            ner = 'ORGANIZATION'
        elif is_location:
            ner = 'LOCATION'
        elif 'agr' not in lowered and parts[0] in first_names and parts[1] in last_names:
            ner = 'PERSON'

    # The pronoun table is lower-cased, so "I" (lowered to "i") is matched too.
    if lowered in _PRONOUNS:
        ner = 'PERSON'
    return token, ner

def _sentence_tokens_and_labels(sentence: dict):
    """Return the flat token list and the parallel NER label list of one sentence."""
    tokens, labels = [], []
    for token_metadata in sentence.get('tokens') or []:
        token, ner = fix_tokens(token_metadata.get('originalText'), token_metadata.get('ner'))
        # Skip bare underscore tokens. This must be checked before splitting:
        # '_'.split('_') yields two empty strings, which would otherwise be
        # added as tokens.
        if token == '_':
            continue
        pieces = token.split('_')
        tokens.extend(pieces)
        # Every piece of an underscore-joined token inherits the token's tag.
        labels.extend([ner] * len(pieces))
    return tokens, labels


def load_gids_dataset(path: str):
    """Load a JSON-lines file and attach tokens/labels to its OpenIE relations.

    Each non-blank line is parsed as a JSON object. For every sentence under
    ``corenlp.sentences``, each relation dict in the sentence's ``openie`` list
    gets two new keys: ``'tokens'`` (the sentence tokens, with underscore-joined
    tokens split into their pieces) and ``'labels'`` (the NER tag of each piece
    as corrected by ``fix_tokens``). Every relation of a sentence receives its
    own copy of the same two lists.

    Args:
        path: Path of the JSON-lines file to read (decoded as UTF-8).

    Returns:
        A list with one ``{'relations': [...]}`` dict per input line, where
        ``'relations'`` holds one list of relation dicts per sentence. Lines
        with a missing or null ``corenlp`` entry yield an empty list, and
        sentences with a missing or null ``openie`` entry contribute an empty
        relation list.
    """
    data = []

    with open(path, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline

            record = json.loads(line)
            corenlp = record.get('corenlp') or {}
            entry = {'relations': []}

            for sentence in corenlp.get('sentences') or []:
                sentence_relations = sentence.get('openie') or []

                if sentence_relations:
                    # Tokens and labels depend only on the sentence, so build
                    # them once and hand each relation an independent copy.
                    tokens, labels = _sentence_tokens_and_labels(sentence)
                    for relation in sentence_relations:
                        relation['tokens'] = list(tokens)
                        relation['labels'] = list(labels)

                entry['relations'].append(sentence_relations)

            data.append(entry)

    return data

In [None]:

def _clean_entity(text: str) -> str:
    """Drop one leading underscore, then replace the remaining underscores with spaces."""
    if text.startswith('_'):
        text = text[1:]
    return text.replace('_', ' ')


# Derive both folders from ROOT_PATH so listing, loading and writing can never
# point at different dataset roots.
RAW_DIR = f'{ROOT_PATH}/raw'
PREPROCESSED_DIR = f'{ROOT_PATH}/preprocessed'
os.makedirs(PREPROCESSED_DIR, exist_ok=True)  # to_json does not create missing folders

# Each file in the raw folder is written to the preprocessed folder under the
# same file name.
for _dir in sorted(os.listdir(RAW_DIR)):
    raw_path = f'{RAW_DIR}/{_dir}'
    if not os.path.isfile(raw_path):
        continue  # ignore sub-directories; only files can be loaded

    data = load_gids_dataset(raw_path)

    # `relations` is a list (per sentence) of lists (per relation), so two
    # explodes are needed to reach one relation dict per row; rows without any
    # relation become NaN and are dropped.
    relations = (
        pd.DataFrame(data)
        .explode("relations")
        .explode("relations")
        .dropna()
    )

    # Expand every relation dict into columns and discard the span columns.
    df = (
        relations
        .assign(**relations["relations"].apply(pd.Series))
        .drop(columns=['relations','objectSpan','subjectSpan','relationSpan'])
    )

    df = df.assign(
        labels=df['labels'].apply(cleanup_utils.process_labels),
        object=df['object'].apply(_clean_entity),
        subject=df['subject'].apply(_clean_entity),
    )

    df.reset_index(drop=True).to_json(f'{PREPROCESSED_DIR}/{_dir}')

In [None]:
# Locations of the preprocessed splits.
train_path = f"{ROOT_PATH}/preprocessed/train.json"
dev_path = f"{ROOT_PATH}/preprocessed/dev.json"
test_path = f"{ROOT_PATH}/preprocessed/test.json"

# Read each split back and give it a fresh 0..n-1 index.
train_df = pd.read_json(train_path).reset_index(drop=True)
dev_df = pd.read_json(dev_path).reset_index(drop=True)
test_df = pd.read_json(test_path).reset_index(drop=True)

In [None]:
# Display the first 20 rows of the dev split.
dev_df.head(20)

In [None]:
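# NOTE: disabled snippet. The objectSpan/subjectSpan/relationSpan columns are
# dropped before the preprocessed files are written, so this column selection
# would raise a KeyError on the frames loaded above.
# df_filtered = (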
#     dev_df
#     .loc[:,['relation','objectSpan','subjectSpan','relationSpan','subject','object','tokens','labels']]
# )

# df_filtered.head(10)
