In [1]:
import requests
import json
import multiprocessing as mp
import mag_functions as F

### Testing JSON Read

In [96]:
import json
import re
import os
import pickle
import pandas as pd
# Notebook display settings: allow up to 999 columns and never truncate cell text,
# so wide frames and long abstracts render in full.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", None)

In [3]:
# Prediction output for the test data (per the S3 key), read directly from S3.
predictions_uri = "s3://mag-model-data/V2/iteration_2/test_data/predictions_20220117_lr0006_beta025_gamma28_nH6_nL4_firstD2048_secondD1024_thirdD1024.parquet"
predictions = pd.read_parquet(predictions_uri)

In [5]:
# Sanity check: (rows, columns) of the predictions frame.
predictions.shape

(62517, 10)

In [2]:
# Raw test-data features (per the S3 key); joined onto the predictions by paper_id later.
extra_data_uri = "s3://mag-model-data/V2/raw_test_data/part-00000-tid-1178175042784182311-0b55b08b-d5db-4777-9e6a-572d293f915d-832-1-c000.snappy.parquet"
extra_data = pd.read_parquet(extra_data_uri)

In [6]:
# Sanity check: (rows, columns) of the raw test-data frame.
extra_data.shape

(62524, 10)

In [16]:
# Abstracts for the API test papers (per the S3 key), keyed by paper_id in the merge below.
abstract_test_uri = "s3://mag-model-data/raw_mag_data/test_api_abstracts/part-00000-tid-3075369008470969684-b53b0953-d369-469a-9e2b-aa32d5c70e90-1-1-c000.snappy.parquet"
abstract_test = pd.read_parquet(abstract_test_uri)

In [19]:
# Sanity check: (rows, columns) of the abstracts frame.
abstract_test.shape

(2, 2)

In [12]:
# Left-join the raw test features onto the predictions by paper_id.
# publication_date is removed from the raw frame before the join
# (presumably because predictions already carries it — confirm).
all_test_data = (
    predictions
    .merge(
        extra_data.drop(columns=['publication_date']),
        on='paper_id',
        how='left',
    )
)

In [20]:
# Take the first three papers and attach their abstracts (left join on paper_id,
# so papers without an abstract row are kept).
data = (
    all_test_data
    .head(3)
    .merge(abstract_test, on='paper_id', how='left')
)

In [23]:
# Pick the four fields used as input, rename them to the JSON field names,
# and write them out as a list of records.
json_field_names = {
    'original_title': 'title',
    'doc_type': 'doc_type',
    'journal_name': 'journal',
    'indexed_abstract': 'abstract',
}
json_data = data[['original_title','doc_type','journal_name','indexed_abstract']].rename(
    columns=json_field_names
)
json_data.to_json('test_json_V2.json', orient='records')

#### Reading the JSON back in as if it were an input to the model

In [94]:
# Directory containing the pickled vocabulary files that are loaded from model_path.
model_path = "./model_to_api/container/model_files/"

In [97]:
# Load the dictionaries
def _load_vocab(file_name):
    """Unpickle one vocabulary file located under ``model_path``."""
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point model_path at trusted model artefacts.
    with open(os.path.join(model_path, file_name), "rb") as f:
        return pickle.load(f)


def _invert_vocab(vocab):
    """Return the reverse mapping of ``vocab`` (each value mapped back to its key)."""
    return {j: i for i, j in vocab.items()}


target_vocab = _load_vocab("topics_vocab.pkl")
target_vocab_inv = _invert_vocab(target_vocab)
print("Loaded target vocab")

doc_vocab = _load_vocab("doc_type_vocab.pkl")
doc_vocab_inv = _invert_vocab(doc_vocab)
print("Loaded doc_type vocab")

journal_vocab = _load_vocab("journal_name_vocab.pkl")
journal_vocab_inv = _invert_vocab(journal_vocab)
print("Loaded journal vocab")

title_vocab = _load_vocab("paper_title_vocab.pkl")
title_vocab_inv = _invert_vocab(title_vocab)
print("Loaded title vocab")

# The tag-id vocab is loaded without an inverse mapping or a progress message.
tag_id_vocab = _load_vocab("tag_id_vocab.pkl")

Loaded target vocab
Loaded doc_type vocab
Loaded journal vocab
Loaded title vocab


In [93]:
def invert_abstract_to_abstract(invert_abstract):
    """Rebuild abstract text from a JSON-encoded inverted index.

    Args:
        invert_abstract: JSON string with an ``IndexLength`` entry (number of
            token positions) and an ``InvertedIndex`` entry mapping each word
            to the list of positions where it occurs.

    Returns:
        The words joined by single spaces in position order, or ``None`` when
        ``IndexLength`` is not strictly between 30 and 1000. Positions that no
        word claims keep a single-space placeholder.
    """
    parsed = json.loads(invert_abstract)
    n_positions = parsed['IndexLength']

    # Abstracts outside the accepted length window are discarded.
    if not (30 < n_positions < 1000):
        return None

    slots = [" "] * n_positions
    for word, positions in parsed['InvertedIndex'].items():
        for position in positions:
            slots[position] = word
    return " ".join(slots)

def clean_abstract(abstract, inverted=True):
    """Return the cleaned text of an abstract.

    Args:
        abstract: the abstract, either as a JSON-encoded inverted index or as
            plain text; may be empty or missing.
        inverted: when truthy, a non-empty ``abstract`` is first rebuilt with
            ``invert_abstract_to_abstract``.

    Returns:
        The result of ``clean_text`` applied to the (possibly rebuilt) abstract.
    """
    # Only a non-empty abstract flagged as inverted needs rebuilding first.
    if inverted and abstract:
        abstract = invert_abstract_to_abstract(abstract)
    return clean_text(abstract)

def clean_text(text):
    """Lowercase ``text`` and reduce it to space-separated ASCII alphanumerics.

    Args:
        text: the string to clean. Values that are not text (for example
            ``None`` or a float NaN from a missing field) are treated as missing.

    Returns:
        The cleaned string: lowercased, every run of characters other than
        ASCII letters, digits and spaces replaced by one space, repeated
        spaces collapsed, and leading/trailing spaces stripped. Returns ``""``
        for input that is not text.
    """
    try:
        lowered = text.lower()
        # Anything that is not an ASCII letter, digit or space becomes a space...
        spaced = re.sub('[^a-zA-Z0-9 ]+', ' ', lowered)
        # ...and the resulting runs of spaces are collapsed to one.
        collapsed = re.sub(' +', ' ', spaced)
        return collapsed.strip()
    except (AttributeError, TypeError):
        # Best-effort by design: non-text input cleans to "". Only the errors
        # such input produces are caught, so unrelated failures (and
        # KeyboardInterrupt) are no longer silently swallowed.
        return ""

def try_lowercase(text):
    """Return ``text`` lowercased, or unchanged when it cannot be lowercased.

    Args:
        text: usually a string; values without a ``lower`` method (for example
            ``None`` or a float NaN) are passed through untouched.

    Returns:
        ``text.lower()`` when available, otherwise ``text`` itself.
    """
    try:
        return text.lower()
    except AttributeError:
        # Best-effort by design: non-text values are returned as-is. Only the
        # "no .lower()" case is caught, so other errors are not hidden.
        return text

def tokenize_feature(feature, feature_name='doc_type'):
    """Map a single categorical feature value to a one-element list of token ids.

    Args:
        feature: the raw value to look up (e.g. a doc type or journal name).
        feature_name: ``'doc_type'`` selects ``doc_vocab``; any other value
            selects ``journal_vocab``.

    Returns:
        ``[id]`` where ``id`` is the vocab entry for ``feature``, the ``[UNK]``
        id when the value is not in the vocab, or the ``[NONE]`` id when
        ``feature`` is falsy (``None``, ``""``).
    """
    vocab = doc_vocab if feature_name == 'doc_type' else journal_vocab
    unk_token_id = vocab.get('[UNK]')
    none_token_id = vocab.get('[NONE]')

    # NOTE(review): a float NaN is truthy, so a NaN feature is looked up and
    # ends up as [UNK] rather than [NONE] — confirm that is intended.
    if not feature:
        return [none_token_id]
    return [vocab.get(feature, unk_token_id)]

def tokenize_title(feature):
    """Map a space-separated text to a list of token ids from ``title_vocab``.

    Args:
        feature: the cleaned text to tokenize; must be a string, since it is
            split on single spaces before anything else.

    Returns:
        One id per space-separated word, using the ``[UNK]`` id for words not
        in the vocab; ``[none_id]`` (the ``[NONE]`` id) when ``feature`` is
        the empty string.
    """
    # Split first: a non-string input fails here, before any lookup happens.
    words = feature.split(" ")
    unk_token_id = title_vocab.get('[UNK]')
    none_token_id = title_vocab.get('[NONE]')

    if not feature:
        return [none_token_id]
    return [title_vocab.get(word, unk_token_id) for word in words]

In [103]:
# Read the records-oriented JSON file back into a frame with a fresh 0..n-1 index.
input_df = (
    pd.read_json('test_json_V2.json', orient='records')
    .reset_index(drop=True)
)

In [106]:
# Display the input frame.
# NOTE(review): the saved output of this cell already contains the *_tok columns, so it
# appears to have been captured after the preprocessing cell ran (In [106] vs In [105]) —
# on a top-to-bottom run this cell would show the raw, untokenized frame instead.
input_df

Unnamed: 0,title,doc_type,journal,abstract,inverted_abstract,paper_title_tok,abstract_tok,doc_type_tok,journal_tok
0,a graph theoretic approach to atomic displacements in fullerenes,BookChapter,,the recently developed idea of analyzing complex networks in terms of node displacement due to vibration estrada and hatano chem phys lett 486 166 170 2010a is applied to fullerenes the fact that the ramafullerenes fullerenes of ramanujan graphs are limited to fullerenes with relatively small number of c atoms is explained from the point of view of the node displacement the node displacement is also shown to indicate the stability of isomers of c40 fullerenes it is suggested from the analysis of local node displacement that instability of fullerenes mainly comes from pentagon rich areas of the molecules,True,"[239113, 92190, 154991, 132270, 265955, 114561, 51847, 63104, 256479]","[154201, 47075, 139417, 222311, 200157, 90902, 11995, 148679, 63104, 70469, 200157, 228017, 40377, 154976, 265955, 142298, 126337, 187763, 116221, 96536, 267536, 47194, 102148, 86056, 177531, 124176, 113843, 28292, 265955, 256479, 154201, 19219, 226506, 154201, 1, 256479, 200157, 262080, 2543, 193784, 228014, 265955, 256479, 69677, 266744, 199415, 257011, 200157, 171106, 114599, 113843, 159115, 170361, 154201, 256234, 200157, 130806, 200157, 154201, 228017, 40377, 154201, 228017, 40377, 113843, 271789, 58215, 265955, 119704, 154201, 33975, 200157, 205781, 200157, 224379, 256479, 132276, 113843, 92294, 170361, 154201, 135818, 200157, 11260, 228017, 40377, 226506, 51050, 200157, 256479, 273229, 70498, 170361, 193199, 16992, 124473, 200157, 154201, 205075]",[8],[2]
1,sectional anatomy of the human temporal bone,Journal,folia morphologica,,True,"[58192, 126692, 200157, 154201, 69678, 255482, 2487]",[2],[3],[3165]
2,pakistan response towards terrorism a case study of musharraf regime,Thesis,,the ranging course of terrorism banishing peace and security prospects of today s pakistan is seen as a domestic effluent of its own flawed policies bad governance and lack of social justice and rule of law in society and widening gulf of trust between the rulers and the ruled the study focused on policies and performance of the musharraf government since assuming the mantle of front ranking ally of the united states in its so called war on terror the causes of reversal of pre nine eleven position on afghanistan and support of its taliban s rulers are examined in the light of the geo strategic compulsions of that crucial time and the structural weakness of military rule that needed external props for legitimacy the flaws of the response to the terrorist challenges are traced to its total dependence on the hard option to the total neglect of the human factor from which the thesis develops its argument for a holistic approach to security in which the people occupy a central position thesis approach is also shown to hold the solutions for eliminating the causes of extremism on which terrorism feeds and grows in sum the study deconstructs musharraf s regime s response to terrorism by examining the conceptual mould of the strategic players in the country and postulates a holistic and integrated security framework to deal with terrorism on a pro active and sustainable basis an approach such as this would logically entail the redefining of the role of the state vis a vis its people as the fulcrum and medium of ensuring traditional and non traditional security of the country,True,"[247096, 63112, 86555, 62423, 239113, 199399, 68895, 200157, 280974, 56710]","[154201, 153540, 85053, 200157, 62423, 143063, 154250, 187763, 46948, 86572, 200157, 86558, 278048, 247096, 113843, 23366, 244521, 239113, 221564, 176060, 200157, 135821, 39638, 12022, 87345, 120276, 17672, 187763, 12048, 200157, 142282, 
277329, 187763, 200167, 200157, 273948, 63104, 261129, 187763, 279579, 155008, 200157, 120299, 113846, 154201, 152911, 187763, 154201, 226673, 154201, 68895, 90654, 114560, 87345, 187763, 223011, 200157, 154201, 280974, 239119, 261113, 177625, 154201, 238516, 200157, 130864, 130805, 39637, 200157, 154201, 75272, 147939, 63104, 135821, 237558, 278757, 227997, 114560, 148023, 154201, 24100, 200157, 29538, 200157, 279567, 176808, 228008, 194550, 114560, 239899, 187763, 132277, 200157, 135821, 217437, 278048, 152911, 193784, 246991, 63104, ...]",[6],[2]


In [105]:
# Clean and tokenize every model input column of input_df.
#
# The JSON file written earlier in this notebook only carries title/doc_type/journal/abstract,
# so on a fresh kernel there is no `inverted_abstract` column and the abstract step would
# fail. Default it to True (the same default clean_abstract uses) when it is absent; an
# existing column is left untouched.
if 'inverted_abstract' not in input_df.columns:
    input_df['inverted_abstract'] = True

# NOTE: this cell overwrites `title`, `abstract` and `journal` in place. Re-running it
# without reloading input_df passes already-cleaned abstract text to json.loads (via
# clean_abstract with inverted=True), which will fail — reload input_df first.
input_df['title'] = input_df['title'].apply(clean_text)
input_df['abstract'] = input_df.apply(lambda x: clean_abstract(x.abstract, x.inverted_abstract), axis=1)
input_df['journal'] = input_df['journal'].apply(try_lowercase)

# Titles and abstracts share the title vocab; doc_type and journal each use their own.
input_df['paper_title_tok'] = input_df['title'].apply(tokenize_title)
input_df['abstract_tok'] = input_df['abstract'].apply(tokenize_title)
input_df['doc_type_tok'] = input_df['doc_type'].apply(tokenize_feature, args=('doc_type',))
input_df['journal_tok'] = input_df['journal'].apply(tokenize_feature, args=('journal',))