In [1]:
# %pip install tensorflow==2.4.1
# %pip install transformers
# %pip install pyarrow
# %pip install fsspec
# %pip install s3fs
# %pip install boto3
# %pip install tensorflow-addons

In [2]:
import tensorflow as tf
import pandas as pd
import pickle
import os
import tensorflow_addons as tfa
from math import ceil
# from tensorflow.keras.utils import plot_model
# from transformers import RobertaTokenizer, RobertaTokenizerFast, TFRobertaModel, TFAlbertModel

# AUTO = tf.data.experimental.AUTOTUNE

2022-01-19 13:22:15.349713: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# Name of the iteration sub-directory; interpolated into the ./data/..., ./models/... and S3
# paths used by the cells below.
model_iteration = 'iteration_2'

In [4]:
def _load_vocab(vocab_name):
    """Unpickle ./data/{model_iteration}/vocab/{vocab_name}_vocab.pkl and return its content.

    NOTE: pickle.load can execute arbitrary code — only use this on trusted files.
    """
    with open(f"./data/{model_iteration}/vocab/{vocab_name}_vocab.pkl", "rb") as handle:
        return pickle.load(handle)


def _invert(vocab):
    """Return a new dict with the keys and values of `vocab` swapped."""
    return {value: key for key, value in vocab.items()}


# Each vocabulary is loaded once and paired with its inverse lookup.
target_vocab = _load_vocab("topics")
target_vocab_inv = _invert(target_vocab)

doc_vocab = _load_vocab("doc_type")
doc_vocab_inv = _invert(doc_vocab)

journal_vocab = _load_vocab("journal_name")
journal_vocab_inv = _invert(journal_vocab)

title_vocab = _load_vocab("paper_title")
title_vocab_inv = _invert(title_vocab)

In [5]:
len(target_vocab)

65026

In [6]:
len(title_vocab)

282998

##### Short code to create the ID mapping (topic index → field-of-study ID)

In [7]:
# Table of field-of-study names and IDs, read from the current working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so the file was not
# present when the notebook was last run and the remaining cells of this section were not executed.
tag_ids = pd.read_parquet("fields_of_study_ids.parquet")

FileNotFoundError: [Errno 2] No such file or directory: 'fields_of_study_ids.parquet'

In [None]:
# Parallel lists taken from the same rows of tag_ids: names[k] corresponds to ids[k].
names = tag_ids['normalized_name'].to_list()
ids = tag_ids['field_of_study_id'].to_list()

In [None]:
# Lookup from normalized name to field-of-study ID; if a name occurs more than once,
# the last occurrence wins.
name_to_id = dict(zip(names, ids))

In [None]:
# Map every key of target_vocab_inv to the field-of-study ID of its associated name.
# The lookup is a plain dict index, so a name missing from name_to_id raises KeyError.
id_dict = {i:name_to_id[j] for i,j in target_vocab_inv.items()}

In [None]:
len(id_dict)

In [None]:
# with open(f"./data/{model_iteration}/vocab/tag_id_vocab.pkl", "wb") as f:
#     pickle.dump(id_dict, f)

In [None]:
# Read the pickled tag-ID vocabulary back from disk (the matching dump cell is commented out above).
# NOTE: pickle.load can execute arbitrary code — only use it on trusted files.
with open(f"./data/{model_iteration}/vocab/tag_id_vocab.pkl", "rb") as f:
    test_dict = pickle.load(f)

#### Getting the model

In [7]:
# Dense (sparse=False) category-encoding layer sized to the topic vocabulary plus one extra slot.
# NOTE(review): the extra slot is presumably for an index outside the vocabulary (e.g. padding) — confirm.
# This layer is not referenced by any later cell shown in this notebook.
encoding_layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
    max_tokens=len(target_vocab)+1, output_mode="binary", sparse=False)


In [8]:
# Identifier of the training run; it names the model directory here and is reused in the
# prediction filenames written further down.
model_str = "20220117_lr0006_beta025_gamma28_nH6_nL4_firstD2048_secondD1024_thirdD1024"
# Load the saved Keras model from the run's checkpoint directory
# (presumably the epoch-30 checkpoint, going by the directory name).
mag_model = tf.keras.models.load_model(f'./models/{model_iteration}/{model_str}/model_epoch30ckpt/')

2022-01-19 13:22:30.374199: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-01-19 13:22:30.375285: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-01-19 13:22:31.004048: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-19 13:22:31.005711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:16.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2022-01-19 13:22:31.005797: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node ze

In [9]:
mag_model.inputs

[<KerasTensor: shape=(None, 32) dtype=int64 (created by layer 'paper_title_ids')>,
 <KerasTensor: shape=(None, 256) dtype=int64 (created by layer 'abstract_ids')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'doc_type_id')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'journal_id')>]

In [10]:
# Wrap the loaded model so that a forward pass returns only the 30 largest output values
# together with their indices.
# NOTE(review): `mag_model.outputs` is passed whole rather than as a single tensor — presumably a
# one-element list, which would explain the `[0]` indexing in get_model_predictions; confirm.
final_model = tf.keras.Model(inputs=mag_model.inputs, 
                             outputs=tf.math.top_k(mag_model.outputs, k=30))

In [11]:
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
paper_title_ids (InputLayer)    [(None, 32)]         0                                            
__________________________________________________________________________________________________
abstract_ids (InputLayer)       [(None, 256)]        0                                            
__________________________________________________________________________________________________
title_embedding (Embedding)     multiple             18111936    paper_title_ids[0][0]            
                                                                 abstract_ids[0][0]               
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 32, 64)       0           title_embedding[0][0]        

In [12]:
def get_all_model_predictions(data_path, num_samples=500, verbose=True):
    """Run the model over every ``part*`` parquet file found in ``data_path``.

    All matching files are read in sorted filename order and stacked into a
    single DataFrame, which is then passed to ``get_model_predictions`` in
    consecutive batches of ``num_samples`` rows.

    Args:
        data_path: Directory containing the ``part*`` parquet files. A trailing
            path separator is optional.
        num_samples: Number of rows sent to the model per batch.
        verbose: When True, print the batch index before each batch is run.

    Returns:
        The stacked DataFrame with two added columns, ``predictions`` and
        ``scores``, holding one list per row as returned by
        ``get_model_predictions``. Row labels are kept exactly as read from the
        files, so they may repeat across files. If no file matches, an empty
        DataFrame with just those two columns is returned.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    file_names = sorted(x for x in os.listdir(data_path) if x.startswith('part'))

    # Read every file first and concatenate once; growing a DataFrame inside the
    # loop re-copies all previously loaded rows on each iteration.
    frames = [pd.read_parquet(os.path.join(data_path, file_name)) for file_name in file_names]
    full_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    preds_final = []
    scores_final = []
    total_rows = full_df.shape[0]
    for batch_no, start in enumerate(range(0, total_rows, num_samples)):
        if verbose:
            print(batch_no)
        # Positional slice; the copy keeps the batch independent of full_df.
        small_df = full_df.iloc[start:start + num_samples, :].copy()
        preds, scores = get_model_predictions(small_df)
        preds_final.extend(preds)
        scores_final.extend(scores)

    # The lists are in row order, so they line up positionally with full_df.
    full_df['predictions'] = preds_final
    full_df['scores'] = scores_final

    return full_df


In [13]:
def cut_length(data, seq_len=512):
    """Return the leading ``seq_len`` items of ``data``.

    ``data`` can be any object that supports slicing; inputs shorter than
    ``seq_len`` come back unchanged in content.
    """
    head = slice(None, seq_len)
    return data[head]

In [14]:
def get_model_predictions(input_data, top_n=20):
    """Run ``final_model`` on one batch of tokenized rows.

    Unlike the earlier version, ``input_data`` is left untouched: truncation is
    done on local lists instead of being written back into its columns.

    Args:
        input_data: DataFrame with the columns ``paper_title_tok``,
            ``abstract_tok``, ``doc_type_tok`` and ``journal_tok``.
        top_n: How many of the model's top-k entries to keep per row. The
            wrapped model returns 30 per row, so larger values are capped there.

    Returns:
        A ``(preds, scores)`` tuple of nested lists with one inner list per
        input row: the top indices and the matching score values.
    """
    # Truncate to the lengths of the model's title (32) and abstract (256) inputs.
    titles = [cut_length(tokens, 32) for tokens in input_data['paper_title_tok']]
    abstracts = [cut_length(tokens, 256) for tokens in input_data['abstract_tok']]

    # Ragged -> dense conversion pads shorter sequences up to the fixed width.
    paper_title = tf.ragged.constant(titles).to_tensor(shape=[None, 32])
    abstract = tf.ragged.constant(abstracts).to_tensor(shape=[None, 256])

    doc_types = tf.convert_to_tensor(input_data['doc_type_tok'].to_list())
    journal = tf.convert_to_tensor(input_data['journal_tok'].to_list())

    model_output = final_model([paper_title, abstract, doc_types, journal])

    # The top_k result has a leading axis that is stripped with [0] before
    # slicing the per-row entries.
    scores = model_output.values.numpy()[0][:, :top_n].tolist()
    preds = model_output.indices.numpy()[0][:, :top_n].tolist()

    return preds, scores

In [16]:
# Score the whole tokenized test set.
# NOTE(review): this path hard-codes "iteration_1" while the vocabularies and the model are loaded
# from `model_iteration` ('iteration_2') — confirm the tokenized test data is meant to be shared
# between iterations. The f-prefix is redundant: the string has no placeholders.
test_data = get_all_model_predictions(f"./data/iteration_1/tokenized_data/test/")

0


2022-01-19 13:23:34.482623: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2022-01-19 13:23:34.850603: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125


In [17]:
# Persist the full frame with its predictions/scores columns locally, named after the model run.
test_data.to_parquet(f"./data/{model_iteration}/test_data/predictions_{model_str}.parquet")

In [18]:
# test_data = pd.read_parquet(f"./{model_iteration}/test_data/data_with_predictions_500.parquet")
# Count the target tokens that are not -1 in each row (presumably -1 marks padding — confirm),
# then keep only the rows that have at least one such target.
test_data['target_test'] = test_data['target_tok'].apply(
    lambda tokens: len([tok for tok in tokens if tok != -1]))
test_data = test_data[test_data['target_test'].gt(0)].copy()

In [19]:
test_data.shape

(62517, 10)

In [20]:
test_data.sample(5)

Unnamed: 0,paper_id,publication_date,doc_type_tok,journal_tok,target_tok,paper_title_tok,abstract_tok,predictions,scores,target_test
8419,1601404031,1986-07-01,[3],[11995],"[44926, 22106, 41970, 62746, 2666, 36210]","[47684, 272517, 200157, 1233, 81041, 121331]",[2],"[22106, 41970, 44926, 15337, 2666, 62746, 1735...","[0.8915539383888245, 0.5942696332931519, 0.467...",6
3402,159197916,2012-01-01,[6],[2],"[45901, 40695, 38076, 30832, 39138, 28738]","[186562, 19195, 200157, 11256, 69906, 210029, ...","[12019, 125960, 203521, 7236, 203521, 125960, ...","[40695, 30832, 38076, 45901, 28738, 59215, 481...","[0.8051384687423706, 0.7760791778564453, 0.723...",6
5119,948594775,2015-06-11,[3],[24],"[46869, 21422, 7648, 64359]","[158351, 200157, 164039, 279559, 159129, 21081...",[2],"[21422, 46869, 7648, 18748, 29694, 64359, 4291...","[0.786722719669342, 0.6572144627571106, 0.5807...",4
3024,2522048908,1985-12-01,[3],[10207],"[20049, 10251, 677]","[68895, 200157, 158414, 267546, 12770, 85821, ...",[2],"[20049, 36517, 677, 27095, 7648, 61841, 346, 3...","[0.5330604314804077, 0.4599747955799103, 0.424...",3
2591,2060833528,1997-07-01,[3],[425],"[39485, 58562, 18786, 50140, 36943, 7648, 1450...","[153485, 228017, 228061, 209262, 239113, 34686...","[278083, 265955, 119713, 154201, 125235, 20015...","[35497, 7648, 36943, 47195, 39485, 39178, 4359...","[0.9060872197151184, 0.8617819547653198, 0.772...",11


In [21]:
# Upload the predictions to S3 under the same filename as the local copy.
# At this point test_data has been filtered to rows with target_test > 0 and carries the extra
# `target_test` column, so this file's content differs from the local parquet written earlier.
test_data.to_parquet(f"s3://mag-model-data/V2/{model_iteration}/test_data/predictions_{model_str}.parquet")