In [1]:
# %pip install tensorflow==2.4.1
# %pip install transformers
# %pip install pyarrow
# %pip install tensorflow-addons

In [4]:
import os
import pickle
from math import ceil

import pandas as pd
import tensorflow as tf  # needed below: tf.keras, tf.math.top_k, tf.ragged, tf.convert_to_tensor
# import tensorflow_addons as tfa
# from tensorflow.keras.utils import plot_model
# from transformers import RobertaTokenizer, RobertaTokenizerFast, TFRobertaModel, TFAlbertModel

# AUTO = tf.data.experimental.AUTOTUNE

In [2]:
# Iteration name interpolated into the ./data/{model_iteration}/... paths used below.
model_iteration = "iteration_1"

In [3]:
def _load_vocab(vocab_name):
    """Load one pickled vocabulary and build its inverse mapping.

    Args:
        vocab_name: file prefix, e.g. "topics" for
            ./data/{model_iteration}/vocab/topics_vocab.pkl.

    Returns:
        (vocab, vocab_inv): the unpickled dict and a dict with its keys and
        values swapped.
    """
    # NOTE: pickle.load can execute arbitrary code; only load vocab files from a trusted source.
    with open(f"./data/{model_iteration}/vocab/{vocab_name}_vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    return vocab, {value: key for key, value in vocab.items()}


# One helper call per vocabulary instead of four copy-pasted load/invert stanzas.
target_vocab, target_vocab_inv = _load_vocab("topics")
doc_vocab, doc_vocab_inv = _load_vocab("doc_type")
journal_vocab, journal_vocab_inv = _load_vocab("journal_name")
title_vocab, title_vocab_inv = _load_vocab("paper_title")

In [4]:
# Size of the topics vocabulary loaded from topics_vocab.pkl.
len(target_vocab)

65026

In [5]:
# Size of the paper-title vocabulary loaded from paper_title_vocab.pkl.
len(title_vocab)

282998

##### Short code to create ID mapping

In [6]:
# Load the fields-of-study table; the following cells read its
# 'normalized_name' and 'field_of_study_id' columns.
tag_ids = pd.read_parquet("fields_of_study_ids.parquet")

In [7]:
# Extract the two columns as parallel lists (same row order) so they can be zipped into a lookup.
names = tag_ids['normalized_name'].to_list()
ids = tag_ids['field_of_study_id'].to_list()

In [8]:
# Lookup from normalized field-of-study name to its id (for a repeated name the last row wins).
name_to_id = dict(zip(names, ids))

In [9]:
# For every (key, name) pair in target_vocab_inv, map the key to that name's field_of_study_id.
# Raises KeyError if a vocabulary name has no entry in name_to_id.
id_dict = {i:name_to_id[j] for i,j in target_vocab_inv.items()}

In [10]:
# Sanity check: id_dict has one entry per key of target_vocab_inv.
len(id_dict)

65026

In [11]:
# with open(f"./data/{model_iteration}/vocab/tag_id_vocab.pkl", "wb") as f:
#     pickle.dump(id_dict, f)

In [6]:
# Reload the tag-id mapping from the same path the commented-out dump cell writes to.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(f"./data/{model_iteration}/vocab/tag_id_vocab.pkl", "rb") as f:
    test_dict = pickle.load(f)

#### Getting the model

In [7]:
# CategoryEncoding layer in "binary" mode with dense output, sized to len(target_vocab) + 1 tokens.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
encoding_layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
    max_tokens=len(target_vocab)+1, output_mode="binary", sparse=False)


In [8]:
# Identifier of the trained run: selects the saved-model directory here and is reused
# in the predictions filename written later.
model_str = "20220115_lr0006_beta025_gamma28_nH6_nL4_firstD2048_secondD1024_thirdD1024"
mag_model = tf.keras.models.load_model(f'./data/{model_iteration}/models/{model_str}/model/')

2022-01-16 14:58:35.749229: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-01-16 14:58:35.752527: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-16 14:58:35.752537: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-16 14:58:35.752552: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-31-31-169.ec2.internal): /proc/driver/nvidia/version does not exist
2022-01-16 14:58:35.752730: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebui

In [9]:
# Inspect the input tensors (names, shapes, dtypes) the loaded model expects.
mag_model.inputs

[<KerasTensor: shape=(None, 64) dtype=int64 (created by layer 'paper_title_ids')>,
 <KerasTensor: shape=(None, 512) dtype=int64 (created by layer 'abstract_ids')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'doc_type_id')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'journal_id')>]

In [10]:
# Wrap the loaded model so it returns the top 30 values and their indices (tf.math.top_k).
# NOTE(review): mag_model.outputs is passed as-is and get_model_predictions indexes [0] on the
# result — presumably to drop an extra leading axis; confirm.
final_model = tf.keras.Model(inputs=mag_model.inputs, 
                             outputs=tf.math.top_k(mag_model.outputs, k=30))

In [11]:
# Print the layer-by-layer summary of the wrapped model.
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
paper_title_ids (InputLayer)    [(None, 64)]         0                                            
__________________________________________________________________________________________________
abstract_ids (InputLayer)       [(None, 512)]        0                                            
__________________________________________________________________________________________________
title_embedding (Embedding)     multiple             18111936    paper_title_ids[0][0]            
                                                                 abstract_ids[0][0]               
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 64, 64)       0           title_embedding[0][0]        

In [12]:
def get_all_model_predictions(data_path, num_samples=200, verbose=True):
    """Load every 'part*' parquet file under `data_path` and attach model predictions.

    The files are read in sorted name order, concatenated into one DataFrame,
    and passed to `get_model_predictions` in consecutive blocks of
    `num_samples` rows.

    Args:
        data_path: directory containing the 'part*' parquet files (with or
            without a trailing slash).
        num_samples: number of rows sent to the model per block (default 200).
        verbose: when True, print the block index before each block is scored.

    Returns:
        The concatenated DataFrame with two extra columns, 'predictions' and
        'scores', holding one list per row. If no 'part*' file is found, an
        empty DataFrame with just those two columns is returned.
    """
    file_names = sorted(x for x in os.listdir(data_path) if x.startswith('part'))

    # Read all parts first and concatenate once: calling pd.concat inside the loop
    # re-copies the accumulated frame on every iteration.
    frames = [pd.read_parquet(os.path.join(data_path, file_name)) for file_name in file_names]
    full_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    preds_final = []
    scores_final = []
    for i in range(ceil(full_df.shape[0] / num_samples)):
        if verbose:
            print(i)
        # Pass a copy so the per-block processing never touches full_df.
        small_df = full_df.iloc[i*num_samples:(i+1)*num_samples, :].copy()
        preds, scores = get_model_predictions(small_df)
        preds_final.extend(preds)
        scores_final.extend(scores)

    full_df['predictions'] = preds_final
    full_df['scores'] = scores_final

    return full_df


In [13]:
def cut_length(data, seq_len=512):
    """Return the slice `data[:seq_len]`: at most the first `seq_len` items for seq_len >= 0."""
    truncated = data[0:seq_len]
    return truncated

In [14]:
def get_model_predictions(input_data):
    """Run `final_model` on one block of tokenized rows and return its top-20 outputs.

    Args:
        input_data: DataFrame with 'paper_title_tok', 'abstract_tok',
            'doc_type_tok' and 'journal_tok' columns. It is not modified.

    Returns:
        (preds, scores): two lists with one entry per row. Each entry holds the
        first 20 indices / values of the model's top_k output for that row.
    """
    # Truncate into local lists instead of overwriting the caller's columns.
    title_tokens = [cut_length(tokens, 64) for tokens in input_data['paper_title_tok']]
    # NOTE(review): abstracts are truncated to 256 tokens but padded to 512 below —
    # confirm this matches how the model was trained.
    abstract_tokens = [cut_length(tokens, 256) for tokens in input_data['abstract_tok']]

    # Pad the ragged token lists to the fixed widths used for the model inputs.
    paper_title = tf.ragged.constant(title_tokens).to_tensor(shape=[None, 64])
    abstract = tf.ragged.constant(abstract_tokens).to_tensor(shape=[None, 512])

    doc_types = tf.convert_to_tensor(input_data['doc_type_tok'].to_list())
    journal = tf.convert_to_tensor(input_data['journal_tok'].to_list())

    model_output = final_model([paper_title, abstract, doc_types, journal])

    # [0] selects the first entry along the leading axis of the top_k result;
    # [:, :20] keeps the first 20 columns of each row.
    scores = model_output.values.numpy()[0][:, :20].tolist()
    preds = model_output.indices.numpy()[0][:, :20].tolist()

    return preds, scores

In [15]:
# Run block-wise inference over every 'part*' file in the tokenized test directory.
test_data = get_all_model_predictions(f"./data/{model_iteration}/tokenized_data/test/")

0


2022-01-16 14:58:55.657835: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1890713600 exceeds 10% of free system memory.
2022-01-16 14:58:56.057342: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1890713600 exceeds 10% of free system memory.
2022-01-16 14:58:56.358821: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1890713600 exceeds 10% of free system memory.
2022-01-16 14:58:56.689733: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1890713600 exceeds 10% of free system memory.
2022-01-16 14:58:57.080993: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1890713600 exceeds 10% of free system memory.


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [16]:
# Persist the test rows together with their 'predictions' and 'scores' columns for this model run.
test_data.to_parquet(f"./data/{model_iteration}/test_data/predictions_{model_str}.parquet")

In [17]:
# test_data = pd.read_parquet(f"./{model_iteration}/test_data/data_with_predictions_500.parquet")
test_data['target_test'] = test_data['target_tok'].apply(lambda x: [i for i in x if i!=-1])
test_data['target_test'] = test_data['target_test'].apply(len)
test_data = test_data[test_data['target_test'] > 0].copy()

In [18]:
# (rows, columns) of test_data after the filtering in the previous cell.
test_data.shape

(62517, 10)

In [19]:
# Show 5 randomly chosen rows; no random_state is set, so the rows differ between runs.
test_data.sample(5)

Unnamed: 0,paper_id,publication_date,doc_type_tok,journal_tok,target_tok,paper_title_tok,abstract_tok,predictions,scores,target_test
10092,3137613306,2021-03-30,[3],[8508],"[27095, 38898, 53383, 20052, 24275, 345, 21032...","[164026, 135830, 126699, 69677, 147160, 90684,...","[154201, 164026, 13523, 200157, 227992, 69677,...","[7648, 21032, 39153, 20052, 27095, 345, 24275,...","[0.9344683289527893, 0.8845618963241577, 0.587...",11
11968,2328757754,1991-04-01,[3],[26545],[17473],"[63104, 58155, 1, 251476, 205881, 102709, 261898]",[2],"[55976, 17473, 1, 3350, 43340, 8580, 36274, 41...","[0.32550737261772156, 0.3008621335029602, 0.27...",1
8258,188905248,2009-11-21,[6],[2],"[31064, 1042, 55684, 58550, 27873, 3063, 52393...","[40425, 103381, 158348, 34791, 63104, 91400, 1...","[154201, 24097, 243294, 200157, 125250, 158348...","[677, 27873, 46396, 24292, 52393, 44252, 58550...","[0.7474294900894165, 0.6099224090576172, 0.605...",10
2472,3158607115,2021-11-01,[3],[12127],"[2356, 39839, 55310, 45561, 43802, 46271, 2571...","[81755, 103336, 200157, 164023, 34691, 96256, ...","[256997, 245755, 45366, 155280, 239113, 198661...","[34450, 46271, 55310, 49166, 39839, 18515, 455...","[0.7110199928283691, 0.5690836310386658, 0.511...",10
4811,2083378874,2008-08-01,[3],[6281],"[57865, 17473, 43705, 24919, 24259, 2677, 8579...","[51254, 265955, 154201, 35470, 187763, 198756,...","[90696, 272442, 125955, 166690, 113843, 239113...","[24919, 57865, 17473, 2677, 18496, 8579, 57229...","[0.6942564845085144, 0.5798172950744629, 0.546...",10


### Getting Extra Test Data

In [13]:
from langdetect import detect

In [17]:
def get_lang(x):
    """Return the value of langdetect's `detect(x)`, or 'UNK' if detection raises.

    Args:
        x: text to classify.

    Returns:
        The detected language as returned by `detect`, or the string 'UNK'
        when `detect` raises an exception.
    """
    try:
        return detect(x)
    except Exception:
        # Best-effort: any detection failure maps to 'UNK'. Catch Exception rather
        # than using a bare except so KeyboardInterrupt/SystemExit still propagate
        # and a long .apply() can be interrupted.
        return 'UNK'

In [20]:
# Load the raw test data from S3 and rename its 'topics' column to 'topics_full'.
extra_data = pd.read_parquet("s3://mag-model-data/V2/raw_test_data/part-00000-tid-1178175042784182311-0b55b08b-d5db-4777-9e6a-572d293f915d-832-1-c000.snappy.parquet") \
.rename(columns={"topics":"topics_full"})

In [21]:
# Column names, non-null counts and dtypes of the raw test data.
extra_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62524 entries, 0 to 62523
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   paper_id          62524 non-null  int64  
 1   doc_type          62524 non-null  object 
 2   original_title    62524 non-null  object 
 3   clean_title       62524 non-null  object 
 4   journal_name      54939 non-null  object 
 5   abstract_length   44073 non-null  float64
 6   abstract          44073 non-null  object 
 7   clean_abstract    62524 non-null  object 
 8   topics_full       62524 non-null  object 
 9   publication_date  62524 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 4.8+ MB


In [22]:
from langdetect import DetectorFactory

# langdetect is non-deterministic unless a seed is set; pin it so the 'lang'
# column is reproducible across runs.
DetectorFactory.seed = 0

# Parse the dates once, then derive the calendar fields via the .dt accessor.
extra_data['publication_date'] = pd.to_datetime(extra_data['publication_date'])
extra_data['year'] = extra_data['publication_date'].dt.year
extra_data['month'] = extra_data['publication_date'].dt.month
# Number of entries in each row's 'topics_full' value.
extra_data['full_topic_len'] = extra_data['topics_full'].apply(len)
# Language detected from the cleaned title ('UNK' when detection fails).
extra_data['lang'] = extra_data['clean_title'].apply(get_lang)

In [23]:
# Re-inspect the frame after the derived columns were added.
extra_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62524 entries, 0 to 62523
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   paper_id          62524 non-null  int64         
 1   doc_type          62524 non-null  object        
 2   original_title    62524 non-null  object        
 3   clean_title       62524 non-null  object        
 4   journal_name      54939 non-null  object        
 5   abstract_length   44073 non-null  float64       
 6   abstract          44073 non-null  object        
 7   clean_abstract    62524 non-null  object        
 8   topics_full       62524 non-null  object        
 9   publication_date  62524 non-null  datetime64[ns]
 10  year              62524 non-null  int64         
 11  month             62524 non-null  int64         
 12  topic_len         62524 non-null  int64         
 13  lang              62524 non-null  object        
dtypes: datetime64[ns](1), 

In [24]:
# Write the enriched frame to a local parquet file.
# NOTE: the rename only has an effect if a 'topic_len' column exists (e.g. kernel state left by
# an earlier version of the feature cell); the current feature cell already creates
# 'full_topic_len' directly, in which case this rename changes nothing.
extra_data.rename(columns={'topic_len':'full_topic_len'}).to_parquet("extra_test_data.parquet")