## 1. Getting all the imports

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPool1D, Embedding, Input
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
import json
import itertools
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 5000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

## 2. Loading data

Read the data that was stored in the JSON format.

In [7]:
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


def load_file():
    with open('./data/arxiv-metadata-oai-snapshot.json') as f:
        for line in f:
            yield json.loads(line)


metadata = load_file()

subset = itertools.islice(metadata, 100000)

df = pd.DataFrame(subset)
df.head(5)

./data\arxiv-metadata-oai-snapshot.json


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


Similar to what we did for the "bow_model.ipynb" notebook, we are only looking into the 6 major categories for now

In [8]:
df_cat = df[df['categories'].isin(['hep-ph', 'quant-ph', 'astro-ph', 'hep-th', 'gr-qc', 'cond-mat.mtrl-sci'])] # high energy physics/quantum physics
df_cat['categories'].value_counts()

df_cat = pd.get_dummies(df_cat, columns=['categories'])

X = df_cat['abstract']

# Dependent Variables
y = df_cat[['categories_astro-ph', 'categories_gr-qc',
            'categories_hep-ph', 'categories_hep-th', 'categories_quant-ph', 'categories_cond-mat.mtrl-sci']]

train_text, test_text, train_labels, test_labels = train_test_split(X, y, test_size=0.3, random_state=123)

## 3. Text preprocessing

1. We first initialize the tokenizer to split the words into lists of tokens. The num_words is set to 20000, which is the maximum number to keep.
2. Using 'fit_on_texts' function to update the internal vocabulary based on the list of texts. 
3. Calling the 'texts_to_sequences' function which transforms each text in texts to a sequence of integers

In [9]:
# Initializing the class
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
# Updates internal vocabulary based on a list of texts.
tokenizer.fit_on_texts(train_text)

# Transforms each text in texts to a sequence of integers.
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)

word_index = tokenizer.word_index
print("Length of word Index:", len(word_index))
print("First 5 elements in the word_index dictionary:", dict(list(word_index.items())[0: 5]) )
print("First abstract text in training set:\n", train_sequences[0])

Length of word Index: 46580
First 5 elements in the word_index dictionary: {'the': 1, 'of': 2, 'and': 3, 'a': 4, 'in': 5}
First abstract text in training set:
 [7, 885, 72, 2792, 183, 1, 1338, 1921, 4509, 602, 13, 3816, 916, 39, 408, 1, 1013, 541, 1339, 496, 4195, 1932, 950, 39, 9, 1, 2215, 123, 3722, 91, 2, 1542, 176, 974, 4, 3, 64, 135, 17, 85, 17, 2617, 1719, 7, 682, 1013, 100, 255, 1104, 1961, 404, 2, 4346, 649, 3, 43, 25, 55, 341, 179, 180, 39, 16, 1, 219, 53, 2, 1, 1463, 1104, 653, 2829, 6, 1714, 5, 30, 2670, 7, 885, 1, 2, 1, 3722, 1815, 4, 64, 3, 4, 22, 1434, 5, 916, 3, 496, 4195, 39, 18, 90, 650, 3, 2041, 1, 4594, 3, 2378, 182, 7, 36, 473, 1, 515, 46, 82, 182, 3, 1, 271, 46, 82, 182, 77, 17, 4, 118, 2, 194, 42, 916, 3, 950, 141, 565, 13, 3722]


Now that we have tokenized the comment texts, we need to pad the sentences to make all the sentences of equal length. This is because for DL model inputs, we should have a fixed length of data inputs. 


In [10]:
# Pad tokenized sequences
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print("Shape of padded sequence list:\n", trainvalid_data.shape)
print("First abstract text in training set - 0 for padding - only last 50 sequences as the rest are paddings:\n", trainvalid_data[0][-50:])

Shape of padded sequence list:
 (23494, 1000)
First abstract text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [   4   64    3    4   22 1434    5  916    3  496 4195   39   18   90
  650    3 2041    1 4594    3 2378  182    7   36  473    1  515   46
   82  182    3    1  271   46   82  182   77   17    4  118    2  194
   42  916    3  950  141  565   13 3722]


## 4. Data modeling

LSTM model is used as the classifier model. It is a special kind of RNN, capable of learning long-term dependencies. All RNNs have the form of a chain of repeating modules of neural network. LSTMs also have this chain like structure, but instead of the hidden layer we have something called LSTM cell and we have another connection that runs through all the time steps along with the hidden state. This is the called the "Cell State" vector from which information can be retrieved and removed as and when required.

In [11]:
from keras.layers import Dense, LSTM, Embedding, Input

# Using LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(MAX_NUM_WORDS, 128))
lstm_model.add(LSTM(units = 128, dropout = 0.2, recurrent_dropout = 0.2))
lstm_model.add(Dense(units = 6, activation = 'sigmoid'))
print(lstm_model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         640000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 6)                 774       
Total params: 772,358
Trainable params: 772,358
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Configures the model for training.
lstm_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["AUC", "accuracy"])

# Split the dataset into train and validation set for training and evaludating the model
X_train, X_val, y_train, y_val = train_test_split(trainvalid_data, train_labels, shuffle=True, random_state=42)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

# Trains the model for a fixed number of epochs (iterations on a dataset)
history = rnn_model.fit(X_train, y_train, batch_size=128, epochs=3, validation_data=(X_val, y_val))


(17620, 1000) (17620, 6) (5874, 1000) (5874, 6)
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
# Use the CNN model to output probabilities on test data
y_preds = rnn_model.predict(test_data)

y_vals = test_labels.to_numpy()
# Model Probabilities for class 1 of each of the target variables
# y_preds = np.transpose(np.array(cnn_model.predict_proba(X_val))[:, :, 1])

y_preds_2 = np.argmax(y_preds, axis=1)
y_test_2 = np.argmax(y_vals, axis=1)
print(f"The Classification report: \n")
print(classification_report(y_preds_2, y_test_2))

The Classification report: 

              precision    recall  f1-score   support

           0       0.98      0.95      0.96      4990
           1       0.01      0.50      0.01         8
           2       0.90      0.88      0.89      1645
           3       0.71      0.71      0.71      1111
           4       0.96      0.60      0.74      1990
           5       0.26      0.52      0.35       326

    accuracy                           0.83     10070
   macro avg       0.64      0.69      0.61     10070
weighted avg       0.91      0.83      0.86     10070



In [14]:
def calculate_roc_auc(y_test, y_pred):
    aucs = []
    # Calculate the ROC-AUC for each of the target column
    for col in range(y_test.shape[1]):
        aucs.append(roc_auc_score(y_test[:,col],y_pred[:,col]))
    return aucs

# Calculate Mean of the ROC-AUC
mean_auc = mean(calculate_roc_auc(y_vals, y_preds))

In [15]:
mean_auc

0.949598733076606