In [18]:
import numpy as np
import pandas as pd
import json
import dask.bag as db

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_curve, auc, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.calibration import CalibratedClassifierCV

# NLTK English stop-word list, stored as a set
stop_words = set(stopwords.words('english'))

from sklearn.base import BaseEstimator, TransformerMixin
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import sklearn.model_selection as ms

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding,  Dropout,  SpatialDropout1D, LSTM
from tensorflow.keras import backend as K

import sklearn.metrics as mt
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default plot styling
sns.set()

In [9]:
# Location of the arXiv metadata snapshot (one JSON record per line).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_file = '/Users/leventguner/Downloads/LeventMac/paper_data/arxiv-metadata-oai-snapshot.json'

def get_metadata(path=None):
    """Yield the raw lines of the metadata snapshot one at a time.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the module-level ``data_file``.

    Yields
    ------
    str
        Each line of the file, unparsed and including its trailing newline.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(data_file if path is None else path, 'r', encoding='utf-8') as f:
        yield from f

In [10]:
#define text cleaner

class CleanText(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalizes raw text.

    ``transform`` applies, in order: mention removal, URL removal, underscore
    compression, punctuation removal, digit removal, lower-casing and
    stop-word removal. ``stemming`` is provided but is not part of that
    pipeline.
    """

    # Maps every ASCII punctuation symbol to a space; built once, not per call.
    _PUNCT_TABLE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    # Words kept even when they appear in the stop-word list.
    # NOTE: "n't" cannot occur inside ``transform`` because punctuation is
    # stripped before stop words are removed; it only matters when
    # ``remove_stopwords`` is called directly.
    _WHITELIST = frozenset(["n't", "not", "no"])

    # Lazily filled cache of the NLTK English stop words.
    _stopword_cache = None

    @classmethod
    def _english_stopwords(cls):
        """Return the NLTK English stop words as a frozenset, loading them once."""
        if cls._stopword_cache is None:
            cls._stopword_cache = frozenset(stopwords.words('english'))
        return cls._stopword_cache

    def remove_mentions(self, input_text):
        """Delete '@' followed by one or more word characters."""
        return re.sub(r'@\w+', '', input_text)

    def remove_urls(self, input_text):
        """Delete http:// and https:// URLs plus one trailing whitespace character."""
        # 'https?' instead of 'http.?': the dot matched any character, not just 's'.
        return re.sub(r'https?://\S+\s?', '', input_text)

    def emoji_oneword(self, input_text):
        """Drop underscores so underscore-joined tokens stay a single word."""
        return input_text.replace('_', '')

    def remove_punctuation(self, input_text):
        """Replace every ASCII punctuation symbol with a space."""
        return input_text.translate(self._PUNCT_TABLE if self is not None else CleanText._PUNCT_TABLE)

    def remove_digits(self, input_text):
        """Delete every run of digits."""
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        return re.sub(r'\d+', '', input_text)

    def to_lower(self, input_text):
        """Lower-case the text."""
        return input_text.lower()

    def remove_stopwords(self, input_text):
        """Drop stop words (except whitelisted ones) and one-character tokens."""
        stopword_set = CleanText._english_stopwords()
        whitelist = CleanText._WHITELIST
        clean_words = [
            word for word in input_text.split()
            if (word not in stopword_set or word in whitelist) and len(word) > 1
        ]
        return " ".join(clean_words)

    def stemming(self, input_text):
        """Porter-stem every whitespace-separated token."""
        porter = PorterStemmer()
        return " ".join(porter.stem(word) for word in input_text.split())

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Apply the cleaning steps element-wise to ``X``.

        ``X`` must provide ``.apply`` (the notebook passes a pandas Series of
        strings); the cleaned object returned by the final ``.apply`` is returned.
        """
        steps = (
            self.remove_mentions,
            self.remove_urls,
            self.emoji_oneword,
            self.remove_punctuation,
            self.remove_digits,
            self.to_lower,
            self.remove_stopwords,
        )
        clean_X = X
        for step in steps:
            clean_X = clean_X.apply(step)
        return clean_X

In [11]:
# Build a dask bag over the snapshot file and parse each line as JSON
docs = (
    db.read_text(data_file)
      .map(json.loads)
)



In [12]:
# Peek at the first parsed record to see which fields are available
docs.take(1)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [13]:
# Keep records whose last listed version is recent, then load them into pandas

def get_latest_version(x):
    """Return the 'created' value of the last entry in the record's 'versions' list."""
    return x['versions'][-1]['created']


def trim(x):
    """Reduce a raw record to the fields used below; 'categories' is split on spaces."""
    return {
        'id': x['id'],
        'title': x['title'],
        'category': x['categories'].split(' '),
        'abstract': x['abstract'],
    }


# NOTE(review): not referenced again in the visible cells
columns = ['id','category','abstract']

# Keep records whose last version was created in 2019 or later.
# The year is taken as the 4th space-separated token of the 'created' string
# (presumably something like 'Mon, 2 Apr 2007 19:18:42 GMT' -- TODO confirm).
docs_df = pd.DataFrame(
    docs
    .filter(lambda rec: int(get_latest_version(rec).split(' ')[3]) >= 2019)
    .map(trim)
    .compute()
)

# General category = the part of each category tag before the first dot;
# this is the target variable for the classifier.
docs_df['general_category'] = docs_df['category'].apply(
    lambda cats: [cat.split('.')[0] for cat in cats]
)

In [14]:
def _archive_and_sub(cat):
    """Split a category tag into ``[archive, sub]``.

    'astro-ph.CO' -> ['astro-ph', 'CO']; a tag without a dot, e.g. 'gr-qc',
    becomes ['gr-qc', 'gr-qc_nsc'] ('_nsc' presumably marks "no sub-category").
    """
    parts = cat.split('.')
    if len(parts) > 1:
        return [parts[0], parts[1]]
    return [parts[0], parts[0] + '_nsc']


# Sub-category only (second element of each pair)
docs_df['sub_category'] = docs_df['category'].apply(
    lambda cats: [_archive_and_sub(cat)[1] for cat in cats]
)
# Full [archive, sub-category] pairs
docs_df['new_category'] = docs_df['category'].apply(
    lambda cats: [_archive_and_sub(cat) for cat in cats]
)

In [15]:
# Inspect a single row by position to check the derived category columns
docs_df.iloc[128310]

id                                                         1907.10813
title               Model-independent reconstruction of $f(T)$ gra...
category                         [astro-ph.CO, gr-qc, hep-ph, hep-th]
abstract              We apply Gaussian processes and Hubble funct...
general_category                    [astro-ph, gr-qc, hep-ph, hep-th]
sub_category                  [CO, gr-qc_nsc, hep-ph_nsc, hep-th_nsc]
new_category        [[astro-ph, CO], [gr-qc, gr-qc_nsc], [hep-ph, ...
Name: 128310, dtype: object

In [19]:
# Display the DataFrame with the derived category columns
docs_df

Unnamed: 0,id,title,category,abstract,general_category,sub_category,new_category
0,0704.1445,Deformed Wigner crystal in a one-dimensional q...,"[cond-mat.str-el, cond-mat.mes-hall]",The spatial Fourier spectrum of the electron...,"[cond-mat, cond-mat]","[str-el, mes-hall]","[[cond-mat, str-el], [cond-mat, mes-hall]]"
1,0705.0033,Ergodic Theory: Recurrence,[math.DS],We survey the impact of the Poincar\'e recur...,[math],[DS],"[[math, DS]]"
2,0705.0344,Unifying derived deformation theories,[math.AG],We develop a framework for derived deformati...,[math],[AG],"[[math, AG]]"
3,0705.0825,Einstein's Theory of Gravity in the Presence o...,"[gr-qc, astro-ph, hep-th]",The mysterious `dark energy' needed to expla...,"[gr-qc, astro-ph, hep-th]","[gr-qc_nsc, astro-ph_nsc, hep-th_nsc]","[[gr-qc, gr-qc_nsc], [astro-ph, astro-ph_nsc],..."
4,0705.2562,Anthropic prediction in a large toy landscape,[hep-th],The successful anthropic prediction of the c...,[hep-th],[hep-th_nsc],"[[hep-th, hep-th_nsc]]"
...,...,...,...,...,...,...,...
324653,quant-ph/0612050,The exact cost of redistributing multipartite ...,[quant-ph],How correlated are two quantum systems from ...,[quant-ph],[quant-ph_nsc],"[[quant-ph, quant-ph_nsc]]"
324654,quant-ph/0701163,Does Observation Create Reality?,[quant-ph],It has been suggested that the locality of i...,[quant-ph],[quant-ph_nsc],"[[quant-ph, quant-ph_nsc]]"
324655,quant-ph/0702160,Discrete-query quantum algorithm for NAND trees,[quant-ph],"Recently, Farhi, Goldstone, and Gutmann gave...",[quant-ph],[quant-ph_nsc],"[[quant-ph, quant-ph_nsc]]"
324656,quant-ph/9606017,Quantum Mechanics in Terms of Realism,[quant-ph],.We expound an alternative to the Copenhagen...,[quant-ph],[quant-ph_nsc],"[[quant-ph, quant-ph_nsc]]"


In [23]:
# Work on a random subset of at most 10,000 rows. The fixed seed makes the
# subset (and everything trained on it) reproducible across kernel restarts;
# the min() guard avoids a ValueError when fewer than 10,000 rows are available.
docs_df2 = docs_df.sample(min(10000, len(docs_df)), random_state=42)

In [24]:
# Turn the per-paper label lists into multi-hot indicator matrices:
# one binarizer for general categories, one for sub-categories.
mlb, mlb_sub = MultiLabelBinarizer(), MultiLabelBinarizer()

labels = mlb.fit_transform(docs_df2['general_category'])
labels_sub = mlb_sub.fit_transform(docs_df2['sub_category'])

# Show the sub-category indicator matrix
labels_sub

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
# Targets: the general-category indicator matrix
y = labels

# Hold out 33% of the sampled abstracts for evaluation
X_train, X_test, y_train, y_test = ms.train_test_split(
    docs_df2['abstract'],
    y,
    test_size=0.33,
    random_state=42,
)
print('splitted')

# Clean the raw abstracts with the same transformer for both splits
ct = CleanText()
X_train, X_test = ct.fit_transform(X_train), ct.transform(X_test)

print('cleaned')

# Tokenization: the vocabulary is fitted on the training texts only
max_features = 10000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train)

# Convert each split separately to integer sequences padded/truncated to 217 tokens
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=217)
    for texts in (X_train, X_test)
)

print('tokenized')



splitted
cleaned
tokenized


In [26]:
# Build and train the LSTM multi-label classifier
embed_dim = 32  # width of each token embedding vector
lstm_out = 10   # number of LSTM units

model = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    # One sigmoid unit per label column: each label is predicted independently
    Dense(y_train.shape[1], activation='sigmoid'),
])

# A paper can carry several labels at once, so every output is its own
# yes/no problem. binary_crossentropy is the matching loss;
# categorical_crossentropy assumes the outputs form a single distribution
# over mutually exclusive classes.
# NOTE(review): which accuracy variant the 'accuracy' alias resolves to depends
# on the Keras version -- confirm it is the intended one for multi-label output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# fit model
batch_size = 32
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=batch_size,
    verbose=1,
)





Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 217, 32)           320000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 217, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 10)                1720      
_________________________________________________________________
dense (Dense)                (None, 20)                220       
Total params: 321,940
Trainable params: 321,940
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
Epoch 2/2
