In [65]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


In [66]:
import numpy as np
import json   #importing this module we can work with JSON data
import nltk   #NLP toolkit
from nltk.corpus import stopwords
nltk.download('stopwords')
import re     # library for regular expression operations
import string # for string operations
import collections
import gensim  
from gensim import parsing        # Help in preprocessing the data, very efficiently

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
path_to_data = '../input/arxiv/arxiv-metadata-oai-snapshot.json'

In [68]:
def get_metadata():
    with open(path_to_data) as f:
        for line in f:
            yield line #Yield is used like Return, except the function will return a generator

In [69]:
metadata = get_metadata()
for paper in metadata:
    first_paper = json.loads(paper) #json.loads() return a dictionary
    break

In [70]:
for key in first_paper:
    print(key)

id
submitter
authors
title
comments
journal-ref
doi
report-no
categories
license
abstract
versions
update_date
authors_parsed


We're interested only in the keys Categories, Authors, Title and Abstract of each paper, so let's save this information in a Dataframe:

In [71]:
#set of empty list that will be filled with the information of each paper
categories=[]
authors=[]
title=[]
abstract=[]


In [72]:
total_items=0

for papers in metadata:
    paper=json.loads(papers)
    categories.append(paper['categories'])
    authors.append(paper['authors'])
    title.append(paper['title'])
    abstract.append(paper['abstract'])
    
    total_items+=1

In [73]:
#In this cell we create a dictionary with the information stored before
d = {'Categories': categories,
    'Authors': authors,
    'Title': title,
    'Abstract': abstract}

In [74]:
df = pd.DataFrame(d)

In [75]:
df.head(10)

Unnamed: 0,Categories,Authors,Title,Abstract
0,math.CO cs.CG,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-..."
1,physics.gen-ph,Hongjun Pan,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...
2,math.CO,David Callan,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...
3,math.CA math.FA,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...
4,cond-mat.mes-hall,Y. H. Pong and C. K. Law,Bosonic characters of atomic Cooper pairs acro...,We study the two-particle wave function of p...
5,gr-qc,"Alejandro Corichi, Tatjana Vukasinac and Jose ...",Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation...
6,cond-mat.mtrl-sci,Damian C. Swift,Numerical solution of shock and ramp compressi...,A general formulation was developed to repre...
7,astro-ph,"Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...","The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...
8,math.CO,Sergei Ovchinnikov,"Partial cubes: structures, characterizations, ...",Partial cubes are isometric subgraphs of hyp...
9,math.NT math.AG,Clifton Cunningham and Lassina Dembele,Computing genus 2 Hilbert-Siegel modular forms...,In this paper we present an algorithm for co...


In [76]:
df.tail()

Unnamed: 0,Categories,Authors,Title,Abstract
1796905,supr-con cond-mat.supr-con,"R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y...",On the origin of the irreversibility line in t...,We report on measurements of the angular dep...
1796906,supr-con cond-mat.supr-con,"Durga P. Choudhury, Balam A. Willemsen, John S...",Nonlinear Response of HTSC Thin Film Microwave...,The non-linear microwave surface impedance o...
1796907,supr-con cond-mat.supr-con,"Balam A. Willemsen, J. S. Derov and S.Sridhar ...",Critical State Flux Penetration and Linear Mic...,The vortex contribution to the dc field (H) ...
1796908,supr-con cond-mat.supr-con,Yasumasa Hasegawa (Himeji Institute of Technol...,Density of States and NMR Relaxation Rate in A...,We show that the density of states in an ani...
1796909,supr-con cond-mat.supr-con,"Naoki Enomoto, Masanori Ichioka and Kazushige ...",Ginzburg Landau theory for d-wave pairing and ...,The Ginzburg Landau theory for d_{x^2-y^2}-w...


In [77]:
df.shape

(1796910, 4)

In [78]:
df.Categories.value_counts()

astro-ph                                                      86914
hep-ph                                                        73549
quant-ph                                                      53966
hep-th                                                        53287
cond-mat.mtrl-sci                                             30107
                                                              ...  
eess.IV cs.LG cs.NA math.NA physics.comp-ph                       1
cond-mat nlin.CD physics.atom-ph                                  1
astro-ph.HE astro-ph.GA cond-mat.quant-gas physics.flu-dyn        1
cond-mat.mtrl-sci math.GR physics.data-an                         1
q-bio.PE math-ph math.AP math.DS math.MP nlin.CD                  1
Name: Categories, Length: 62055, dtype: int64

In [79]:
traindf = df[(df['Categories']=='astro-ph') | (df['Categories']=='hep-ph')]
traindf.head()

Unnamed: 0,Categories,Authors,Title,Abstract
7,astro-ph,"Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...","The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...
14,hep-ph,"Chao-Hsi Chang, Tong Li, Xue-Qian Li and Yu-Mi...",Lifetime of doubly charmed baryons,"In this work, we evaluate the lifetimes of t..."
15,astro-ph,"Nceba Mhlahlo, David H. Buckley, Vikram S. Dhi...",Spectroscopic Observations of the Intermediate...,Results from spectroscopic observations of t...
21,astro-ph,"M. A. Loukitcheva, S. K. Solanki and S. White",ALMA as the ideal probe of the solar chromosphere,"The very nature of the solar chromosphere, i..."
27,hep-ph,"Zhan Shu, Xiao-Lin Chen and Wei-Zhen Deng",Understanding the Flavor Symmetry Breaking and...,"In $\XQM$, a quark can emit Goldstone bosons..."


Now that only two categories has been selected, we have to convert categories names in a digits in order to be processed by a classification algorithm.

In [80]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()#LabelEncoder object knows how to encode word labels
traindf['Categories']=le.fit_transform(traindf['Categories']) #encode labels in column categories
traindf['Categories'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([0, 1])

In [81]:
traindf.head()

Unnamed: 0,Categories,Authors,Title,Abstract
7,0,"Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...","The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...
14,1,"Chao-Hsi Chang, Tong Li, Xue-Qian Li and Yu-Mi...",Lifetime of doubly charmed baryons,"In this work, we evaluate the lifetimes of t..."
15,0,"Nceba Mhlahlo, David H. Buckley, Vikram S. Dhi...",Spectroscopic Observations of the Intermediate...,Results from spectroscopic observations of t...
21,0,"M. A. Loukitcheva, S. K. Solanki and S. White",ALMA as the ideal probe of the solar chromosphere,"The very nature of the solar chromosphere, i..."
27,1,"Zhan Shu, Xiao-Lin Chen and Wei-Zhen Deng",Understanding the Flavor Symmetry Breaking and...,"In $\XQM$, a quark can emit Goldstone bosons..."


In [82]:
def transformText(text):
  # All the necessary preprocessing on our text of choice
    stops = set(stopwords.words("english"))
  # Convert text to lower
    text = text.lower()
  # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text) 
    text= gensim.parsing.preprocessing.strip_non_alphanum(text)                       
  # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
  # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
  # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
  # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
  # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
  # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
  # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
  # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

In [83]:
traindf['Title'] = traindf['Title'].map(transformText) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [84]:
traindf.head(10)

Unnamed: 0,Categories,Authors,Title,Abstract
7,0,"Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...",spitzer cd survei larg nearbi insterstellar cl...,We discuss the results from the combined IRA...
14,1,"Chao-Hsi Chang, Tong Li, Xue-Qian Li and Yu-Mi...",lifetim doubli charm baryon,"In this work, we evaluate the lifetimes of t..."
15,0,"Nceba Mhlahlo, David H. Buckley, Vikram S. Dhi...",spectroscop observ intermedi polar hydra quies...,Results from spectroscopic observations of t...
21,0,"M. A. Loukitcheva, S. K. Solanki and S. White",alma ideal probe solar chromospher,"The very nature of the solar chromosphere, i..."
27,1,"Zhan Shu, Xiao-Lin Chen and Wei-Zhen Deng",understand flavor symmetri break nucleon flavo...,"In $\XQM$, a quark can emit Goldstone bosons..."
29,1,"V. M. Biryukov (Serpukhov, IHEP)",crystal channel lhc forward proton preserv dis...,"We show that crystal can trap a broad (x, x'..."
30,1,"A. Esteban-Pretel, R. Tom\`as and J. W. F. Valle",probe non standard neutrino interact supernova...,We analyze the possibility of probing non-st...
57,0,"T. G. Beatty, J. M. Fernandez, D. W. Latham, G...",mass radiu unseen dwarf companion singl line e...,We derive masses and radii for both componen...
61,1,Somnath Choudhury,experiment effort search ge neutrinoless doubl...,Neutrinoless double beta decay is one of the...
78,0,Dean E. McLaughlin and S. Michael Fall,shape globular cluster mass function stellar d...,We show that the globular cluster mass funct...


Once we have properly preprocess our data, we have to split the dataset in training and test set.

In [85]:
from sklearn.model_selection import train_test_split
X=traindf['Title']
y=traindf['Categories']
X_train, X_test,y_train,y_test= train_test_split(X,y, test_size=0.33, random_state=1)

In [86]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(107510,)
(52953,)
(107510,)
(52953,)


**Feature Extraction with CountVectorizer and TfidfTransformer( Term Frequency- Inverse Document Frequency)**

In [87]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

*CountVectorizer is able to create a dictionary of word inside all the documents we provide to it and than to represent each of this documents (the titles) in a matrix form. Every row will be a title and every column a word.*

In [88]:
vectorizer=CountVectorizer()
X_train_counts=vectorizer.fit_transform(X_train)
feature_names=vectorizer.get_feature_names()
feature_names[:10]


['aa', 'aac', 'aafrag', 'aal', 'aao', 'aaomega', 'aarhu', 'aat', 'aavso', 'ab']

In [89]:
len(feature_names)

13114

*After the data manipulation above, we have to use the TfidfTransformer in order to create a proper count of the frequency of each word inside our dataset. Tf-idf is the acronym for Term Frequncy-Inverse Document Frequency. With this approach we evaluate the relative importance of a particular word. Tf-idf is the product of two statistics, term frequency and inverse document frequency. Various ways for determining the exact values of both statistics exist. In the case of the Term Frequency is the "raw frequency" of a term in a document, i.e. the number of times a term occurs in document (a title). The "inverse document frequency" is a measure of whether the term is common or rare across all documents. It is obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient. The Tf-idf is the product of this two quantity.*

In [90]:
tfidf_transformer=TfidfTransformer()
X_train_tfidf=tfidf_transformer.fit_transform(X_train_counts)

**Classification with Logistic Regression**

In [91]:
from sklearn.linear_model import LogisticRegression

In [92]:
model=LogisticRegression()
model.fit(X_train_tfidf,y_train)

LogisticRegression()

*Prediction on Test Data*

In [93]:
X_test_counts=vectorizer.transform(X_test)
feature_names=vectorizer.get_feature_names()
feature_names[:10]

['aa', 'aac', 'aafrag', 'aal', 'aao', 'aaomega', 'aarhu', 'aat', 'aavso', 'ab']

In [94]:
X_test_tfidf=tfidf_transformer.transform(X_test_counts)

In [95]:
prediction=model.predict(X_test_tfidf)

**Model Accuracy**

In [96]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(prediction,y_test)
accuracy

0.9769984703416237

**Classification using RandomForestClassifier**

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
mod=RandomForestClassifier()
mod.fit(X_train_tfidf,y_train)

RandomForestClassifier()

In [99]:
pred=mod.predict(X_test_tfidf)
accuracy=accuracy_score(pred,y_test)
print(accuracy)

0.9720129171151776
