In [1]:
from nltk.corpus import stopwords
import re
import pandas as pd
Chunksize =200000
for chunk in pd.read_csv('cleaned_data.csv', chunksize=Chunksize): #cleaned_data.csv is the preprocessed dataset.
    print(chunk.shape)
    print("="*66)
    print(chunk.head(2))
    print("="*66)
    break
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

(200000, 6)
   Unnamed: 0      id  category  category_num  \
0           0  271675  q-bio-QM           138   
1           1  412276   hep-ph-            68   

                                            abstract  \
0    Bacteria are often exposed to multiple stimu...   
1    Accurate knowledge of the thermodynamic prop...   

                                     Abstract_Parsed  
0    bacteria  often exposed  multiple stimuli  c...  
1    accurate knowledge   thermodynamic propertie...  


In [2]:
chunk.head()

Unnamed: 0.1,Unnamed: 0,id,category,category_num,abstract,Abstract_Parsed
0,0,271675,q-bio-QM,138,Bacteria are often exposed to multiple stimu...,bacteria often exposed multiple stimuli c...
1,1,412276,hep-ph-,68,Accurate knowledge of the thermodynamic prop...,accurate knowledge thermodynamic propertie...
2,2,256956,astro-ph-SR,7,The largest X9.3 solar flare in solar cycle ...,largest x93 solar flare solar cycle 24 p...
3,3,427612,math-PR,93,We say that a random integer variable $X$ is...,say random integer variable $x$ monotone...
4,4,113852,math-CO,76,We derive a formula expressing the joint dis...,derive formula expressing joint distribut...


In [3]:
#unique categories among the selected chunk
chunk.category.nunique()

155

In [4]:
#split data 
from sklearn.model_selection import train_test_split
X = chunk.Abstract_Parsed
y = chunk.category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((140000,), (60000,), (140000,), (60000,))

In [6]:
X_train.head()

21269         performed  $^{125}$te-nuclear magnetic res...
187660      traditional motion planning approaches  mult...
774          employ physics-informed neural networks (pi...
184577       influence  high-enthalpy effects  hypersoni...
37127       recent nlp studies reveal  substantial lingu...
Name: Abstract_Parsed, dtype: object

# Tfid Vectorization 

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

ngram_range = (1,2) #both unigrams and bigrams 
min_df = 1
max_df = 100
max_features = 1000
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)

# Naive Bayes

In [8]:
##naive bias
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
modelnb = nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.34896666666666665


# Logistic Regression

In [9]:
##logistic regression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
model_logreg = logreg.fit(X_train, y_train)



y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.64045


# Linear SVM 

In [10]:
#linear svm
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
model_svm= sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5885333333333334


# Prediction

In [11]:
#read test dataset
df_test = pd.read_csv('test (1).csv')

In [12]:
#make a list of abstract column of test dataset
test_list = list(df_test['abstract'])

In [13]:
category_num = []

In [14]:
"""for all abstracts; using the model with highest accuracy i.e logistic regression for the given dataset
   to predict the categories that respective abstract may fall into"""
for i in test_list:
    new_abstract=i
    cat = model_logreg.predict([repr(new_abstract)])
    category_num.append(cat)

In [15]:
#list into dataframe
prediction= pd.DataFrame(category_num)

In [17]:
prediction.head()

Unnamed: 0,0
0,cs-CV
1,cs-SE
2,hep-ph-
3,physics-ed-ph
4,cs-LG


In [18]:
prediction.columns= ["category"]

In [19]:
#labels
label_codes={'alg-geom-': 0,
  'astro-ph-': 1,
  'astro-ph-CO': 2,
  'astro-ph-EP': 3,
  'astro-ph-GA': 4,
  'astro-ph-HE': 5,
  'astro-ph-IM': 6,
  'astro-ph-SR': 7,
  'cond-mat-dis-nn': 8,
  'cond-mat-mes-hall': 9,
  'cond-mat-mtrl-sci': 10,
  'cond-mat-other': 11,
  'cond-mat-quant-gas': 12,
  'cond-mat-soft': 13,
  'cond-mat-stat-mech': 14,
  'cond-mat-str-el': 15,
  'cond-mat-supr-con': 16,
  'cs-': 17,
  'cs-AI': 18,
  'cs-AR': 19,
  'cs-CC': 20,
  'cs-CE': 21,
  'cs-CG': 22,
  'cs-CL': 23,
  'cs-CR': 24,
  'cs-CV': 25,
  'cs-CY': 26,
  'cs-DB': 27,
  'cs-DC': 28,
  'cs-DL': 29,
  'cs-DM': 30,
  'cs-DS': 31,
  'cs-ET': 32,
  'cs-FL': 33,
  'cs-GL': 34,
  'cs-GR': 35,
  'cs-GT': 36,
  'cs-HC': 37,
  'cs-IR': 38,
  'cs-IT': 39,
  'cs-LG': 40,
  'cs-LO': 41,
  'cs-MA': 42,
  'cs-MM': 43,
  'cs-MS': 44,
  'cs-NE': 45,
  'cs-NI': 46,
  'cs-OH': 47,
  'cs-OS': 48,
  'cs-PF': 49,
  'cs-PL': 50,
  'cs-RO': 51,
  'cs-SC': 52,
  'cs-SD': 53,
  'cs-SE': 54,
  'cs-SI': 55,
  'cs-SY': 56,
  'econ-EM': 57,
  'econ-GN': 58,
  'econ-TH': 59,
  'eess-AS': 60,
  'eess-IV': 61,
  'eess-SP': 62,
  'eess-SY': 63,
  'funct-an-': 64,
  'gr-qc-': 65,
  'hep-ex-': 66,
  'hep-lat-': 67,
  'hep-ph-': 68,
  'hep-th-': 69,
  'math-': 70,
  'math-AC': 71,
  'math-AG': 72,
  'math-AP': 73,
  'math-AT': 74,
  'math-CA': 75,
  'math-CO': 76,
  'math-CT': 77,
  'math-CV': 78,
  'math-DG': 79,
  'math-DS': 80,
  'math-FA': 81,
  'math-GM': 82,
  'math-GN': 83,
  'math-GR': 84,
  'math-GT': 85,
  'math-HO': 86,
  'math-KT': 87,
  'math-LO': 88,
  'math-MG': 89,
  'math-NT': 90,
  'math-OA': 91,
  'math-OC': 92,
  'math-PR': 93,
  'math-QA': 94,
  'math-RA': 95,
  'math-RT': 96,
  'math-SG': 97,
  'math-SP': 98,
  'math-ST': 99,
  'math-ph-': 100,
  'nlin-AO': 101,
  'nlin-CD': 102,
  'nlin-CG': 103,
  'nlin-PS': 104,
  'nlin-SI': 105,
  'nucl-ex-': 106,
  'nucl-th-': 107,
  'physics-acc-ph': 108,
  'physics-ao-ph': 109,
  'physics-app-ph': 110,
  'physics-atm-clus': 111,
  'physics-atom-ph': 112,
  'physics-bio-ph': 113,
  'physics-chem-ph': 114,
  'physics-class-ph': 115,
  'physics-comp-ph': 116,
  'physics-data-an': 117,
  'physics-ed-ph': 118,
  'physics-flu-dyn': 119,
  'physics-gen-ph': 120,
  'physics-geo-ph': 121,
  'physics-hist-ph': 122,
  'physics-ins-det': 123,
  'physics-med-ph': 124,
  'physics-optics': 125,
  'physics-plasm-ph': 126,
  'physics-pop-ph': 127,
  'physics-soc-ph': 128,
  'physics-space-ph': 129,
  'q-alg-': 130,
  'q-bio-BM': 131,
  'q-bio-CB': 132,
  'q-bio-GN': 133,
  'q-bio-MN': 134,
  'q-bio-NC': 135,
  'q-bio-OT': 136,
  'q-bio-PE': 137,
  'q-bio-QM': 138,
  'q-bio-SC': 139,
  'q-bio-TO': 140,
  'q-fin-CP': 141,
  'q-fin-EC': 142,
  'q-fin-GN': 143,
  'q-fin-MF': 144,
  'q-fin-PM': 145,
  'q-fin-PR': 146,
  'q-fin-RM': 147,
  'q-fin-ST': 148,
  'q-fin-TR': 149,
  'quant-ph-': 150,
  'stat-AP': 151,
  'stat-CO': 152,
  'stat-ME': 153,
  'stat-ML': 154,
  'stat-OT': 155}

In [20]:
# Category mapping

prediction['category_num'] =prediction['category']
prediction = prediction.replace({'category_num':label_codes}) #replacing categories with their respective num. 

In [21]:
prediction.head()

Unnamed: 0,category,category_num
0,cs-CV,25
1,cs-SE,54
2,hep-ph-,68
3,physics-ed-ph,118
4,cs-LG,40


# Output

In [ ]:
For the given dataset, Naive Bayes = 34.89% accuracy 
                       Logistic Regression = 64.045% accuracy
                       Linear SVM = 58.85% accuracy
                       
Hence, as logistic regression gave the highest accuracy, predicting categories, then labeling/ mapping them with their respective category numbers using logistic regression. The output is hence predicited only from logistic regression model.

In [22]:
sample = pd.read_csv("sample.csv")

In [23]:
sample.drop(['category_num'], axis='columns', inplace=True)

In [27]:
sample.head()

Unnamed: 0,id,category_num
0,430065,25
1,75226,54
2,301990,68
3,301001,118
4,280179,40


In [25]:
sample["category_num"] = prediction["category_num"]

In [28]:
sample.to_csv("solution(final).csv")