In [9]:
#All of the needed packages will be imported here

import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk 
from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Utilities
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
#Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [10]:
df = pd.read_csv('ArXiv-10/arxiv100_parsed.csv', sep=",")

In [11]:
corpus = []
for i in range(0, df['text_filtered'].size):
    corpus.append(df['text_filtered'][i])

print(corpus[:3])

['pre white dwarf eclipsing binary wasp report first bv light curve high resolution spectrum post mass transfer binary star wasp study absolute property extremely low mass white dwarf observed spectrum double lined radial velocity derived effective temperature rotational velocity brighter massive primary found rm eff pm k v sin pm km respectively combined analysis te archive data yielded accurate fundamental parameter program target mass derived accuracy radius better secondary component parameter pm odot r pm r odot rm eff pm k l pm l odot excellent agreement evolutionary sequence helium core white dwarf mass odot indicates star halfway constant luminosity phase result presented article demonstrate wasp el cvn eclipsing binary thin disk formed stable roche lobe overflow channel composed main sequence dwarf spectral type pre white dwarf', 'possible origin khz qpos low mass x ray binary possible origin khz qpos low mass x ray binary proposed recent numerical mhd simulation accretion dis

In [12]:
y = df['label']
category = df['label'].unique()

In [13]:
def tfidf_n_model(N, corpus):
    vectorizer = TfidfVectorizer(ngram_range=(N,2))
    return vectorizer.fit_transform(corpus)

In [14]:
def split_dataset(paramX):
    X_train, X_test, y_train, y_test = train_test_split(paramX, y, test_size = 0.20)

    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)

    print("\nLabel distribution in the training set:")
    print(y_train.value_counts())

    print("\nLabel distribution in the test set:")
    print(y_test.value_counts())
    return [X_train, X_test, y_train, y_test]

## Logistic Regression Hyperparameter Tuning

In [15]:
[X_train, X_test, y_train, y_test] = split_dataset(tfidf_n_model(1,corpus))

(80000, 95632) (80000,)
(20000, 95632) (20000,)

Label distribution in the training set:
hep-ph      8069
hep-th      8047
physics     8043
math        8003
cs          8002
stat        7997
eess        7988
cond-mat    7979
quant-ph    7944
astro-ph    7928
Name: label, dtype: int64

Label distribution in the test set:
astro-ph    2072
quant-ph    2056
cond-mat    2021
eess        2012
stat        2003
cs          1998
math        1997
physics     1957
hep-th      1953
hep-ph      1931
Name: label, dtype: int64


In [None]:
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
#Place here the results obtained in the previous cell and run
modelLR_final = LogisticRegression(penalty='l2', multi_class='auto', max_iter=300, C=1.0, solver='newton-cg')

## SVM Hyperparameter Tuning

In [16]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}   
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [None]:
#Place here the results obtained in the previous result and run
modelSVC_final = SVC(C=10, kernel='rbf', gamma=0.0001, probability=True)

## Random Forest Classifier Hyperparameter Tuning

In [None]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
# fitting the model for grid search
grid.fit(X_train, y_train)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

In [None]:
#Place here the results obtained in the previous cell and run
modelRFC_final = RandomForestClassifier(max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200)

## Evaluate the final models

In [None]:
modelLR_final.fit(X_train, y_train)
y_pred = modelLR_final.predict(X_test)
print(f"\n-----{modelLR_final.__class__.__name__}-----\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro')}")
print(f"F1: {f1_score(y_test, y_pred, average='macro')}")

labels = category
print("--------Classification Report--------")
print(classification_report(y_test, y_pred))
print("--------Confusion Matrix--------")

plt.figure(figsize=(8,8))
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, square=True, fmt="g", cmap="viridis", xticklabels=labels, yticklabels=labels)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("True", fontsize=14)
ax.set_title(f"Confusion Matrix {modelLR_final.__class__.__name__}", fontsize=16, pad=20)

plt.show()

In [None]:
modelSVC_final.fit(X_train, y_train)
y_pred = modelSVC_final.predict(X_test)
print(f"\n-----{modelSVC_final.__class__.__name__}-----\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro')}")
print(f"F1: {f1_score(y_test, y_pred, average='macro')}")

labels = category
print("--------Classification Report--------")
print(classification_report(y_test, y_pred))
print("--------Confusion Matrix--------")

plt.figure(figsize=(8,8))
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, square=True, fmt="g", cmap="viridis", xticklabels=labels, yticklabels=labels)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("True", fontsize=14)
ax.set_title(f"Confusion Matrix {modelSVC_final.__class__.__name__}", fontsize=16, pad=20)

plt.show()

In [None]:
modelRFC_final.fit(X_train, y_train)
y_pred = modelRFC_final.predict(X_test)
print(f"\n-----{modelRFC_final.__class__.__name__}-----\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, average='macro')}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro')}")
print(f"F1: {f1_score(y_test, y_pred, average='macro')}")

labels = category
print("--------Classification Report--------")
print(classification_report(y_test, y_pred))
print("--------Confusion Matrix--------")

plt.figure(figsize=(8,8))
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, square=True, fmt="g", cmap="viridis", xticklabels=labels, yticklabels=labels)
ax.set_xlabel("Predicted", fontsize=14)
ax.set_ylabel("True", fontsize=14)
ax.set_title(f"Confusion Matrix {modelRFC_final.__class__.__name__}", fontsize=16, pad=20)

plt.show()