In [1]:
from utilities import * 
import nltk
import numpy as np
nltk.download('stopwords')
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn import tree
from sklearn.metrics import accuracy_score as acc, f1_score, roc_auc_score as auc
from sklearn.multiclass import OneVsRestClassifier

[nltk_data] Downloading package stopwords to /home/koki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
datapath = '/home/koki/Desktop/Data/NLP/arxiv/archive/arxiv-metadata-oai-snapshot.json'
            #'/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
labelmap = {}
sample_rate = 6
abstracts_train, labels_train, labelmap = get_data_and_labels(
                    datapath=datapath,
                    year=2021, 
                    month_start=1,
                    month_end=12,
                    labelmap=labelmap,
                    update_map=True,
                    sample_rate=sample_rate
                    )

abstracts_val, labels_val, _ = get_data_and_labels(
                    datapath=datapath,
                    year=2022, 
                    month_start=1,
                    month_end=6,
                    labelmap=labelmap,
                    update_map=False,
                    sample_rate=sample_rate
                    )

abstracts_test, labels_test, _ = get_data_and_labels(
                    datapath=datapath,
                    year=2022, 
                    month_start=7,
                    month_end=12,
                    labelmap=labelmap,
                    update_map=False,
                    sample_rate=sample_rate
                    )

# processed papers 0
# processed papers 500000
# processed papers 1000000
# processed papers 1500000
# processed papers 2000000
# processed papers 0
# processed papers 500000
# processed papers 1000000
# processed papers 1500000
# processed papers 2000000
# processed papers 0
# processed papers 500000
# processed papers 1000000
# processed papers 1500000
# processed papers 2000000


In [3]:
#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = remove_between_square_brackets(text)
    return text

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def remove_stopwords(text, tokenizer, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        tokens = [token.lower() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

stopword_list=stopwords.words('english')

In [4]:
toktok_tokenizer=ToktokTokenizer()
def clean(abstracts, tokenizer=toktok_tokenizer):
    clean_abstracts = [denoise_text(abstract) for abstract in abstracts] 
    clean_abstracts = [remove_special_characters(abstract) for abstract in clean_abstracts]  
    clean_abstracts = [remove_stopwords(abstract, tokenizer, is_lower_case=True) for abstract in \
                       clean_abstracts]  
    return clean_abstracts
clean_train = clean(abstracts_train)
clean_val = clean(abstracts_val)
clean_test = clean(abstracts_test)

In [5]:
cv=CountVectorizer(min_df=0.01,max_df=0.5,binary=False,ngram_range=(1,1))
cv_train = cv.fit_transform(clean_train)
cv_val = cv.transform(clean_val)
cv_test = cv.transform(clean_test)

assert cv_train.shape[1] == cv_val.shape[1]
assert cv_train.shape[1] == cv_test.shape[1]

In [6]:
X_train = pd.DataFrame(data=cv_train.toarray(),columns = cv.get_feature_names_out())
X_val = pd.DataFrame(data=cv_val.toarray(),columns = cv.get_feature_names_out())
X_test = pd.DataFrame(data=cv_test.toarray(),columns = cv.get_feature_names_out())

In [7]:
def one_hot(y, labelmap):  
    encoded = np.zeros((len(y), len(labelmap))) #len(np.unique(y))))
    for i, label_i in enumerate(y):
        for lb_j in label_i:
            encoded[i, lb_j] = 1       
    return encoded

y_train = one_hot(labels_train, labelmap)
y_val = one_hot(labels_val, labelmap)
y_test = one_hot(labels_test, labelmap)

In [8]:
len(labelmap)

21

In [None]:
best_auc = 0
best_model = None
for depth in range(3, 5):
    print('depth', depth)
    model = tree.DecisionTreeClassifier(max_depth=depth) 
    model = OneVsRestClassifier(model)
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_val) 
    current_auc = auc(y_val, y_pred_prob)
    print('AUC', current_auc)
    if current_auc > best_auc:
        best_auc = current_auc
        best_model = model
    else:
        break

depth 3


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)

auc_micro, auc_macro, f1_micro, f1_macro = \
                auc(y_test, y_pred_prob, average='micro'), auc(y_test, y_pred_prob, average='macro'),\
                f1_score(y_test, y_pred, average='micro'), f1_score(y_test, y_pred, average='macro')

print('micro-AUC: {}, macro-AUC: {}, micro-F1: {}, macro-F1: {}'.format(np.round(auc_micro, 3),\
                                    np.round(auc_macro, 3), np.round(micro, 3), np.round(macro, 3)))