In [1]:
# Step 1: Load and preprocess the data
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
import kagglehub

# Download latest version of the dataset
path = kagglehub.dataset_download("shivanandmn/multilabel-classification-dataset")

print("Path to dataset files:", path)

# Load the dataset
url_train = f'{path}/train.csv'
train_set  = pd.read_csv(url_train)

Path to dataset files: /home/lolli/.cache/kagglehub/datasets/shivanandmn/multilabel-classification-dataset/versions/1


In [2]:
train_set.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [3]:

train_set.drop(columns=['ID'], inplace=True)
# Preprocess the data

# Combine the category columns into a list of categories for each paper
categories = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
train_set['Categories'] = train_set[categories].apply(lambda row: [cat for cat in categories if row[cat] == 1], axis=1)

# modify the text to preprocess it, remove special characters, convert to lowercase, etc.7
train_set['TITLE'] = train_set['TITLE'].str.lower()
train_set['TITLE'] = train_set['TITLE'].str.replace(r'[^\w\s]', '')
train_set['TITLE'] = train_set['TITLE'].str.replace(r'\d+', '')
train_set['TITLE'] = train_set['TITLE'].str.strip()
train_set['ABSTRACT'] = train_set['ABSTRACT'].str.lower()
train_set['ABSTRACT'] = train_set['ABSTRACT'].str.replace(r'[^\w\s]', '')
train_set['ABSTRACT'] = train_set['ABSTRACT'].str.replace(r'\d+', '')
train_set['ABSTRACT'] = train_set['ABSTRACT'].str.strip()

# merge the abstract and title columns
train_set['TEXT'] = train_set['TITLE'] + ' ' + train_set['ABSTRACT']


train_set.head()

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,Categories,TEXT
0,reconstructing subject-specific effect maps,predictive models allow subject-specific infer...,1,0,0,0,0,0,[Computer Science],reconstructing subject-specific effect maps pr...
1,rotation invariance neural network,rotation invariance and translation invariance...,1,0,0,0,0,0,[Computer Science],rotation invariance neural network rotation in...
2,spherical polyharmonics and poisson kernels fo...,we introduce and develop the notion of spheric...,0,0,1,0,0,0,[Mathematics],spherical polyharmonics and poisson kernels fo...
3,a finite element approximation for the stochas...,the stochastic landau--lifshitz--gilbert (llg)...,0,0,1,0,0,0,[Mathematics],a finite element approximation for the stochas...
4,comparative study of discrete wavelet transfor...,fourier-transform infra-red (ftir) spectra of ...,1,0,0,1,0,0,"[Computer Science, Statistics]",comparative study of discrete wavelet transfor...


In [4]:

# Binarize the labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train_set['Categories'])

# Vectorize the text data
tfidf = TfidfVectorizer(max_features=10000)
X = tfidf.fit_transform(train_set['TEXT'])


In [5]:
y

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1]])

In [6]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1852627 stored elements and shape (20972, 10000)>

In [7]:

# Step 2: Implement a machine learning classifier
classifier = LogisticRegression()


In [8]:

from sklearn.multiclass import OneVsRestClassifier

# Step 3: Evaluate the classifier using suitable metrics
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
accuracy_scores = []

ovr_classifier = OneVsRestClassifier(classifier)

for train_index, test_index in kf.split(train_set):
    X_train_raw, X_test_raw = train_set['TEXT'].iloc[train_index], train_set['TEXT'].iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = tfidf.transform(X_train_raw)
    X_test = tfidf.transform(X_test_raw)
    
    ovr_classifier.fit(X_train, y_train)
    y_pred = ovr_classifier.predict(X_test)
    
    f1_scores.append(f1_score(y_test, y_pred, average='micro'))
    accuracy_scores.append(accuracy_score(y_test, y_pred))


In [9]:

# Step 4: Split data for k-fold cross-validation
# (Already done in the loop above)

# Step 5: Run the evaluation
print("F1 Scores: ", f1_scores)


F1 Scores:  [0.801497369486038, 0.8024402643619726, 0.8073038865653371, 0.8090365717759093, 0.8028140293637847]


In [10]:
print("Accuracy Scores: ", accuracy_scores)

# Step 6: Compare with dedicated baselines
# (Assuming we have baseline scores to compare with)

# Step 7: Analyze the obtained results
print("Average F1 Score: ", sum(f1_scores) / len(f1_scores))
print("Average Accuracy Score: ", sum(accuracy_scores) / len(accuracy_scores))

Accuracy Scores:  [0.6433849821215734, 0.6512514898688916, 0.6549833094897473, 0.6599904625655698, 0.6504530281354316]
Average F1 Score:  0.8046184243106083
Average Accuracy Score:  0.6520126544362428


In [11]:
test_set = pd.read_csv(f'{path}/test.csv')
test_set.drop(columns=['ID'], inplace=True)



In [12]:
test_set.head()

Unnamed: 0,TITLE,ABSTRACT
0,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...


In [13]:
# generate a classification using the trained model to predict the categories of the test set and show some results, but before doing so preprocess the data as done before
test_set['TITLE'] = test_set['TITLE'].str.lower()
test_set['TITLE'] = test_set['TITLE'].str.replace(r'[^\w\s]', '')
test_set['TITLE'] = test_set['TITLE'].str.replace(r'\d+', '')
test_set['TITLE'] = test_set['TITLE'].str.strip()
test_set['ABSTRACT'] = test_set['ABSTRACT'].str.lower()
test_set['ABSTRACT'] = test_set['ABSTRACT'].str.replace(r'[^\w\s]', '')
test_set['ABSTRACT'] = test_set['ABSTRACT'].str.replace(r'\d+', '')
test_set['ABSTRACT'] = test_set['ABSTRACT'].str.strip()

test_set['TEXT'] = test_set['TITLE'] + ' ' + test_set['ABSTRACT']

X_test = tfidf.transform(test_set['TEXT'])
y_pred_test = ovr_classifier.predict(X_test)
y_pred_labels = mlb.inverse_transform(y_pred_test)

test_set['Categories'] = y_pred_labels
test_set.head()

# Save the results (only the predicted categories) to a CSV file
test_set[['Categories']].to_csv('predicted_categories.csv', index=False)


