In [1]:
# Step 1: Load and preprocess the data

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import kagglehub


In [2]:

# Download latest version of the dataset
path = kagglehub.dataset_download("shivanandmn/multilabel-classification-dataset")

print("Path to dataset files:", path)

# Load the dataset
url_train = f'{path}/train.csv'
train_set = pd.read_csv(url_train)


Path to dataset files: /home/lolli/.cache/kagglehub/datasets/shivanandmn/multilabel-classification-dataset/versions/1


In [3]:

train_set.drop(columns=['ID'], inplace=True)

# Combine the category columns into a list of categories for each paper
categories = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
train_set['Categories'] = train_set[categories].apply(lambda row: [cat for cat in categories if row[cat] == 1], axis=1)

# Preprocess the text data
train_set['TITLE'] = train_set['TITLE'].str.lower().str.replace(r'[^\\w\\s]', '').str.replace(r'\\d+', '').str.strip()
train_set['ABSTRACT'] = train_set['ABSTRACT'].str.lower().str.replace(r'[^\\w\\s]', '').str.replace(r'\\d+', '').str.strip()
train_set['TEXT'] = train_set['TITLE'] + ' ' + train_set['ABSTRACT']


In [4]:

# Binarize the labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train_set['Categories'])

# Vectorize the text data
tfidf = TfidfVectorizer(max_features=10000)
X = tfidf.fit_transform(train_set['TEXT'])


In [5]:

# Step 2: Implement a machine learning classifier
# Try different classifiers and hyperparameters
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])

parameters = {
    'tfidf__max_features': [10000, 20000],
    'clf__estimator__C': [0.1, 1, 10],
    'clf__estimator__solver': ['lbfgs', 'liblinear']
}


In [6]:

grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='f1_micro')
grid_search.fit(train_set['TEXT'], y)

print("Best parameters set found on development set:")
print(grid_search.best_params_)


Best parameters set found on development set:
{'clf__estimator__C': 10, 'clf__estimator__solver': 'lbfgs', 'tfidf__max_features': 20000}


In [11]:
# Evaluate the classifier using suitable metrics
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
accuracy_scores = []

In [18]:
X = train_set['TEXT']

In [19]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    grid_search.best_estimator_.fit(X_train, y_train)
    y_pred = grid_search.best_estimator_.predict(X_test)
    
    f1_scores.append(f1_score(y_test, y_pred, average='micro'))
    accuracy_scores.append(accuracy_score(y_test, y_pred))

In [20]:

print("F1 Scores: ", f1_scores)
print("Accuracy Scores: ", accuracy_scores)
print("Average F1 Score: ", sum(f1_scores) / len(f1_scores))
print("Average Accuracy Score: ", sum(accuracy_scores) / len(accuracy_scores))


F1 Scores:  [0.8040832351786416, 0.8099271223163286, 0.8148294222046933, 0.8099743741375912, 0.8110306517210594]
Accuracy Scores:  [0.6474374255065555, 0.6519666269368296, 0.6587982832618026, 0.6585598474010491, 0.6607057701478303]
Average F1 Score:  0.8099689611116627
Average Accuracy Score:  0.6554935906508135


In [21]:

# Load and preprocess the test set
test_set = pd.read_csv(f'{path}/test.csv')
test_set.drop(columns=['ID'], inplace=True)

test_set['TITLE'] = test_set['TITLE'].str.lower().str.replace(r'[^\\w\\s]', '').str.replace(r'\\d+', '').str.strip()
test_set['ABSTRACT'] = test_set['ABSTRACT'].str.lower().str.replace(r'[^\\w\\s]', '').str.replace(r'\\d+', '').str.strip()
test_set['TEXT'] = test_set['TITLE'] + ' ' + test_set['ABSTRACT']


In [23]:

X_test = test_set['TEXT']
y_pred_test = grid_search.best_estimator_.predict(X_test)
y_pred_labels = mlb.inverse_transform(y_pred_test)


In [25]:

test_set.drop(columns=['TITLE'], inplace=True)
test_set.drop(columns=['ABSTRACT'], inplace=True)
test_set['Categories'] = y_pred_labels
test_set.head()


Unnamed: 0,TEXT,Categories
0,closed-form marginal likelihood in gamma-poiss...,"(Statistics,)"
1,laboratory mid-ir spectra of equilibrated and ...,"(Physics,)"
2,case for static amsdu aggregation in wlans fra...,"(Computer Science,)"
3,the $gaia$-eso survey: the inner disk intermed...,"(Physics,)"
4,witness-functions versus interpretation-functi...,"(Computer Science,)"


In [26]:

# Save the results (only the predicted categories) to a CSV file
test_set[['Categories']].to_csv('predicted_categories.csv', index=False)