In [4]:
import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay



In [5]:
# Parse the CSV as semicolon-separated, Latin-1 encoded text with the Python parser engine.
read_options = {'sep': ';', 'engine': 'python', 'encoding': 'latin1'}
df = pd.read_csv('MathDataset.csv', **read_options)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."


### Data cleaning / setup

In [6]:
# Normalise the header names: remove leading/trailing whitespace (spaces, newlines).
df.columns = [col.strip() for col in df.columns]

# Categorical fields
cats = ['Student Country', 'Question Level', 'Topic', 'Subtopic']


def _as_text(series, placeholder):
    """Replace missing entries of `series` with `placeholder` and cast every value to str."""
    return series.fillna(placeholder).astype(str)


# 'Student ID' is only an identifier, so it is left out of the modelling frame.
# drop() returns a new frame; the column assignments below do not touch `df`.
new_df = df.drop(columns=['Student ID'])

# Keywords feed a text vectorizer later on: missing entries become the empty
# string and every value is stored as str.
new_df['Keywords'] = _as_text(new_df['Keywords'], '')

# Missing categories get the explicit label 'missing' instead of NaN.
for c in cats:
    new_df[c] = _as_text(new_df[c], 'missing')

# Make sure 'Question ID' is numeric even if the CSV parser read it as text.
new_df['Question ID'] = pd.to_numeric(new_df['Question ID'])

### Feature Preprocessing (dependent and independent variables)

In [7]:
target_col = 'Type of Answer'

# dependent variable: the answer label, cast to integer
y = new_df[target_col].astype(int)

# independent variables: every column except the target
X = new_df.drop(columns=[target_col])

### Data Splitting

In [8]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so that re-running the notebook yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Columns transformation

- numeric: StandardScaler (standardizes features by removing the mean and scaling to unit variance) 
- categorical: OneHotEncoder  (creates binary columns for each category)
- text: TfidfVectorizer on 'Keywords' (converts text documents to a matrix of TF-IDF features)


In [9]:
num_cols = ['Question ID']
cat_cols = cats
text_col = 'Keywords'

# One (name, transformer, columns) triple per feature family.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
# text_col is given as a plain string rather than a list, so the vectorizer
# receives the single 'Keywords' column as a 1-D sequence of documents.
text_step = ('txt', TfidfVectorizer(max_features=2000, ngram_range=(1, 2)), text_col)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step, text_step]
)


### Definition of Model Pipelines

In [10]:
def _with_preprocessing(classifier):
    """Return a Pipeline made of a fresh copy of `preprocessor` followed by `classifier`.

    Pipeline.fit fits its steps in place, so handing the same `preprocessor`
    object to several pipelines would make them share one fitted transformer:
    refitting any single pipeline would silently change the others. clone()
    gives each pipeline its own unfitted copy with identical parameters.
    """
    return Pipeline([
        ('pre', clone(preprocessor)),
        ('clf', classifier),
    ])


# Logistic Regression
pipe_logreg = _with_preprocessing(LogisticRegression(max_iter=1000))

# K-Nearest Neighbors
pipe_knn = _with_preprocessing(KNeighborsClassifier())

# XGBoost
# `use_label_encoder` is no longer passed: the installed xgboost ignores it and
# only emits a "Parameters: { "use_label_encoder" } are not used." warning.
pipe_xgb = _with_preprocessing(XGBClassifier(eval_metric='logloss'))

models = {
    "Logistic Regression": pipe_logreg,
    "KNN": pipe_knn,
    "XGBoost": pipe_xgb
}



### Evaluate function

In [11]:
def evaluate(model, X_eval=None, y_eval=None):
    """Score a fitted binary classifier on a labelled evaluation set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_eval, y_eval : optional features and true labels to score on. When both
        are omitted, the notebook-level ``X_test`` / ``y_test`` split is used,
        which is the behaviour of the original single-argument call.

    Returns
    -------
    dict with the keys 'Accuracy', 'F1' and 'AUC', each a plain Python float.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied, since mixing
        caller-provided data with the global test split is almost certainly a
        mistake.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        X_eval, y_eval = X_test, y_test

    y_pred = model.predict(X_eval)
    # Column 1 of predict_proba is the probability of the second entry in
    # model.classes_, i.e. the positive class for 0/1 labels.
    y_prob = model.predict_proba(X_eval)[:, 1]

    # float() keeps the printed dict free of np.float64(...) wrappers.
    return {
        'Accuracy': float(accuracy_score(y_eval, y_pred)),
        'F1': float(f1_score(y_eval, y_pred)),
        'AUC': float(roc_auc_score(y_eval, y_prob)),
    }

### Training and evaluation of each model

In [12]:


# Fit every pipeline on the training split and collect one metrics row per model.
results = []
for name in models:
    model = models[name]
    print(f"Training: {name}")
    # Pipeline.fit returns the fitted pipeline itself, so it can be scored directly.
    metrics = evaluate(model.fit(X_train, y_train))
    print(metrics)
    row = {'Model': name}
    row.update(metrics)
    results.append(row)



Training: Logistic Regression
{'Accuracy': 0.5575916230366492, 'F1': 0.465528146742568, 'AUC': np.float64(0.5795455045887721)}
Training: KNN
{'Accuracy': 0.5575916230366492, 'F1': 0.5190665907797382, 'AUC': np.float64(0.5823760547129595)}
Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Accuracy': 0.569109947643979, 'F1': 0.47209749839640797, 'AUC': np.float64(0.5904515448572284)}


### Tabulate results

In [13]:
# One row per model, one column per metric.
results_table = pd.DataFrame(data=results)
results_table

Unnamed: 0,Model,Accuracy,F1,AUC
0,Logistic Regression,0.557592,0.465528,0.579546
1,KNN,0.557592,0.519067,0.582376
2,XGBoost,0.56911,0.472097,0.590452
