In [107]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import plotly.express as px
from summarytools import dfSummary
from scipy.stats import chi2_contingency
import joblib
import pickle


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, \
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb 
from sklearn.preprocessing import FunctionTransformer

from sklearn.metrics import accuracy_score, \
    precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance



### Read Data

In [108]:
# Load the raw answers dataset (semicolon-delimited, ISO-8859-1 encoded).
df = pd.read_csv(
    '../data/Math.csv',
    sep=';',
    encoding='ISO-8859-1',
)

### Data Exploration

In [109]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."


In [110]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9546 entries, 0 to 9545
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Student ID       9546 non-null   int64 
 1   Student Country  9546 non-null   object
 2   Question ID      9546 non-null   int64 
 3   Type of Answer   9546 non-null   int64 
 4   Question Level   9546 non-null   object
 5   Topic            9546 non-null   object
 6   Subtopic         9546 non-null   object
 7   Keywords         9546 non-null   object
dtypes: int64(3), object(5)
memory usage: 596.8+ KB


In [111]:
# Summary statistics for the numeric columns (the two ID columns and the 0/1 target).
df.describe()

Unnamed: 0,Student ID,Question ID,Type of Answer
count,9546.0,9546.0,9546.0
mean,775.402263,478.912319,0.468259
std,460.590559,249.244061,0.499018
min,26.0,77.0,0.0
25%,380.0,323.0,0.0
50%,885.0,428.0,0.0
75%,1219.0,571.0,1.0
max,1565.0,1549.0,1.0


In [112]:
# Per-column overview: value frequencies, distinct counts and missing values.
dfSummary(df, is_collapsible=False)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Student ID [int64],Mean (sd) : 775.4 (460.6) min < med < max: 26.0 < 885.0 < 1565.0 IQR (CV) : 839.0 (1.7),372 distinct values,,0 (0.0%)
2,Student Country [object],1. Portugal 2. Lithuania 3. Italy 4. Slovenia 5. Ireland 6. Russian Federation 7. Romania 8. Spain,"5,495 (57.6%) 1,443 (15.1%) 1,358 (14.2%) 755 (7.9%) 300 (3.1%) 107 (1.1%) 60 (0.6%) 28 (0.3%)",,0 (0.0%)
3,Question ID [int64],Mean (sd) : 478.9 (249.2) min < med < max: 77.0 < 428.0 < 1549.0 IQR (CV) : 248.0 (1.9),833 distinct values,,0 (0.0%)
4,Type of Answer [int64],1. 0 2. 1,"5,076 (53.2%) 4,470 (46.8%)",,0 (0.0%)
5,Question Level [object],1. Basic 2. Advanced,"7,844 (82.2%) 1,702 (17.8%)",,0 (0.0%)
6,Topic [object],1. Linear Algebra 2. Fundamental Mathematics 3. Complex Numbers 4. Differentiation 5. Analytic Geometry 6. Statistics 7. Numerical Methods 8. Optimization 9. Real Functions of a single var 10. Integration 11. other,"5,726 (60.0%) 818 (8.6%) 592 (6.2%) 579 (6.1%) 358 (3.8%) 340 (3.6%) 310 (3.2%) 182 (1.9%) 164 (1.7%) 144 (1.5%) 333 (3.5%)",,0 (0.0%)
7,Subtopic [object],"1. Vector Spaces 2. Linear Transformations 3. Complex Numbers 4. Algebraic expressions, Equatio 5. Linear Systems 6. Analytic Geometry 7. Statistics 8. Elementary Geometry 9. Derivatives 10. Numerical Methods 11. other","2,749 (28.8%) 2,127 (22.3%) 592 (6.2%) 496 (5.2%) 420 (4.4%) 358 (3.8%) 340 (3.6%) 322 (3.4%) 317 (3.3%) 310 (3.2%) 1,515 (15.9%)",,0 (0.0%)
8,Keywords [object],"1. Linear application,Linearity 2. Simplify expressions 3. Subspace,Linear combination,Sp 4. Linear independence,Span,Linea 5. Range,Kernel 6. Matrix of a linear transformat 7. Subspace,Span,Linear combinati 8. Basis,Dimension 9. Linear combination,Subspace,Sp 10. Basis 11. other","443 (4.6%) 401 (4.2%) 401 (4.2%) 399 (4.2%) 344 (3.6%) 314 (3.3%) 216 (2.3%) 210 (2.2%) 198 (2.1%) 186 (1.9%) 6,434 (67.4%)",,0 (0.0%)


### Feature Engineering

### Check for Class Imbalance

In [113]:
# Count the rows in each answer class and expose them as columns Class / Count.
label_counts = (
    df['Type of Answer']
    .value_counts()
    .rename_axis('Class')
    .reset_index(name='Count')
)

# One bar per class, annotated with its frequency.
fig = px.bar(
    label_counts,
    x='Class', y='Count',
    color='Class', text='Count',
    title='Label Class Distribution',
    labels={'Class': 'Class Label', 'Count': 'Frequency'},
)

# Show the count above each bar and restrict the x ticks to the two class labels.
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
).update_layout(
    xaxis={'tickmode': 'array', 'tickvals': [0, 1], 'title': "Class Label"},
    yaxis_title="Frequency",
    width=600,
    height=400,
)
fig.show()

The classes are not heavily imbalanced (53.2% vs. 46.8%), so we will not apply resampling techniques such as SMOTE.

In [114]:
"""
We drop the Question ID and Student ID columns since they don't influence or add value to our models
"""
# Modelling frame: every column of df except the two identifier columns.
# NOTE(review): the rationale above (IDs add no value) is not checked anywhere in
# this notebook — confirm before relying on it.
ml_data = df.drop(columns=['Student ID', 'Question ID'])

#### Split Data into Train Test

In [115]:
# Separate the target column from the feature columns.
y = ml_data['Type of Answer']
X = ml_data.drop(columns=['Type of Answer'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### Feature Encoding

In [116]:
# Partition the feature columns by dtype: object columns are treated as categorical,
# every other dtype as numeric.
categorical_features = X.select_dtypes(include=["object"]).columns
# Per the df.info() output earlier in the notebook, the only non-object columns are
# the two IDs and the target, none of which remain in X, so this is expected to be empty.
integer_features = X.select_dtypes(exclude=["object"]).columns

# Transformer applied to the numeric columns: standard scaling only.
numeric_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

# Helper for the pipeline step that turns sparse output into a dense array
def convert_to_dense(x):
    """Return ``x.toarray()`` if ``x`` has a ``toarray`` attribute, else ``x`` unchanged."""
    try:
        densify = x.toarray
    except AttributeError:
        # Nothing to convert: hand the input back untouched.
        return x
    return densify()

# Wrap the densifying helper so it can be used as a pipeline step.
dense_transformer = FunctionTransformer(func=convert_to_dense)

# Transformer applied to the categorical columns: one-hot encoding, first level dropped.
# NOTE(review): with drop="first" and handle_unknown='ignore', a category unseen during
# fit is encoded as all zeros, which is the same encoding as the dropped first
# category — confirm this is acceptable.
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(drop="first", handle_unknown='ignore')),
    ]
)

# Route each column group to its transformer.
features_preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, integer_features),
        ('categorical', categorical_transformer, categorical_features),
    ]
)


####  Train Baseline Models

In [117]:
# Baseline (non-ensemble) classifiers, all evaluated with the same preprocessing.
# DecisionTreeClassifier is seeded (same seed as the train/eval split) so that the
# comparison table is reproducible on Restart & Run All.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(random_state=0),
    GaussianNB(),
]

# One dict of evaluation metrics per classifier; turned into a DataFrame in the next cell.
results = []

for classifier in classifiers:
    # Encode the features, densify the encoded matrix, then fit the classifier.
    pipe = Pipeline(steps=[
        ('preprocessor', features_preprocessor),
        ('to_dense', dense_transformer),
        ('classifier', classifier),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_eval)

    # average='binary' reports precision/recall/F1 for the positive class (label 1).
    results.append({
        'Classifier': classifier.__class__.__name__,
        'Accuracy': accuracy_score(y_eval, y_pred),
        'Precision': precision_score(y_eval, y_pred, average='binary'),
        'Recall': recall_score(y_eval, y_pred, average='binary'),
        'F1_Score': f1_score(y_eval, y_pred, average='binary'),
    })

In [118]:
# Tabulate the baseline metrics, one row per classifier.
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1_Score
0,LogisticRegression,0.575393,0.578073,0.38453,0.461845
1,KNeighborsClassifier,0.550262,0.529487,0.456354,0.490208
2,SVC,0.576963,0.582624,0.377901,0.458445
3,DecisionTreeClassifier,0.574869,0.569507,0.420994,0.484117
4,GaussianNB,0.533508,0.504965,0.78674,0.615119


#### Train on other Ensemble Models

In [119]:
# Ensemble classifiers, evaluated exactly like the baselines.
# These estimators are stochastic; each one is seeded (same seed as the train/eval
# split) so that the comparison table is reproducible on Restart & Run All.
classifiers_ensemble = [
    RandomForestClassifier(random_state=0),
    ExtraTreesClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    xgb.XGBClassifier(random_state=0),
]

# One dict of evaluation metrics per classifier; turned into a DataFrame in the next cell.
results1 = []

for classifier in classifiers_ensemble:
    # Encode the features, densify the encoded matrix, then fit the classifier.
    pipe = Pipeline(steps=[
        ('preprocessor', features_preprocessor),
        ('to_dense', dense_transformer),
        ('classifier', classifier),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_eval)

    # average='binary' reports precision/recall/F1 for the positive class (label 1).
    results1.append({
        'Classifier': classifier.__class__.__name__,
        'Accuracy': accuracy_score(y_eval, y_pred),
        'Precision': precision_score(y_eval, y_pred, average='binary'),
        'Recall': recall_score(y_eval, y_pred, average='binary'),
        'F1_Score': f1_score(y_eval, y_pred, average='binary'),
    })

In [120]:
# Tabulate the ensemble metrics, one row per classifier.
df_results1 = pd.DataFrame(results1)
df_results1

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1_Score
0,RandomForestClassifier,0.571728,0.557616,0.465193,0.507229
1,ExtraTreesClassifier,0.574869,0.568082,0.428729,0.488665
2,AdaBoostClassifier,0.573822,0.592292,0.322652,0.41774
3,GradientBoostingClassifier,0.567016,0.59949,0.259669,0.362375
4,XGBClassifier,0.580628,0.594545,0.361326,0.449485


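From the initial training, GaussianNB achieved the highest F1 score (0.615) and recall (0.787) of all the models, although its accuracy (0.534) was the lowest. Since F1 is our selection metric, we will perform hyper-parameter tuning on GaussianNB to see whether it can be improved further.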

#### Perform Hyper-Parameter Tuning

In [121]:
# Candidate smoothing values for GaussianNB. The 'classifier__' prefix routes the
# parameter to the pipeline step named 'classifier'.
param_grid = {
    'classifier__var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
}


pipe = Pipeline(steps=[
    ('preprocessor', features_preprocessor),
    ('to_dense', dense_transformer),
    ('classifier', GaussianNB())
])

# 10-fold cross-validated grid search; candidates are ranked by F1.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=10,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated value of the `scoring` metric, which is
# F1 here, so it is reported as an F1 score rather than as accuracy.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation F1 Score: {grid_search.best_score_}")

# Evaluate the best pipeline found by the search on the held-out evaluation set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_eval)



print(f"Accuracy: {accuracy_score(y_eval, y_pred):.2f}")
print(f"Precision: {precision_score(y_eval, y_pred, average='binary'):.2f}")
print(f"Recall: {recall_score(y_eval, y_pred, average='binary'):.2f}")
print(f"F1 Score: {f1_score(y_eval, y_pred, average='binary'):.2f}")


Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best Parameters: {'classifier__var_smoothing': 1e-10}
Best Cross-Validation Accuracy: 0.573768045026736
Accuracy: 0.53
Precision: 0.50
Recall: 0.80
F1 Score: 0.62




In [122]:
# Final pipeline: GaussianNB with the var_smoothing value reported as best by the
# grid search above (1e-10).
hyper_pipe = Pipeline(
    steps=[
        ('preprocessor', features_preprocessor),
        ('to_dense', dense_transformer),
        ('classifier', GaussianNB(var_smoothing=1e-10)),
    ]
)

# fit() returns the fitted pipeline itself, so `model` and `hyper_pipe` are the same object.
model = hyper_pipe.fit(X_train, y_train)

# Predictions on the held-out evaluation set and their F1 score.
model1 = model.predict(X_eval)
f1 = f1_score(y_eval, model1)
print(f'The f1 score is {f1}')

The f1 score is 0.6180110968843363


#### Export Pipeline

In [123]:
# Bundle the fitted pipeline in a dict under the "pipeline" key and pickle it.
# NOTE(review): the pipeline contains a FunctionTransformer wrapping the
# notebook-defined convert_to_dense; pickle stores functions by reference, so the
# code that loads this file must be able to resolve convert_to_dense — confirm
# the consumer provides it.
to_export = dict(pipeline=hyper_pipe)

with open('../ml_pipeline.pkl', mode='wb') as pkl_file:
    pickle.dump(to_export, pkl_file)

#### Save Best Model

In [124]:
# Persist the same fitted pipeline object (hyper_pipe) with joblib, without the dict wrapper.
joblib.dump(hyper_pipe, '../best_classification_model.pkl')

['../best_classification_model.pkl']