# Model Making

### Description

This notebook must be run after datasetPreparation.ipynb, because it takes that notebook's output files as its input.

This notebook ONLY trains models. The dataset must NOT be manipulated here; only importing the dataset is allowed.

Train/test splits are done in datasetPreparation.ipynb.

This notebook will output the top-performing ZERO-SHOT models, i.e. models trained without any hyperparameter tuning. No hyperparameter tuning will be done in this notebook.

The following techniques are used:

- Gaussian Naive Bayes
- K-Nearest Neighbors
- Decision Tree
- XGBoost
- LightGBM

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from xgboost import XGBClassifier

### Importing the dataset: train and test sets

In [24]:
# Load the pre-split train/test features and labels exported by
# datasetPreparation.ipynb. No splitting or other dataset manipulation is done
# in this notebook, so only pandas is needed here.
import pandas as pd

# Directory that holds the cleaned CSV exports (relative to the notebook).
DATA_DIR = 'data/cleaned'

X_train = pd.read_csv(f'{DATA_DIR}/X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y_train.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv')

# Fail fast if the exported files are out of sync with each other; otherwise
# the mismatch only surfaces later as an opaque error inside a model's fit().
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
assert list(X_train.columns) == list(X_test.columns), "X_train and X_test have different columns"

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (69999, 6)
X_test shape: (30000, 6)
y_train shape: (69999, 1)
y_test shape: (30000, 1)


### Training of model

##### Naive Bayes

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Test-set accuracy per model name; the training cells below add their own
# entry. Re-running this cell resets the collection to empty.
model_accuracies = {}

# Gaussian Naive Bayes baseline with default settings.
gnb = GaussianNB()
# y_train is loaded as a single-column DataFrame. scikit-learn expects labels
# of shape (n_samples,) and raises DataConversionWarning for a column vector,
# so flatten the labels before fitting.
gnb.fit(X_train, y_train.to_numpy().ravel())
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
model_accuracies['Naive Bayes'] = accuracy
print(f"Naive Bayes Accuracy: {accuracy:.2f}")

Naive Bayes Accuracy: 0.61


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors baseline with default settings.
knn = KNeighborsClassifier()
# y_train is loaded as a single-column DataFrame. scikit-learn expects labels
# of shape (n_samples,) and raises DataConversionWarning for a column vector,
# so flatten the labels before fitting.
knn.fit(X_train, y_train.to_numpy().ravel())
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
model_accuracies['K-Nearest Neighbors'] = accuracy
print(f"K-Nearest Neighbors Accuracy: {accuracy:.2f}")


K-Nearest Neighbors Accuracy: 0.59


In [16]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline with default hyperparameters. The seed is fixed
# because the tree permutes features at each split; without a fixed
# random_state the fitted tree (and the reported accuracy) can change between
# runs.
dt = DecisionTreeClassifier(random_state=42)
# Pass the labels as a 1-D array of shape (n_samples,), the form the
# scikit-learn estimator API documents, instead of a single-column DataFrame.
dt.fit(X_train, y_train.to_numpy().ravel())
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
model_accuracies['Decision Tree'] = accuracy
print(f"Decision Tree Accuracy: {accuracy:.2f}")


Decision Tree Accuracy: 0.99


In [17]:
from xgboost import XGBClassifier

# XGBoost baseline: construct with default hyperparameters and fit in one step
# (fit returns the estimator itself).
xgb = XGBClassifier().fit(X_train, y_train)

# Score on the held-out test set and record the result under the model's name.
y_pred = xgb.predict(X_test)
accuracy = model_accuracies['XGBoost'] = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.2f}")

XGBoost Model Accuracy: 0.98


In [19]:
import lightgbm as lgb

# LightGBM baseline with default hyperparameters.
lgbm = lgb.LGBMClassifier()
# y_train is loaded as a single-column DataFrame; flatten it to shape
# (n_samples,) so the label encoding done during fit does not raise
# DataConversionWarning for a column vector.
lgbm.fit(X_train, y_train.to_numpy().ravel())
y_pred = lgbm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
model_accuracies['LightGBM'] = accuracy
print(f"LightGBM Model Accuracy: {accuracy:.2f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 125505, number of used features: 9
[LightGBM] [Info] Start training from score -0.693394
[LightGBM] [Info] Start training from score -1.417870
[LightGBM] [Info] Start training from score -1.355206
LightGBM Model Accuracy: 0.98


In [20]:
# Summarise the test-set accuracy recorded for every trained model.
# The loop variables are named distinctly so this cell does not overwrite the
# `accuracy` value left behind by the most recently run training cell.
print("\nModel Accuracies:")
for model_name, model_accuracy in model_accuracies.items():
    print(f"{model_name}: {model_accuracy:.2f}")


Model Accuracies:
Naive Bayes: 0.61
Logistic Regression: 0.50
K-Nearest Neighbors: 0.59
Decision Tree: 0.99
XGBoost: 0.98
LightGBM: 0.98


In [21]:
# Rank the (model name, accuracy) pairs from highest to lowest accuracy and
# keep at most the five best.
sorted_models = sorted(
    model_accuracies.items(),
    key=lambda name_and_accuracy: name_and_accuracy[1],
    reverse=True,
)[:5]


In [22]:
# Lookup from the display name used as a key in model_accuracies to the
# corresponding fitted estimator, so ranked names can be resolved back to
# model objects when they are saved.
models = dict([
    ('Naive Bayes', gnb),
    ('K-Nearest Neighbors', knn),
    ('Decision Tree', dt),
    ('XGBoost', xgb),
    ('LightGBM', lgbm),
])

In [None]:
import pickle

# Persist every top-ranked fitted estimator to '<model name>.pkl' in the
# current working directory.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# files from a location you trust.

# Verify up front that every ranked name has an estimator registered in
# `models`. Without this check, a name present in model_accuracies but absent
# from `models` raises a bare KeyError mid-loop, after some files have already
# been written.
missing = [name for name, _ in sorted_models if name not in models]
if missing:
    raise KeyError(f"No fitted estimator registered in `models` for: {missing}")

for model_name, accuracy in sorted_models:
    model = models[model_name]
    with open(f'{model_name}.pkl', 'wb') as file:
        pickle.dump(model, file)
    print(f"Saved {model_name} with accuracy {accuracy:.2f}")