## Model Selection

In [2]:
%pip install imblearn
%pip install xgboost
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')


Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv(r"C:\Users\karsa\My Drive\Extracurriculars\Research\Audax Labs\notebooks\data\preprocessed.csv")

In [4]:
df

Unnamed: 0,Sensor-57,Sensor-134,Sensor-76,Sensor-28,Sensor-164,Sensor-369,Sensor-108,Sensor-81,Sensor-449,Sensor-319,Good/Bad
0,0.927000,1010.787000,-0.011800,5.158300,0.125000,0.002000,-0.104200,-0.081400,0.259300,3.319900,-1.0
1,0.927000,997.806400,0.000500,5.067000,0.077000,0.004600,-0.084200,-0.107400,0.162800,3.077600,-1.0
2,0.927000,997.806400,-0.001900,7.332000,0.150000,0.002400,-0.095100,-0.120400,0.113200,3.821800,-1.0
3,0.923600,1003.021100,-0.030300,5.204000,0.090000,0.004100,0.032800,-0.102400,0.193200,3.689200,-1.0
4,0.927000,1001.379600,-0.019900,7.347700,0.147000,0.001500,-0.076200,-0.111800,0.118800,3.677100,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
2891,0.935256,1006.338356,-0.013324,5.134490,0.134560,0.003128,0.171730,-0.090508,0.249052,2.880244,1.0
2892,0.933079,1011.613741,-0.041175,5.246843,0.125419,0.006995,-0.082471,0.052672,0.182294,2.230297,1.0
2893,0.929047,1010.982112,-0.012599,7.286756,0.132515,0.003094,-0.040525,0.021938,0.206023,4.401918,1.0
2894,0.930195,1008.248190,0.002995,7.152810,0.186947,0.003197,0.005543,-0.008435,0.330398,1.604100,1.0


In [5]:
X = df.iloc[:, :-1]
Y = df.iloc[:, -1:]
Y.replace((-1, 1), (0, 1), inplace=True)
Y.value_counts()


Good/Bad
0.0         1448
1.0         1448
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=30)

In [7]:
from sklearn.metrics import accuracy_score

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    accuracy = accuracy_score(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, accuracy, r2_square

In [8]:
models = {
    "knn": {
        "model": KNeighborsClassifier(),
        "space": {"n_neighbors": list(range(1, 31))}
    }, 
    "decision_tree": {
        "model": DecisionTreeClassifier(),
        "space": {"criterion": ['gini', 'entropy'], "max_depth": range(1, 10), "min_samples_split": range(1, 10), "min_samples_leaf": range(1, 5)},
    },
    "nb": {
        "model": GaussianNB(),
        "space": {'var_smoothing': np.logspace(0,-9, num=100)},
    },
    "svm": {
        "model": SVC(),
        "space": {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']},
    },
    "xgboost": {
        "model": XGBClassifier(),
        "space": {'max_depth': [2,4,6], 'n_estimators': [50,100,200]},
    },
    "random_forest": {
        "model": RandomForestClassifier(),
        "space": {'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy']},
    }
}

#xgboost and random forest
#randomcv to optimize hyperparamters

In [13]:
for model_name, info in models.items():
    model = info["model"]
    space = info["space"]
    cv = 10
    verbose = 1
    search = GridSearchCV(model, space, scoring="accuracy")
    search.fit(X_train, y_train);
    pred = search.predict(X_test)
    
    mae, rmse, accuracy, r2_square=evaluate_model(y_test,pred)

    print(model_name)
    print("Accuracy:", accuracy)
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)
    print(search.best_params_)
    print('='*25)

knn
Accuracy: 0.7353279631760644
RMSE: 0.5144628624341465
MAE: 0.2646720368239356
R2 score -5.8723196881091955
{'n_neighbors': 1}
decision_tree
Accuracy: 0.7905638665132336
RMSE: 0.45764192715131163
MAE: 0.2094361334867664
R2 score 16.22277311636576
{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 4}
nb
Accuracy: 0.6708860759493671
RMSE: 0.5736845161328942
MAE: 0.3291139240506329
R2 score -31.64992795999666
{'var_smoothing': 2.848035868435805e-09}
svm
Accuracy: 0.8032220943613348
RMSE: 0.4435965572890136
MAE: 0.19677790563866512
R2 score 21.286231884057948
{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
xgboost
Accuracy: 0.9263521288837745
RMSE: 0.2713814126211033
MAE: 0.07364787111622555
R2 score 70.53987626070005
{'max_depth': 6, 'n_estimators': 200}
random_forest
Accuracy: 0.8757192174913694
RMSE: 0.3525347961671736
MAE: 0.12428078250863062
R2 score 50.28604118993134
{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 500}


In [10]:
X_train.columns

Index(['Sensor-57', 'Sensor-134', 'Sensor-76', 'Sensor-28', 'Sensor-164',
       'Sensor-369', 'Sensor-108', 'Sensor-81', 'Sensor-449', 'Sensor-319'],
      dtype='object')

### best models: 
xgboost - 92% 
random forest - 87% 

In [15]:
model = XGBClassifier(max_depth=6, n_estimators=200)
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [16]:
import pickle

pickle.dump(model, open('model.pkl', 'wb'))