# Model Development

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report as cr

In [17]:
# Restore the feature matrix X and the label vector Y that were persisted with
# IPython's `%store` magic (presumably by an earlier preprocessing notebook —
# TODO confirm which notebook produces them).
# Keep comments on their own lines: a magic treats the rest of its line as arguments.
%store -r X
%store -r Y

In [18]:
# Sanity check: every feature row needs exactly one label. Fail fast here with a
# clear message instead of relying on a visual comparison of the two numbers.
assert X.shape[0] == len(Y), f"X has {X.shape[0]} rows but Y has {len(Y)} labels"
X.shape, len(Y)

((469, 4096), 469)

In [11]:
# Hold out 10% of the samples for evaluation; the fixed seed keeps the split
# identical across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# held-out part are left to chance — consider stratifying on Y.
X_train, X_test, Y_train, Y_test = tts(
    X,
    Y,
    test_size=0.1,
    random_state=123,
)

In [19]:
# Baseline model: standardise the features, then fit a linear-kernel SVM.
C = 1000
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(C=C, kernel="linear")),
    ]
).fit(X_train, Y_train)  # fit() returns the pipeline itself

# Mean accuracy of the fitted baseline on the held-out split.
accuracy = pipeline.score(X_test, Y_test)
print(f"Accuracy of the classifier: {accuracy*100:.2f}%")

Accuracy of the classifier: 89.36%


In [20]:
# Per-class precision, recall and F1 of the fitted pipeline on the held-out split.
print(
    cr(
        y_true=Y_test,
        y_pred=pipeline.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.80      0.67      0.73         6
           2       1.00      0.90      0.95        10
           3       1.00      1.00      1.00         3
           4       0.91      1.00      0.95        10
           5       0.75      1.00      0.86         6

    accuracy                           0.89        47
   macro avg       0.89      0.90      0.89        47
weighted avg       0.90      0.89      0.89        47



In [21]:
# Candidate estimators and the hyper-parameter grids to search for each of them.
# Every grid key is the lowercased estimator class name followed by "__", which is
# how a `make_pipeline(...)` pipeline names its steps.

# One seed for every candidate so the search results are reproducible; the value
# matches the seed used for the train/test split.
RANDOM_STATE = 123

models_and_params = {
    "svm": {
        # For SVC the seed only matters for the probability estimates enabled by
        # probability=True.
        "model": SVC(gamma="auto", probability=True, random_state=RANDOM_STATE),
        "params": {
            "svc__C": [1, 10, 100, 1000],
            "svc__kernel": ["rbf", "linear"]
        }
    },

    "logistic_regression": {
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
        # confirm against the installed version before upgrading.
        "model": LogisticRegression(
            solver="liblinear", multi_class="auto", random_state=RANDOM_STATE
        ),
        "params": {
            "logisticregression__C": [1, 5, 10]
        }
    },

    "random_forest": {
        # Without a seed the forest is rebuilt differently on every run, so the
        # reported best score/params would not be repeatable.
        "model": RandomForestClassifier(random_state=RANDOM_STATE),
        "params": {
            "randomforestclassifier__n_estimators": [1, 5, 10]
        }
    }
}

In [None]:
# Grid-search every candidate in `models_and_params` with 5-fold cross-validation
# on the training split, collecting the best score/params per algorithm and
# keeping each refitted best estimator for later use.
cv = 5
scores = []
best_estimators = {}
for algo, mp in models_and_params.items():
    # Use a loop-local name here. Rebinding `pipeline` would replace the fitted
    # baseline pipeline from the earlier cell with a never-fitted one (GridSearchCV
    # fits clones, not the object it is given), so re-running the
    # classification-report cell afterwards would fail.
    candidate_pipeline = make_pipeline(StandardScaler(), mp["model"])
    classifier = GridSearchCV(candidate_pipeline, mp["params"], cv=cv, return_train_score=False)
    classifier.fit(X_train, Y_train)
    scores.append({
        "model": algo,
        "best_score": classifier.best_score_,
        "best_params": classifier.best_params_
    })

    best_estimators[algo] = classifier.best_estimator_

# One row per algorithm: its name, best mean CV score and best parameter set.
df = pd.DataFrame(scores)
df