# Model Optimization & Advanced Models

## Import the Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import pickle

## Load the Dataset

In [2]:
dataset = pd.read_csv('Datasets/final_dataset.csv')
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10
0,15,0,1,34,55,14,42,43,43,79,79
1,15,1,34,55,33,14,42,43,43,79,79
2,15,0,34,55,33,14,42,43,43,79,79
3,15,0,1,55,33,14,42,43,43,79,79
4,15,0,1,34,33,14,42,43,43,79,79


## Feature Selection

In [3]:
X = dataset.iloc[:, 1:].values
y = dataset['Disease'].values

## Split the dataset into training and testing sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Standardize the features for SVM models.

In [5]:
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.fit_transform(X_test)

## Function for Hyperparameter Tuning

In [6]:
def tune_model(model, param_grid, X, y, cv=5, search='grid'):
    if search == 'grid':
        searcher = GridSearchCV(model, param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
    elif search == 'random':
        searcher = RandomizedSearchCV(model, param_grid, scoring='accuracy', cv=cv, n_jobs=-1, n_iter=50)
    searcher.fit(X, y)
    return searcher.best_estimator_, searcher.best_params_

## Models and Hyperparameter Tuning

### Logistic Regression

In [7]:
log_model = LogisticRegression(multi_class='ovr', solver='liblinear', random_state=42)
log_model.fit(X_train, y_train)
log_y_pred = log_model.predict(X_test)



### KNN Classifier

In [8]:
knn_params = {'n_neighbors': range(3, 10), 'metric': ['minkowski'], 'p': [1, 2]}
knn_model, knn_best_params = tune_model(KNeighborsClassifier(), knn_params, X_train, y_train)

### SVM Classifier

In [9]:
svm_params = {'C': [0.1, 1, 10]}
svm_model, svm_best_params = tune_model(SVC(kernel='linear', random_state=42), svm_params, X_train_scaled, y_train)

### Kernal SVM Classifier

In [10]:
kernal_svm_params = {'C': [1, 10, 100], 'gamma': [0.001, 0.01, 0.1]}
kernal_svm_model, kernal_svm_best_params = tune_model(SVC(kernel='rbf', random_state=42), kernal_svm_params, X_train_scaled, y_train)

### Naive Bayes Classifier

In [11]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_y_pred = nb_model.predict(X_test)

### Decision Tree Classifier

In [12]:
dt_params = {'max_depth': range(3, 15), 'min_samples_split': [2, 5, 10]}
dec_tree_model, dec_tree_best_params = tune_model(DecisionTreeClassifier(random_state=42), dt_params, X_train, y_train)

### Random Forest Classifier

In [13]:
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [10, 15, 20], 'min_samples_split': [2, 5, 10]}
ran_forest_model, ran_forest_best_params = tune_model(RandomForestClassifier(random_state=42), rf_params, X_train, y_train)

### XGBoost

In [14]:
xgb_params = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2]}
xgb_model, xgb_best_params = tune_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), xgb_params, X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



### LightGBM

In [15]:
lgb_params = {'num_leaves': [31, 50, 70], 'learning_rate': [0.01, 0.05, 0.1], 'n_estimators': [50, 100, 200]}
lgb_model, lgb_best_params = tune_model(LGBMClassifier(random_state=42), lgb_params, X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 355
[LightGBM] [Info] Number of data points in the train set: 3936, number of used features: 10
[LightGBM] [Info] Start training from score -3.652947
[LightGBM] [Info] Start training from score -3.778111
[LightGBM] [Info] Start training from score -3.713572
[LightGBM] [Info] Start training from score -3.724043
[LightGBM] [Info] Start training from score -3.713572
[LightGBM] [Info] Start training from score -3.703209
[LightGBM] [Info] Start training from score -3.812012
[LightGBM] [Info] Start training from score -3.703209
[LightGBM] [Info] Start training from score -3.682800
[LightGBM] [Info] Start training from score -3.623960
[LightGBM] [Info] Start training from score -3.703209
[LightGBM] [Info] Start training from score -3.734625
[

## Evaluate Models
### Function to calculate and print evaluation metrics for a given model.

In [16]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"\n{model_name} Evaluation Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

### Evaluate each model

In [17]:
evaluate_model(y_test, log_y_pred, "Logistic Regression")


Logistic Regression Evaluation Metrics:
Accuracy: 0.8404
Precision: 0.8597
Recall: 0.8404
F1 Score: 0.8423
Confusion Matrix:
[[ 8  0  0 ...  0  0  0]
 [ 0 20  0 ...  0  0  0]
 [ 0  0 20 ...  2  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 19  0]
 [ 0  0  0 ...  0  0 28]]


In [18]:
evaluate_model(y_test, knn_model.predict(X_test), "K-Nearest Neighbors")


K-Nearest Neighbors Evaluation Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [19]:
evaluate_model(y_test, svm_model.predict(X_test_scaled), "SVM (Linear Kernel)")


SVM (Linear Kernel) Evaluation Metrics:
Accuracy: 0.9787
Precision: 0.9811
Recall: 0.9787
F1 Score: 0.9789
Confusion Matrix:
[[16  0  0 ...  0  0  0]
 [ 0 26  0 ...  0  0  0]
 [ 0  0 23 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 19  0]
 [ 0  0  0 ...  0  0 34]]


In [20]:
evaluate_model(y_test, kernal_svm_model.predict(X_test_scaled), "SVM (RBF Kernel)")


SVM (RBF Kernel) Evaluation Metrics:
Accuracy: 0.9990
Precision: 0.9990
Recall: 0.9990
F1 Score: 0.9990
Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [21]:
evaluate_model(y_test, nb_y_pred, "Naive Bayes")


Naive Bayes Evaluation Metrics:
Accuracy: 0.8364
Precision: 0.8819
Recall: 0.8364
F1 Score: 0.8435
Confusion Matrix:
[[ 8  0  0 ...  0  0  0]
 [ 0 22  8 ...  0  0  0]
 [ 0  0 22 ...  2  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 9  0  0 ...  0  9  0]
 [ 0  0  0 ...  0  0 30]]


In [22]:
evaluate_model(y_test, dec_tree_model.predict(X_test), "Decision Tree")


Decision Tree Evaluation Metrics:
Accuracy: 0.9837
Precision: 0.9863
Recall: 0.9837
F1 Score: 0.9841
Confusion Matrix:
[[16  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 29]]


In [23]:
evaluate_model(y_test, ran_forest_model.predict(X_test), "Random Forest")


Random Forest Evaluation Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [24]:
evaluate_model(y_test, xgb_model.predict(X_test), "XGBoost")


XGBoost Evaluation Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [25]:
evaluate_model(y_test, lgb_model.predict(X_test), "LightGBM")


LightGBM Evaluation Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


## Compare Model Performance

In [26]:
models = pd.DataFrame({
    'Classifier': ['Logistic Regression', 'K-Nearest Neighbors', 'SVM (Linear Kernel)', 'SVM (RBF Kernel)',
                   'Naive Bayes', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM'],
    'Accuracy': [
        accuracy_score(y_test, log_y_pred),
        accuracy_score(y_test, knn_model.predict(X_test)),
        accuracy_score(y_test, svm_model.predict(X_test_scaled)),
        accuracy_score(y_test, kernal_svm_model.predict(X_test_scaled)),
        accuracy_score(y_test, nb_y_pred),
        accuracy_score(y_test, dec_tree_model.predict(X_test)),
        accuracy_score(y_test, ran_forest_model.predict(X_test)),
        accuracy_score(y_test, xgb_model.predict(X_test)),
        accuracy_score(y_test, lgb_model.predict(X_test))
    ]
})

In [27]:
models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Classifier,Accuracy
1,K-Nearest Neighbors,1.0
6,Random Forest,1.0
7,XGBoost,1.0
8,LightGBM,1.0
3,SVM (RBF Kernel),0.998984
5,Decision Tree,0.98374
2,SVM (Linear Kernel),0.978659
0,Logistic Regression,0.840447
4,Naive Bayes,0.836382


## Saving the model as a pkl file

In [30]:
with open('Pickle files/model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)