# Diabetes Prediction Model

## Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Loading the Dataset

In [2]:
data = pd.read_csv('diabetes_risk_prediction_dataset.csv')
data.head() 

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


## Data Cleaning and Preprocessing

### Checking for Missing and Duplicate Values

In [3]:
data.isnull().sum()  # Check for missing values
data.duplicated().any()  # Check for duplicate rows

True

### Dropping Duplicates

In [4]:
data.drop_duplicates(inplace=True)
data.duplicated().any()  # Verify no duplicates remain

False

### Understanding Data Types 

In [5]:
data.dtypes 

Age                    int64
Gender                object
Polyuria              object
Polydipsia            object
sudden weight loss    object
weakness              object
Polyphagia            object
Genital thrush        object
visual blurring       object
Itching               object
Irritability          object
delayed healing       object
partial paresis       object
muscle stiffness      object
Alopecia              object
Obesity               object
class                 object
dtype: object

## Encoding Categorical Data

In [6]:
le = LabelEncoder()
columns_to_encode = [col for col in data.columns if col != 'Age']
for column in columns_to_encode:
    data[column] = le.fit_transform(data[column]) 

## Splitting Dataset into Features and Target

In [7]:
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values  # Target

## Splitting Dataset into Training and Testing Sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Model Training

### Logistic Regression

In [10]:
# Hyperparameter tuning
logistic_params = {'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [500, 1000, 5000]}
logistic_grid = GridSearchCV(LogisticRegression(solver='lbfgs', random_state=0), logistic_params, scoring='accuracy', cv=5)
logistic_grid.fit(X_train, y_train)
best_logistic_model = logistic_grid.best_estimator_
best_logistic_accuracy = logistic_grid.best_score_

### K-Nearest Neighbors

In [11]:
knn_params = {'n_neighbors': [3, 5, 7, 9], 'metric': ['minkowski', 'euclidean', 'manhattan']}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, scoring='accuracy', cv=5)
knn_grid.fit(X_train, y_train)
best_knn_model = knn_grid.best_estimator_
best_knn_accuracy = knn_grid.best_score_

### Support Vector Machine (Linear Kernel)

In [12]:
svm_linear_params = {'C': [0.1, 1, 10, 100]}
svm_linear_grid = GridSearchCV(SVC(kernel='linear', random_state=0), svm_linear_params, scoring='accuracy', cv=5)
svm_linear_grid.fit(X_train, y_train)
best_svm_linear_model = svm_linear_grid.best_estimator_
best_svm_linear_accuracy = svm_linear_grid.best_score_

### Support Vector Machine (RBF Kernel)

In [13]:
svm_rbf_params = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
svm_rbf_grid = GridSearchCV(SVC(kernel='rbf', random_state=0), svm_rbf_params, scoring='accuracy', cv=5)
svm_rbf_grid.fit(X_train, y_train)
best_svm_rbf_model = svm_rbf_grid.best_estimator_
best_svm_rbf_accuracy = svm_rbf_grid.best_score_

### Naive Bayes

In [14]:
nb_params = {'var_smoothing': np.logspace(0, -9, num=100)}
nb_grid = GridSearchCV(GaussianNB(), nb_params, scoring='accuracy', cv=5)
nb_grid.fit(X_train, y_train)
best_nb_model = nb_grid.best_estimator_
best_nb_accuracy = nb_grid.best_score_

### Decision Tree

In [15]:
dt_params = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=0), dt_params, scoring='accuracy', cv=5)
dt_grid.fit(X_train, y_train)
best_dt_model = dt_grid.best_estimator_
best_dt_accuracy = dt_grid.best_score_

### Random Forest

In [16]:
rf_params = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30]
}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=0), rf_params, scoring='accuracy', cv=5)
rf_grid.fit(X_train, y_train)
best_rf_model = rf_grid.best_estimator_
best_rf_accuracy = rf_grid.best_score_

## Model Evaluation

### Predictions

In [17]:
logistic_y_pred = best_logistic_model.predict(X_test)
knn_y_pred = best_knn_model.predict(X_test)
svm_linear_y_pred = best_svm_linear_model.predict(X_test)
svm_rbf_y_pred = best_svm_rbf_model.predict(X_test)
nb_y_pred = best_nb_model.predict(X_test)
dt_y_pred = best_dt_model.predict(X_test)
rf_y_pred = best_rf_model.predict(X_test)

### Accuracy and Confusion Matrices

In [18]:
logistic_test_accuracy = accuracy_score(y_test, logistic_y_pred)
knn_test_accuracy = accuracy_score(y_test, knn_y_pred)
svm_linear_test_accuracy = accuracy_score(y_test, svm_linear_y_pred)
svm_rbf_test_accuracy = accuracy_score(y_test, svm_rbf_y_pred)
nb_test_accuracy = accuracy_score(y_test, nb_y_pred)
dt_test_accuracy = accuracy_score(y_test, dt_y_pred)
rf_test_accuracy = accuracy_score(y_test, rf_y_pred)

print("Logistic Regression Accuracy (Test):", logistic_test_accuracy)
print("KNN Accuracy (Test):", knn_test_accuracy)
print("SVM (Linear Kernel) Accuracy (Test):", svm_linear_test_accuracy)
print("SVM (RBF Kernel) Accuracy (Test):", svm_rbf_test_accuracy)
print("Naive Bayes Accuracy (Test):", nb_test_accuracy)
print("Decision Tree Accuracy (Test):", dt_test_accuracy)
print("Random Forest Accuracy (Test):", rf_test_accuracy)

Logistic Regression Accuracy (Test): 0.9019607843137255
KNN Accuracy (Test): 0.7647058823529411
SVM (Linear Kernel) Accuracy (Test): 0.8627450980392157
SVM (RBF Kernel) Accuracy (Test): 0.8823529411764706
Naive Bayes Accuracy (Test): 0.8823529411764706
Decision Tree Accuracy (Test): 0.7843137254901961
Random Forest Accuracy (Test): 0.9607843137254902


## Comparing Models

In [19]:
models = pd.DataFrame(
    {
        'Classifier': ['Logistic Regression', 'KNN', 'SVM', 'Kernel SVM', 'Naive Bayes', 'Decision Tree', 'Random Forest'],
        'Accuracy': [logistic_test_accuracy, knn_test_accuracy, svm_linear_test_accuracy, svm_rbf_test_accuracy, nb_test_accuracy, dt_test_accuracy, rf_test_accuracy]
    }
)

models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Classifier,Accuracy
6,Random Forest,0.960784
0,Logistic Regression,0.901961
3,Kernel SVM,0.882353
4,Naive Bayes,0.882353
2,SVM,0.862745
5,Decision Tree,0.784314
1,KNN,0.764706


# Recommendations

Based on the accuracies obtained, the following models are recommended for deployment:

1. **Best Model: Random Forest** with an accuracy of **96.08%**

   **Recommendation:** Random Forest demonstrates the highest accuracy and is highly reliable. It is the best choice for deployment, especially for datasets with complex patterns or when robustness and generalization are required.

2. **Second Best Model: Logistic Regression** with an accuracy of **90.20%**

   **Recommendation:** Logistic Regression offers a strong balance of simplicity and performance. It is well-suited for problems with linear relationships and provides excellent interpretability.

3. **Third Best Model: Kernel SVM** with an accuracy of **88.24%**

   **Recommendation:** Kernel SVM performs well and can capture non-linear relationships effectively. It is a strong contender for deployment when computational resources allow for more complex modeling.

## Additional Considerations:

* **Naive Bayes (88.24%)**: Naive Bayes performs competitively and is very fast to train. It is a good option for scenarios where feature independence assumptions are reasonable and quick results are needed.

* **SVM (86.27%)**: SVM delivers good accuracy and is especially useful for datasets with clear margins between classes. It can be considered if kernel-based methods are not computationally feasible.

* **Decision Tree (78.43%)**: Decision Trees are interpretable and useful for quick insights but tend to overfit. They are better suited for exploratory analysis or as part of an ensemble like Random Forest.

* **K-Nearest Neighbors (76.47%)**: KNN has the lowest accuracy among the models and is not recommended for deployment in its current form. It is computationally intensive and less effective for complex datasets.

## Save .pkl Files

In [20]:
best_model = best_rf_model 

# Save the model
with open('diabetes_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)