Project Name:Health Risk Classification


Roll No = 23-Ai-48

In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
data = pd.read_csv("diabetes.csv")
# Display first few rows
data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Check missing values
(data.isnull().sum())

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [5]:
#  Data Preprocessing

# Some features (like Insulin, SkinThickness, BMI) can have 0s instead of missing values
# Replace 0s with NaN in those columns
cols_to_replace = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
data[cols_to_replace] = data[cols_to_replace].replace(0, np.nan)

In [6]:
# Fill missing values with mean
data.fillna(data.mean(), inplace=True)


In [9]:
data


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.00000,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.00000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.00000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.00000,155.548223,36.8,0.340,27,0
765,5,121.0,72.0,23.00000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1


In [7]:
# Normalize / Scale numeric data
scaler = StandardScaler()
X = data.drop("Outcome", axis=1)
y = data["Outcome"]
X_scaled = scaler.fit_transform(X)

In [8]:
# Split into training & testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [10]:
# Model 1 — Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Evaluation for SVM
print(" SVM Model Evaluation ")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


 SVM Model Evaluation 
Accuracy: 0.7532467532467533
Confusion Matrix:
 [[82 17]
 [21 34]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154



In [15]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)


In [11]:
# Model 2 — Decision Tree
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Evaluation for Decision Tree
print("Decision Tree Model Evaluation ")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tree))
print("Classification Report:\n", classification_report(y_test, y_pred_tree))


Decision Tree Model Evaluation 
Accuracy: 0.7207792207792207
Confusion Matrix:
 [[77 22]
 [21 34]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



In [12]:
# Comparison Table
svm_acc = accuracy_score(y_test, y_pred_svm)
tree_acc = accuracy_score(y_test, y_pred_tree)

comparison = pd.DataFrame({
    "Model": ["Support Vector Machine", "Decision Tree"],
    "Accuracy": [svm_acc, tree_acc]
})

print(" Model Comparison:")
print(comparison)


 Model Comparison:
                    Model  Accuracy
0  Support Vector Machine  0.753247
1           Decision Tree  0.720779


In [16]:
#checking for over fitting
print("SVM Train Accuracy:", svm_model.score(X_train, y_train))
print("SVM Test Accuracy:", svm_model.score(X_test, y_test))
print("Decision Tree Train Accuracy:", dt_model.score(X_train, y_train))
print("Decision Tree Test Accuracy:", dt_model.score(X_test, y_test))


SVM Train Accuracy: 0.7736156351791531
SVM Test Accuracy: 0.7532467532467533
Decision Tree Train Accuracy: 1.0
Decision Tree Test Accuracy: 0.7207792207792207


Discussion:
The Decision Tree model achieved perfect accuracy on the training set but performed worse on the test data. This indicates overfitting, where the model memorizes training data and fails to generalize to unseen examples.
In other hands, the SVM model maintained slightly lower training accuracy but performed better on the test data, showing a better trade-off between accuracy and generalization.

Thus, SVM is the better model for this dataset, as it provides high accuracy with minimal overfitting.