In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [8]:
# Load the dataset
df = pd.read_csv("diabetes_prediction_dataset.csv")  # Ensure the correct path

# Display dataset info
print(df.head())
print(df.info())

# Check for missing values
print("Missing values:\n", df.isnull().sum())

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age               

In [9]:
#Convert categorical variables (gender, smoking_history) to numerical
label_enc = LabelEncoder()
df["gender"] = label_enc.fit_transform(df["gender"])  # Convert Male/Female to 0/1
df["smoking_history"] = label_enc.fit_transform(df["smoking_history"])  # Encode categories

# Selecting relevant features for better prediction
features = ["age", "gender", "bmi", "hypertension", "heart_disease", 
            "smoking_history", "HbA1c_level", "blood_glucose_level"]
X = df[features]  # Input features
y = df["diabetes"]  # Target variable (1 = Diabetic, 0 = Non-Diabetic)

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Feature scaling (Standardization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.95865
Confusion Matrix:
 [[18127   165]
 [  662  1046]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.86      0.61      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



In [14]:
# Ensure that LabelEncoder is fitted on the entire dataset before transforming
label_enc_gender = LabelEncoder()
df["gender"] = label_enc_gender.fit_transform(df["gender"])

label_enc_smoking = LabelEncoder()
df["smoking_history"] = label_enc_smoking.fit_transform(df["smoking_history"])

# Define a new test sample (manual input)
test_sample = pd.DataFrame({
    "age": [45],  
    "gender":[0],  # Encode 'Male' based on training data
    "bmi": [28.5],
    "hypertension": [1],  # 1 = Yes, 0 = No
    "heart_disease": [0],  # 1 = Yes, 0 = No
    "smoking_history": [0],  # Encode category
    "HbA1c_level": [6.2],
    "blood_glucose_level": [135]
})

# Apply the same scaling as training data
test_sample_scaled = scaler.transform(test_sample)

# Predict diabetes (0 = No Diabetes, 1 = Diabetes)
prediction = model.predict(test_sample_scaled)

print("Predicted Diabetes Status (0 = No, 1 = Yes):", prediction[0])


Predicted Diabetes Status (0 = No, 1 = Yes): 0
