<a href="https://colab.research.google.com/github/jk74u/FAIDM-Diabetes-Project/blob/main/Mainlogic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

# Section 1: Install additional libraries and import all packages

# Install libraries for SMOTE & Visualisation
!pip install imbalanced-learn yellowbrick -q

# Data Manipulation
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Clustering
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA

# Evaluation Metrics
from sklearn.metrics import (
    confusion_matrix, classification_report, accuracy_score,
    f1_score, silhouette_score
)

# Class Imbalance
from imblearn.over_sampling import SMOTE

# Visualisation Helpers
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Notebook-wide settings
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from
# scikit-learn; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings(action='ignore')

# White-grid seaborn theme for all subsequent plots
sns.set_style(style='whitegrid')

print("All libraries imported successfully!")

All libraries imported successfully!


In [3]:
# Section 2: Loading the dataset
import os

# Name of the CSV expected in the current working directory
DATA_FILE = 'CDC_Diabetes_Dataset.csv'

# Only prompt for an upload when the CSV is not already on disk, so the cell
# survives Restart & Run All without manual interaction and can also run
# outside Colab once the file is present.
# NOTE(review): re-uploading a file that already exists is believed to make
# Colab store it under a suffixed name, in which case read_csv below would
# keep reading the older copy - confirm against the Colab docs.
if os.path.exists(DATA_FILE):
    uploaded = {}  # nothing uploaded this run; name kept for later cells
else:
    # Colab-only helper, imported lazily so it is needed only for the upload
    from google.colab import files
    uploaded = files.upload()
    if not os.path.exists(DATA_FILE):
        raise FileNotFoundError(
            f"Expected '{DATA_FILE}' after upload, but received: {sorted(uploaded)}"
        )

# Read the CSV into a pandas DataFrame
df = pd.read_csv(DATA_FILE)
print(" Loading successful")
print(" Shape:" , df.shape)

Saving CDC_Diabetes_Dataset.csv to CDC_Diabetes_Dataset.csv
 Loading successful
 Shape: (253680, 22)


In [4]:
# Sanity check after loading: preview the leading records of the frame
print("First 5 rows:")
df.head(n=5)

First 5 rows:


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [5]:
# Data types and general structure of the frame
print("Shape:" , df.shape)
# Print the per-column dtypes once (the original cell repeated this line,
# so the same listing appeared twice in the output)
print("\nData Types:\n" , df.dtypes)
# Concise summary of the frame, written directly to stdout by pandas
df.info()

Shape: (253680, 22)

Data Types:
 Diabetes_012            float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

Data Types:
 Diabetes_012            float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDisea

In [6]:
# Data-quality audit, part 1: null counts per column and overall
nulls_per_column = df.isnull().sum()
print("Missing Values:")
print(nulls_per_column)
print("\nTotal missing values:")
print(nulls_per_column.sum())

# Data-quality audit, part 2: count fully repeated rows and report the
# shape before removing them
duplicate_row_count = df.duplicated().sum()
print("\nDuplicate Rows:")
print(duplicate_row_count)
print("Dataset shape with duplicates")
print(df.shape)

# Drop repeated rows with the pandas defaults, then report the reduced shape.
# Note that `df` is rebound here, so re-running this cell reports 0 duplicates.
df = df.drop_duplicates()
print("Dataset shape without duplicates")
print(df.shape)


Missing Values:
Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

Total missing values:
0

Duplicate Rows:
23899
Dataset shape with duplicates
(253680, 22)
Dataset shape without duplicates
(229781, 22)


In [7]:
# Descriptive statistics for every column, rounded to two decimal places
# for readability
print("Summary statistics:")
df.describe().round(decimals=2)


Summary statistics:


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,...,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0,229781.0
mean,0.33,0.45,0.44,0.96,28.69,0.47,0.04,0.1,0.73,0.61,...,0.95,0.09,2.6,3.51,4.68,0.19,0.44,8.09,4.98,5.89
std,0.72,0.5,0.5,0.2,6.79,0.5,0.21,0.3,0.44,0.49,...,0.23,0.29,1.06,7.71,9.05,0.39,0.5,3.09,0.99,2.09
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,8.0,5.0,6.0
75%,0.0,1.0,1.0,1.0,32.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,4.0,0.0,1.0,10.0,6.0,8.0
max,2.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0
