# 1. Data Cleaning and Preprocessing

In [402]:
import pandas as pd

### 1.1 Load Dataset

In [441]:
# Load the raw (uncleaned) diabetes-risk CSV into a DataFrame.
data = pd.read_csv('datasets/diabetesrisk (unclean dataset).csv')

In [443]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,Id,HouseholdMemberId,BmiRiskPoints,IsPhysicallyActive,EatsVegetablesEveryDay,TakingHighBloodPressureMedication,HasHighBloodGlucose,FamilyWithDiabetesRiskPoints,Status,DateCreated,LastModified,ModifiedById,WaistCircumferenceMenRiskPoints,WaistCircumferenceWomenRiskPoints,AgeRiskPoints,Birthday
0,1,54,0,0,1,1,1,3,1,12/24/2024,12/30/2024,67e552c1-19ea-4a81-bc91-009d1035307d,0.0,0.0,2,01/01/1974
1,2,55,3,0,0,1,1,5,1,12/24/2024,12/29/2024,67e552c1-19ea-4a81-bc91-009d1035307d,0.0,4.0,0,01/31/1989
2,3,54,1,0,1,0,0,3,1,12/26/2024,01/01/2025,67e552c1-19ea-4a81-bc91-009d1035307d,3.0,0.0,2,01/01/1974
3,4,54,1,0,1,0,0,3,1,12/26/2024,01/18/2025,67e552c1-19ea-4a81-bc91-009d1035307d,4.0,0.0,2,01/01/1974
4,5,54,1,0,0,1,1,3,1,12/26/2024,12/26/2024,67e552c1-19ea-4a81-bc91-009d1035307d,3.0,0.0,2,01/01/1974


### 1.2 Preprocess and Clean Data

In [446]:
from datetime import date, timedelta, datetime

def get_age(birth_date, today=None):
    """Return a person's age in completed calendar years.

    Parameters
    ----------
    birth_date : str
        Date of birth formatted as ``MM/DD/YYYY`` (e.g. ``"01/31/1989"``).
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to
        ``date.today()``; pass a fixed date to make results reproducible.

    Returns
    -------
    int
        Number of full years between ``birth_date`` and ``today``.

    Raises
    ------
    ValueError
        If ``birth_date`` does not match the ``MM/DD/YYYY`` format.
    """
    born = datetime.strptime(birth_date, "%m/%d/%Y").date()
    if today is None:
        today = date.today()
    # Compare (month, day) instead of dividing a day count by 365.2425:
    # the division under-reports the age on the birthday itself
    # (e.g. 365 days // 365.2425 == 0 on a first birthday).
    birthday_passed = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if birthday_passed else 1)

In [448]:
# Derive each person's age from the Birthday strings.
birthdays = data['Birthday']
data['Age'] = birthdays.apply(get_age)

In [450]:
def get_age_points(age):
    """Return the risk points for an age given in years.

    Brackets: under 45 -> 0, 45-54 -> 2, 55-64 -> 3, over 64 -> 4.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    int
        Risk points for the bracket containing ``age``.
    """
    # Each test is an upper bound, so consecutive brackets cannot leave a
    # gap (previously ages 54 and 64 fell through to the 4-point branch).
    if age < 45:
        return 0
    if age < 55:
        return 2
    if age <= 64:
        return 3
    return 4

#### 1.2.1 Map binary answers to their corresponding risk points

In [453]:
# Point value assigned to each 0/1 answer, per column.
# NOTE(review): IsPhysicallyActive == 1 is scored as 2 points here; confirm
# this direction against the scoring sheet being implemented.
is_active = {0: 0, 1: 2}
taking_blood_pressure_med = {0: 0, 1: 2}
has_high_blood_glucose = {0: 0, 1: 5}

# Replace each 0/1 answer with its point value, one column at a time.
for column_name, point_map in (
    ('IsPhysicallyActive', is_active),
    ('TakingHighBloodPressureMedication', taking_blood_pressure_med),
    ('HasHighBloodGlucose', has_high_blood_glucose),
):
    data[column_name] = data[column_name].map(point_map)

# Recompute the age points from the derived Age column.
data['AgeRiskPoints'] = data['Age'].map(get_age_points)

#### 1.2.2 Drop unnecessary columns

In [456]:
# Remove the identifier/audit columns and the Birthday/Age helpers so that
# only the risk-point columns remain.
columns_to_drop = [
    'Id',
    'HouseholdMemberId',
    'Status',
    'DateCreated',
    'LastModified',
    'ModifiedById',
    'Birthday',
    'Age',
]
data = data.drop(columns_to_drop, axis=1)

#### 1.2.3 Check for null values and replace them with 0 (a 0 is still meaningful: it adds no risk points)

In [459]:
# Count the missing values in each column.
data.isnull().sum()

BmiRiskPoints                          0
IsPhysicallyActive                     0
EatsVegetablesEveryDay                 0
TakingHighBloodPressureMedication      0
HasHighBloodGlucose                    0
FamilyWithDiabetesRiskPoints           0
WaistCircumferenceMenRiskPoints      212
WaistCircumferenceWomenRiskPoints    148
AgeRiskPoints                          0
dtype: int64

In [461]:
# Fill the missing waist-circumference points with 0 in both columns.
for waist_column in (
    'WaistCircumferenceMenRiskPoints',
    'WaistCircumferenceWomenRiskPoints',
):
    data[waist_column] = data[waist_column].fillna(0)

In [463]:
# Re-count the missing values per column to verify none remain.
data.isnull().sum()

BmiRiskPoints                        0
IsPhysicallyActive                   0
EatsVegetablesEveryDay               0
TakingHighBloodPressureMedication    0
HasHighBloodGlucose                  0
FamilyWithDiabetesRiskPoints         0
WaistCircumferenceMenRiskPoints      0
WaistCircumferenceWomenRiskPoints    0
AgeRiskPoints                        0
dtype: int64

### 1.3 Add Necessary Column
Calculation of risk score and risk category

In [466]:
def get_risk_score(row, columns):
    """Add up the values of the selected columns for a single row.

    Parameters
    ----------
    row : pandas.Series
        One record, indexed by column name.
    columns : list-like
        Labels of the entries to include in the total.

    Returns
    -------
    The sum of ``row`` over ``columns``.
    """
    selected_points = row[columns]
    total = selected_points.sum()
    return total

In [468]:
def categorize_risk(score):
    """Translate a total risk score into its risk category label.

    Categories: below 7 -> "low", 7-11 -> "slightly elevated",
    12-14 -> "moderate", 15-20 -> "high", above 20 -> "very high".

    Parameters
    ----------
    score : int or float
        Total risk score.

    Returns
    -------
    str
        One of "low", "slightly elevated", "moderate", "high", "very high".
    """
    # Upper-bound checks keep the bands contiguous: a non-integer score
    # such as 11.5 lands in the band below the next threshold rather than
    # slipping between two closed ranges into "very high".
    if score < 7:
        return "low"
    if score < 12:
        return "slightly elevated"
    if score < 15:
        return "moderate"
    if score <= 20:
        return "high"
    return "very high"

In [470]:
# Total the risk points per row over the point columns only. Excluding
# 'RiskCategory' keeps this cell re-runnable: once the label column exists,
# summing every column would mix its strings with the numeric points.
risk_point_columns = [column for column in data.columns if column != 'RiskCategory']
# Vectorized row-wise sum; same totals as applying get_risk_score per row.
risk_score = data[risk_point_columns].sum(axis=1)
data['RiskCategory'] = risk_score.map(categorize_risk)

### 1.4 Preview Cleaned Data

In [473]:
# Preview the first rows of the cleaned data, including RiskCategory.
data.head()

Unnamed: 0,BmiRiskPoints,IsPhysicallyActive,EatsVegetablesEveryDay,TakingHighBloodPressureMedication,HasHighBloodGlucose,FamilyWithDiabetesRiskPoints,WaistCircumferenceMenRiskPoints,WaistCircumferenceWomenRiskPoints,AgeRiskPoints,RiskCategory
0,0,0,1,2,5,3,0.0,0.0,2,moderate
1,3,0,0,2,5,5,0.0,4.0,0,high
2,1,0,1,0,0,3,3.0,0.0,2,slightly elevated
3,1,0,1,0,0,3,4.0,0.0,2,slightly elevated
4,1,0,0,2,5,3,3.0,0.0,2,high


In [439]:
# Save the cleaned data as CSV, leaving out the DataFrame index.
data.to_csv('datasets/diabetes_risk.csv', index=False)

# 2. Model Training

### 2.1 Feature and Target Selection

In [None]:
# Target: the risk label. Features: every remaining column.
# Note: RiskCategory was computed from the sum of these same columns, so
# the models below are fitted to reproduce that scoring rule.
y = data['RiskCategory']
X = data.drop(columns=['RiskCategory'])

### 2.2 Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### 2.3 Standardizing Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 2.4 Logistic Regression Building and Training

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularized logistic regression with a fixed seed.
# `multi_class` is intentionally not passed: with the lbfgs solver
# scikit-learn already fits a multinomial model for multi-class targets,
# and the explicit argument is deprecated in recent releases.
lr_model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    random_state=42,
)

lr_model.fit(X_train_scaled, y_train)

### 2.5 Decision Tree Building and Training

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameters for the decision tree: Gini splits, depth capped at 7,
# fixed seed.
dt_params = dict(
    criterion='gini',
    max_depth=7,
    min_samples_leaf=1,
    random_state=42,
)
dt_model = DecisionTreeClassifier(**dt_params)

dt_model.fit(X_train_scaled, y_train)

### 2.6 Random Forest Building and Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the random forest: Gini splits, bootstrap sampling,
# depth capped at 30, fixed seed.
rf_params = dict(
    criterion='gini',
    bootstrap=True,
    max_depth=30,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train_scaled, y_train)