In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Read the diabetes dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Preview the first seven rows of the freshly loaded frame.
df.head(n=7)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1


**Create BMI_category column**

In [4]:
# Define a function to categorize BMI
def bmi_category(bmi):
    """Map a BMI value to a weight-status label.

    The bands are half-open so that every value belongs to exactly one of
    them and the one-decimal boundary values land where the standard
    classification puts them:

        bmi < 18.5        -> 'Underweight'
        18.5 <= bmi < 25  -> 'Normal'      (24.9 is still Normal)
        25 <= bmi < 30    -> 'Overweight'  (29.9 is still Overweight)
        otherwise         -> 'Obese'

    Parameters
    ----------
    bmi : float or int
        Body-mass index value.

    Returns
    -------
    str
        One of 'Underweight', 'Normal', 'Overweight', 'Obese'.

    Notes
    -----
    Any value for which all three comparisons are false falls through to
    'Obese'; that includes NaN. A BMI of 0 is labelled 'Underweight' --
    presumably 0 is a missing-value placeholder in this dataset; TODO confirm
    and decide whether such rows should be handled before categorising.
    """
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        # Upper bound is 25 (not 24.9) so that 24.9 itself counts as Normal.
        return 'Normal'
    elif bmi < 30:
        # Upper bound is 30 (not 29.9) so that 29.9 itself counts as Overweight.
        return 'Overweight'
    else:
        return 'Obese'

# Label every row's BMI and store the result as a new BMI_category column
df['BMI_category'] = df['BMI'].map(bmi_category)
print(df.loc[:, ['BMI', 'BMI_category']].head())  # BMI next to its derived category


    BMI BMI_category
0  33.6        Obese
1  26.6   Overweight
2  23.3       Normal
3  28.1   Overweight
4  43.1        Obese


In [5]:
# Preview the first seven rows again, now including BMI_category.
df.head(n=7)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category
0,6,148,72,35,0,33.6,0.627,50,1,Obese
1,1,85,66,29,0,26.6,0.351,31,0,Overweight
2,8,183,64,0,0,23.3,0.672,32,1,Normal
3,1,89,66,23,94,28.1,0.167,21,0,Overweight
4,0,137,40,35,168,43.1,2.288,33,1,Obese
5,5,116,74,0,0,25.6,0.201,30,0,Overweight
6,3,78,50,32,88,31.0,0.248,26,1,Obese


**Split the data into train and validation sets**

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) of each partition
print(f"Train shape: {train_df.shape}, Validation shape: {val_df.shape}")


Train shape: (614, 10), Validation shape: (154, 10)


**Apply StandardScaler to numeric features**

In [7]:
from sklearn.preprocessing import StandardScaler

# Identify numeric features
# NOTE(review): 'Pregnancies' and 'DiabetesPedigreeFunction' are numeric columns
# in the data but are not listed here -- confirm the omission is intentional.
numeric_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age']

# Initialize the StandardScaler
scaler = StandardScaler()

# Always start from the unscaled values held in `df`, selected by each split's
# row labels. The scaled values are written back into train_df / val_df below,
# so reading the input from those frames would make this cell non-idempotent:
# a second run would refit the scaler on already-standardised data.
# Assumes `df` still holds the original values and has unique row labels.
train_numeric_raw = df.loc[train_df.index, numeric_features]
val_numeric_raw = df.loc[val_df.index, numeric_features]

# Fit on train data only, then transform it
train_df[numeric_features] = scaler.fit_transform(train_numeric_raw)

# Only transform the validation data (no refit, so no leakage from validation rows)
val_df[numeric_features] = scaler.transform(val_numeric_raw)

# Display scaled features
print(train_df[numeric_features].head())


      Glucose  BloodPressure  SkinThickness   Insulin       BMI       Age
60  -1.151398      -3.752683      -1.322774 -0.701206 -4.135256 -1.035940
618 -0.276643       0.680345       0.233505 -0.701206 -0.489169  1.487101
346  0.566871      -1.265862      -0.090720  0.013448 -0.424522 -0.948939
294  1.254179      -1.049617      -1.322774 -0.701206 -1.303720  2.792122
231  0.410665       0.572222       1.076490  2.484601  1.838121  1.139095


**Apply One-hot Encoding to categorical features**

In [9]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical features
categorical_features = ['BMI_category']

# sparse_output=False returns a dense array that drops straight into a DataFrame.
# handle_unknown='ignore' encodes a category that was absent from the training
# split as an all-zero row instead of raising during validation or inference.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on train data and transform
train_encoded = encoder.fit_transform(train_df[categorical_features])

# Transform validation data with the categories learned from train
val_encoded = encoder.transform(val_df[categorical_features])

# Convert to DataFrames that share one set of generated column names
encoded_columns = encoder.get_feature_names_out(categorical_features)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns)
val_encoded_df = pd.DataFrame(val_encoded, columns=encoded_columns)

# Concatenate encoded categorical features with the numeric features.
# The numeric part is re-indexed to 0..n-1 so it lines up row-for-row with the
# encoded frames, which are created with a default 0..n-1 index.
X_train = pd.concat([train_df[numeric_features].reset_index(drop=True), train_encoded_df], axis=1)
X_val = pd.concat([val_df[numeric_features].reset_index(drop=True), val_encoded_df], axis=1)

# Target variable (keeps the original row labels of train_df / val_df,
# unlike X_train / X_val, which are re-indexed above)
y_train = train_df['Outcome']
y_val = val_df['Outcome']

# Display the final train feature set
print(X_train.head())


    Glucose  BloodPressure  SkinThickness   Insulin       BMI       Age  \
0 -1.151398      -3.752683      -1.322774 -0.701206 -4.135256 -1.035940   
1 -0.276643       0.680345       0.233505 -0.701206 -0.489169  1.487101   
2  0.566871      -1.265862      -0.090720  0.013448 -0.424522 -0.948939   
3  1.254179      -1.049617      -1.322774 -0.701206 -1.303720  2.792122   
4  0.410665       0.572222       1.076490  2.484601  1.838121  1.139095   

   BMI_category_Normal  BMI_category_Obese  BMI_category_Overweight  \
0                  0.0                 0.0                      0.0   
1                  0.0                 0.0                      1.0   
2                  0.0                 0.0                      1.0   
3                  1.0                 0.0                      0.0   
4                  0.0                 1.0                      0.0   

   BMI_category_Underweight  
0                       1.0  
1                       0.0  
2                       0.0  
3 

**Build a KNN classifier**

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Try several neighbourhood sizes and score each one on the validation set
best_knn_f1 = 0
best_k = 0
for k in (3, 5, 7):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    preds = knn.predict(X_val)
    f1 = f1_score(y_val, preds)
    print(f"k={k}, F1 Score: {f1}")
    # Only a strictly higher score replaces the current best, so ties keep the smaller k
    if f1 > best_knn_f1:
        best_k = k
        best_knn_f1 = f1

# Note: after the loop `knn` is the model for the last k tried, not necessarily best_k
print(f"Best K: {best_k}, Best F1 Score: {best_knn_f1}")


k=3, F1 Score: 0.5454545454545454
k=5, F1 Score: 0.6037735849056604
k=7, F1 Score: 0.6605504587155964
Best K: 7, Best F1 Score: 0.6605504587155964


**Build a Decision Tree classifier**

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Try several tree depths and score each one on the validation set
best_dt_f1 = 0
best_depth = 0
for depth in (3, 5, 7):
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    preds = dt.predict(X_val)
    f1 = f1_score(y_val, preds)
    print(f"max_depth={depth}, F1 Score: {f1}")
    # Only a strictly higher score replaces the current best, so ties keep the shallower tree
    if f1 > best_dt_f1:
        best_depth = depth
        best_dt_f1 = f1

# Note: after the loop `dt` is the tree for the last depth tried, not necessarily best_depth
print(f"Best max_depth: {best_depth}, Best F1 Score: {best_dt_f1}")


max_depth=3, F1 Score: 0.6476190476190476
max_depth=5, F1 Score: 0.6379310344827587
max_depth=7, F1 Score: 0.5535714285714286
Best max_depth: 3, Best F1 Score: 0.6476190476190476


**Build the inference pipeline**

In [12]:
import joblib

# Save the scaler, encoder, and best model
# `knn` and `dt` are whatever the tuning loops fitted last (k=7 / max_depth=7),
# which is not necessarily the configuration that achieved best_knn_f1 /
# best_dt_f1. Rebuild the winner from its tuned hyperparameter and refit it on
# the training data so the pickled model matches the reported best score.
if best_knn_f1 > best_dt_f1:
    best_model = KNeighborsClassifier(n_neighbors=best_k)
else:
    # Same random_state as the tuning loop so the refit reproduces the scored tree
    best_model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
best_model.fit(X_train, y_train)

joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(best_model, 'best_model.pkl')

# Inference function
def inference(sample):
    """Predict the class for one or more rows using the saved artifacts.

    Loads 'scaler.pkl', 'encoder.pkl' and 'best_model.pkl' from the working
    directory on every call, applies the scaler to the numeric columns and the
    encoder to the categorical columns, and feeds the combined features to the
    model.

    Parameters
    ----------
    sample : pandas.DataFrame
        Rows containing every column named in the notebook-level
        `numeric_features` and `categorical_features` lists. The numeric
        values must be unscaled, because the scaler is applied here.

    Returns
    -------
    The result of `model.predict` for the given rows.
    """
    # Load the saved components
    scaler = joblib.load('scaler.pkl')
    encoder = joblib.load('encoder.pkl')
    model = joblib.load('best_model.pkl')

    # Scale numeric features, keeping their column names
    sample_numeric = pd.DataFrame(
        scaler.transform(sample[numeric_features]),
        columns=numeric_features,
    )

    # Encode categorical features, named the same way as during training
    sample_categorical = pd.DataFrame(
        encoder.transform(sample[categorical_features]),
        columns=encoder.get_feature_names_out(categorical_features),
    )

    # Combine scaled and encoded features. Both frames carry a default 0..n-1
    # index, so they align row-for-row, and the resulting columns match the
    # names and order of the training matrix instead of duplicated integers.
    sample_transformed = pd.concat([sample_numeric, sample_categorical], axis=1)

    # Predict the class
    prediction = model.predict(sample_transformed)
    return prediction

# Demonstrate inference with 5 validation samples
# val_df's numeric columns were standardised in place earlier, and inference()
# applies the scaler itself, so passing val_df rows would scale them twice.
# Take the same rows from `df` instead (assumes `df` still holds the unscaled
# values and has unique row labels).
raw_val_df = df.loc[val_df.index]
for i in range(5):
    sample = raw_val_df.iloc[[i]]
    print(f"Sample {i + 1} Prediction: {inference(sample)}")


Sample 1 Prediction: [0]
Sample 2 Prediction: [0]
Sample 3 Prediction: [0]
Sample 4 Prediction: [0]
Sample 5 Prediction: [0]


