In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import kagglehub
import os

# Fetch the latest version of the Kaggle dataset and derive the location of
# the CSV file inside whatever path kagglehub.dataset_download returns.
DATASET_HANDLE = "mohankrishnathalla/diabetes-health-indicators-dataset"
CSV_FILENAME = 'diabetes_dataset.csv'

path = kagglehub.dataset_download(DATASET_HANDLE)
csv_path = os.path.join(path, CSV_FILENAME)

# Echo both locations so the reader can see where the files ended up.
print("Path to dataset files:", path)
print("Path to csv file:", csv_path)

In [None]:
# Check the data: load the downloaded CSV into a DataFrame and preview the
# first rows. pandas is already imported as `pd` in the notebook's first
# cell, so it is not re-imported here.
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Check the features: print the DataFrame summary produced by
# DataFrame.info() (column names, non-null counts and dtypes).
df.info()


In [None]:
# Collect the columns whose dtype is object, then print each column name
# together with its distinct values.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print(name, df[name].unique())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --- Create Sample Data ---
# Ten rows: one nominal column, two ordinal columns, one numeric column and
# a three-class target. Column names and their values are kept in two
# parallel lists and zipped into the `data` dict.
column_names = ['A_Nominal', 'B_Ordinal', 'C_Ordinal', 'Numerical_Feature', 'T_Target']
column_values = [
    ['Red', 'Blue', 'Green', 'Red', 'Blue', 'Green', 'Red', 'Blue', 'Green', 'Red'],
    ['Low', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low'],
    ['Small', 'Medium', 'Large', 'Large', 'Medium', 'Small', 'Small', 'Medium', 'Large', 'Large'],
    [10.5, 22.1, 5.0, 15.3, 30.0, 8.8, 12.0, 25.5, 6.2, 18.1],
    ['Class_Y', 'Class_Z', 'Class_Y', 'Class_X', 'Class_Z', 'Class_X', 'Class_Y', 'Class_Z', 'Class_X', 'Class_Y'],
]
data = dict(zip(column_names, column_values))

# NOTE: this rebinds `df`, replacing the DataFrame that was read from
# csv_path earlier in the notebook; every later cell works on the sample data.
df = pd.DataFrame(data)

df.head()


In [None]:
# Ordinal columns: each label is mapped to its position in the listed order.
ordinal_map_B = {label: rank for rank, label in enumerate(['Low', 'Medium', 'High'])}
ordinal_map_C = {label: rank for rank, label in enumerate(['Small', 'Medium', 'Large'])}

# (source column, encoded column, label -> rank mapping)
ordinal_encodings = (
    ('B_Ordinal', 'B_Ordinal_Encoded', ordinal_map_B),
    ('C_Ordinal', 'C_Ordinal_Encoded', ordinal_map_C),
)
for source_col, encoded_col, mapping in ordinal_encodings:
    # Adds the encoded column to `df` in place; later cells read it from there.
    df[encoded_col] = df[source_col].map(mapping)

# Nominal column: one-hot encode it. The 'A' prefix lets readers know these
# columns come from A_Nominal.
df_one_hot = pd.get_dummies(df['A_Nominal'], prefix='A')
df_one_hot.head()

In [None]:
# Assemble the model inputs: the one-hot columns followed by the encoded
# ordinal columns and the numeric column, joined side by side.
encoded_and_numeric_cols = ['B_Ordinal_Encoded', 'C_Ordinal_Encoded', 'Numerical_Feature']
feature_parts = [df_one_hot, df[encoded_and_numeric_cols]]
features_df = pd.concat(feature_parts, axis='columns')
features_df.head()

In [None]:
# Encode the class labels in T_Target as integers:
# 'Class_X' -> 0, 'Class_Y' -> 1, 'Class_Z' -> 2
le = LabelEncoder().fit(df['T_Target'])
target = le.transform(df['T_Target'])

target

# why don't we use
# TODO(review): the question on the previous line is cut off; finish the sentence or remove it.

In [None]:
# Split the dataset: 70 % train / 30 % test, seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.3, random_state=42)


# Cross-validation
from sklearn.model_selection import StratifiedKFold

# StratifiedKFold raises ValueError when n_splits is larger than the number
# of members in every class (and warns when it is larger than the smallest
# class). The sample target has at most 4 rows per class, so a hard-coded
# n_splits=5 fails. Cap the fold count at the smallest class size, but never
# go below 2, the minimum StratifiedKFold accepts.
smallest_class_size = int(pd.Series(target).value_counts().min())
n_splits = max(2, min(5, smallest_class_size))

cv_strategy = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
for fold_index, (train_indices, test_indices) in enumerate(cv_strategy.split(features_df, target)):

    # Row subsets for this fold. They are rebound on every iteration, so any
    # per-fold fitting or scoring has to be done inside this loop.
    X_train_fold = features_df.iloc[train_indices]
    y_train_fold = target[train_indices]
    X_test_fold = features_df.iloc[test_indices]
    y_test_fold = target[test_indices]

In [None]:
# Transform the features: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to both the training and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train the model: a LogisticRegression with default settings, fitted on the
# scaled training features. The bare `model` line keeps the fitted estimator
# as the cell's displayed output.
model = LogisticRegression().fit(X_train_scaled, y_train)
model

In [None]:
# Predict: run the fitted model on the scaled test features and keep the
# predictions in y_pred for scoring.
y_pred = model.predict(X_test_scaled)


In [None]:
# Score the predictions against the held-out labels and print the accuracy.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)