In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_predict
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
print("Step 1: Data Pre-processing")
df = pd.read_csv("diabetes project.csv") 
df.info()

In [None]:
#check the information of the data
#remove outlier
#impute missing value with median
#check the number of missing value 
#normalize the data with zscore

In [32]:
print(df.head(5))
print(df.isnull().sum())
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df_no_outlier = (df >= lower) & (df <= upper)
outlier_count = (~df_no_outlier).sum().sum()
df[~df_no_outlier] = np.nan
# Impute missing values
median_values = df.median()
df = df.fillna(median_values)
print("median is:")
print(median_values)
print(df_no_outlier.isnull().sum())

#normalize data
scaler = StandardScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns
)
print(df_normalized)

   Pregnancies  Glucose  BloodPressure  SkinThickness   BMI  \
0            6    148.0             72           35.0  33.6   
1            1     85.0             66           29.0  26.6   
2            8    117.0             64          -35.0  23.3   
3            1     89.0             66           23.0  28.1   
4            0    137.0             40           35.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     0.355   33  
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64
median is:
Pregnancies                   3.000
Glucose                     117.000
BloodPressure                72.000
SkinThickness                29.000
BMI                          32.2

In [None]:
#Use K-means clustering on three features of Glucose, BMI and Age to cluster data into two clusters.
#Assign ‘Diabetes’ name to the cluster with higher average Glucose and ‘No Diabetes’ to the other cluster.
#Add a new column (Outcome) to the dataset containing 1 for ‘Diabetes’ and 0 for ‘No Diabetes’. Use these values as labels for classification (step 4).
 

In [33]:
# Step 2: Unsupervised learning for generating labels
cluster_df = df_normalized[['Glucose', 'BMI', 'Age']].copy()
kmeans = KMeans(n_clusters=2, random_state=42)
labels = kmeans.fit_predict(cluster_df)
cluster_df['Cluster_label'] = labels
print(cluster_df.head(10))
# Assign diabetes to cluster with higher glucose
cluster_glu_means = cluster_df.groupby(labels)["Glucose"].mean()
diabetes_cluster = cluster_glu_means.idxmax()
df_normalized["Outcome"] = (labels == diabetes_cluster).astype(int)
print(f"Labels generated: 0={np.sum(df_normalized['Outcome']==0)}, 1={np.sum(df_normalized['Outcome']==1)}")
print(cluster_glu_means)

    Glucose       BMI       Age  Cluster_label
0  1.143472  0.212745  1.428456              0
1 -1.337355 -0.878140 -0.198076              1
2 -0.077252 -1.392414 -0.112469              1
3 -1.179842 -0.644379 -1.054145              1
4  0.710312  1.693231 -0.026862              0
5 -0.077252 -1.033981 -0.283682              1
6 -0.077252 -0.192441 -0.626110              1
7 -0.077252 -0.270361  1.685277              0
8  0.237773  0.002360  1.770884              0
9 -0.352900  0.836108 -0.283682              1
Labels generated: 0=444, 1=289
0    0.721758
1   -0.469793
Name: Glucose, dtype: float64


In [34]:
print("Step 2: Unsupervised Learning for Label Generation")


# K-means clustering on Glucose, BMI, and Age
cluster_df = df_normalized[['Glucose', 'BMI', 'Age']]
kmeans = KMeans(n_clusters=2, random_state=42)
labels = kmeans.fit_predict(cluster_df)

# Assign diabetes to cluster with higher glucose
cluster_glu_means = cluster_df.groupby(labels)["Glucose"].mean()
diabetes_cluster = cluster_glu_means.idxmax()
df_normalized["Outcome"] = (labels == diabetes_cluster).astype(int)

print(f"Labels generated: 0={np.sum(df_normalized['Outcome']==0)}, 1={np.sum(df_normalized['Outcome']==1)}")

Step 2: Unsupervised Learning for Label Generation
Labels generated: 0=444, 1=289


In [None]:
#Split data into test and training sets (consider 20% for test).
Use PCA on the training data to create 3 new #components from existing features (all columns except outcome).
Transfer training and test data to the new dimensions (PCs).


In [35]:
print("Step 3: Feature Extraction")

# Split data into training and test sets
X = df_normalized.drop('Outcome', axis=1)
y = df_normalized['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Apply PCA to create 3 principal components
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"\nPCA training data shape: {X_train_pca.shape}")
print(f"PCA test data shape: {X_test_pca.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.4f}")

Step 3: Feature Extraction
Training set size: (586, 7)
Test set size: (147, 7)

PCA training data shape: (586, 3)
PCA test data shape: (147, 3)
Explained variance ratio: [0.30467407 0.19959433 0.13742567]
Total variance explained: 0.6417


In [None]:
# 3D PCA Visualization
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Blue: No diabetes
ax.scatter(X_train_pca[y_train==0, 0], 
           X_train_pca[y_train==0, 1], 
           X_train_pca[y_train==0, 2],
           c='blue', label='No Diabetes (0)', alpha=0.6)

# Red: Diabetes
ax.scatter(X_train_pca[y_train==1, 0], 
           X_train_pca[y_train==1, 1], 
           X_train_pca[y_train==1, 2],
           c='red', label='Diabetes (1)', alpha=0.6)

# Labels and title
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
ax.set_title('3D PCA Visualization of Training Data')
ax.legend()
plt.tight_layout()
plt.show()

In [37]:
#Define three classification models as base classifiers consisting of Naïve Bayes, Neural Network, and KNN.
#Define a decision tree as the meta learner.
#Train decision tree (meta learner) on outputs of three base classifiers using 5-fold cross validation.
#Find hyperparameters for all these models which provide the best accuracy rate.
#Report accuracy of the model on the test data.
#base learner:knn,nb mlp stacking 
#then meta learner: 

In [None]:

print("Step 4: Classification using a Super Learner")

# Define base classifiers
nb = GaussianNB()
knn = KNeighborsClassifier()
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

print("Generating meta-features using 5-fold cross-validation...")

# Generate meta-features using cross-validation
p1_train = cross_val_predict(nb, X_train_pca, y_train, cv=5)
p2_train = cross_val_predict(knn, X_train_pca, y_train, cv=5)
p3_train = cross_val_predict(mlp, X_train_pca, y_train, cv=5)

# Stack predictions to create meta-features
X_meta_train = np.column_stack((p1_train, p2_train, p3_train))

print(f"Meta-features shape: {X_meta_train.shape}")
print("✓ Meta-features generated!")

In [None]:
# Train base models on full training set
print("\nTraining base models on full training set...")

nb.fit(X_train_pca, y_train)
knn.fit(X_train_pca, y_train)
mlp.fit(X_train_pca, y_train)

print("✓ Base models trained!")

# Generate predictions on test set
print("\nGenerating predictions on test set...")
p1_test = nb.predict(X_test_pca)
p2_test = knn.predict(X_test_pca)
p3_test = mlp.predict(X_test_pca)

# Create meta-features for test set
X_meta_test = np.column_stack((p1_test, p2_test, p3_test))

print(f"Test meta-features shape: {X_meta_test.shape}")


In [None]:
# Tune meta-learner (Decision Tree)
print("Tuning Meta-Learner (Decision Tree)")

dt = DecisionTreeClassifier(random_state=42)

# Define parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10]
}

print(f"Testing {len(param_grid['criterion']) * len(param_grid['max_depth']) * len(param_grid['min_samples_split'])} parameter combinations...")
print("This will perform 5-fold cross-validation for each combination...\n")

# Grid search with cross-validation
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_meta_train, y_train)

# Best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_meta_test)

print("\n" + "=" * 70)
print("Best Parameters Found:")
print("=" * 70)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
print("\n✓ Meta-learner tuned!")

In [23]:
print("FINAL RESULTS")


# Super learner accuracy
super_learner_acc = accuracy_score(y_test, y_pred)
print(f"\nSuper Learner Test Accuracy: {super_learner_acc:.4f}")

# Individual model accuracies
print("\n" + "-" * 70)
print("Individual Base Model Performance:")
print("-" * 70)
nb_acc = accuracy_score(y_test, p1_test)
knn_acc = accuracy_score(y_test, p2_test)
mlp_acc = accuracy_score(y_test, p3_test)

print(f"  Naive Bayes:   {nb_acc:.4f}")
print(f"  KNN:           {knn_acc:.4f}")
print(f"  MLP:           {mlp_acc:.4f}")
print(f"  Super Learner: {super_learner_acc:.4f}")

# Improvement
best_base = max(nb_acc, knn_acc, mlp_acc)
improvement = super_learner_acc - best_base
print(f"\nImprovement over best base model: {improvement:.4f} ({improvement*100:.2f}%)")

# Classification report
print("\n" + "=" * 70)
print("Detailed Classification Report")
print("=" * 70)
print(classification_report(y_test, y_pred, 
                          target_names=['No Diabetes (0)', 'Diabetes (1)']))


FINAL RESULTS


NameError: name 'y_test' is not defined