# Diabetes Project — Phase 2

**Notebook:** Train Logistic Regression, measure training time, apply Random Projection, compare results.

**Instructions:** Set `DATA_PATH` in the setup cell to the location of your CSV dataset before running the cells. If your target column has a different name, update the `TARGET_COLUMN` variable in that same setup cell.

In [10]:
# 1) Setup: imports and constants
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection

# Dataset location and the name of the label column -- edit both to match your data.
DATA_PATH = "E:\\DSc_project\\diabetes_012_health_indicators_BRFSS2015.csv"
TARGET_COLUMN = "Diabetes_012"

print(f"Ready. Make sure {DATA_PATH} is in the notebook folder and TARGET_COLUMN is set correctly.")

# Optional: collapse the multiclass target (0, 1, 2) into a binary one, where every
# non-zero class becomes 1. This is left disabled, so the target keeps all of its
# original classes. To enable it, run the two lines below (the frame must be loaded first):
# df = pd.read_csv(DATA_PATH)
# df['Diabetes_012'] = df['Diabetes_012'].apply(lambda x: 1 if x > 0 else 0)

Ready. Make sure E:\DSc_project\diabetes_012_health_indicators_BRFSS2015.csv is in the notebook folder and TARGET_COLUMN is set correctly.


In [2]:
# 2) Load data
# Read the CSV at DATA_PATH into `df`, report its dimensions, and preview the first rows.
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (253680, 22)


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# 3) Basic preprocessing
# Builds the model inputs from `df`:
#   X_processed -- median-imputed numeric columns plus one-hot encoded categorical columns
#   y           -- the TARGET_COLUMN series
# Raises ValueError if TARGET_COLUMN is not among the (remaining) columns.
df_clean = df.copy()

# Drop columns where more than half of the rows are missing
threshold = 0.5 * len(df_clean)
cols_to_drop = [c for c in df_clean.columns if df_clean[c].isna().sum() > threshold]
if cols_to_drop:
    print("Dropping columns with many missing values:", cols_to_drop)
    df_clean = df_clean.drop(columns=cols_to_drop)

# Separate X and y
if TARGET_COLUMN not in df_clean.columns:
    raise ValueError(f"Target column '{TARGET_COLUMN}' not found in the dataset columns: {list(df_clean.columns)}")

X = df_clean.drop(columns=[TARGET_COLUMN])
y = df_clean[TARGET_COLUMN]

# Split the feature columns by dtype: numeric ones are imputed, the rest are one-hot encoded
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)

# Impute numeric columns with the column median.
# NOTE(review): the imputer is fit on all rows, i.e. before the train/test split in the
# next cell, so test rows influence the medians. Move it into the model pipeline if the
# data actually contains missing values.
if num_cols:
    num_imputer = SimpleImputer(strategy='median')
    X_num = pd.DataFrame(num_imputer.fit_transform(X[num_cols]), columns=num_cols)
else:
    # SimpleImputer cannot be fit on zero columns; keep an empty frame with one row per sample
    X_num = pd.DataFrame(index=pd.RangeIndex(len(X)))

# Encode categorical (one-hot); values are cast to str first, so NaN becomes its own "nan" level
if cat_cols:
    X_cat = pd.get_dummies(X[cat_cols].astype(str), drop_first=True)
    X_processed = pd.concat([X_num.reset_index(drop=True), X_cat.reset_index(drop=True)], axis=1)
else:
    X_processed = X_num

# Restore the original row labels so X_processed stays aligned with y by index,
# not only by position (the imputer returns a bare array and loses them).
X_processed.index = X.index

print('Processed feature shape:', X_processed.shape)

Numeric cols: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
Categorical cols: []
Processed feature shape: (253680, 21)


In [4]:
# 4) Train-test split
# Hold out 20% of the rows for testing with a fixed seed. The target is passed as the
# stratification variable whenever it has more than one distinct value.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=(y if len(np.unique(y)) > 1 else None),
)
print(f"Train shape: {X_train.shape} Test shape: {X_test.shape}")

Train shape: (202944, 21) Test shape: (50736, 21)


In [5]:
# Inspect the target: which labels occur, and how many rows carry each one
print(f"Unique labels: {np.unique(y)}")
print(f"Label counts:\n {y.value_counts()}")


Unique labels: [0. 1. 2.]
Label counts:
 Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64


In [6]:
# 5) Train Logistic Regression on full data (with scaling in a pipeline)
#
# Fits StandardScaler + LogisticRegression on the training split and evaluates on the
# test split. Names left behind for later cells:
#   pipe_full, train_time_full, y_pred_full, acc_full, proba_full, roc_full
# roc_full is None when ROC AUC cannot be computed for this test split.

# If y is non-numeric (strings/categories), encode it as integers so sklearn is happy
if y.dtype == 'object' or y.dtype.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)
    # re-split because we changed y
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.2, random_state=42, stratify=y if len(np.unique(y))>1 else None
    )

pipe_full = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression(max_iter=1000, solver='lbfgs'))])

# perf_counter() is a monotonic, high-resolution clock meant for measuring short intervals;
# time.time() follows the wall clock and can jump or have coarse resolution.
start = time.perf_counter()
pipe_full.fit(X_train, y_train)
end = time.perf_counter()
train_time_full = end - start

y_pred_full = pipe_full.predict(X_test)
acc_full = accuracy_score(y_test, y_pred_full)

# ROC AUC for binary or multiclass targets.
# The columns of predict_proba follow pipe_full.classes_ (the classes seen in training),
# so the binary/multiclass decision is made from the probability matrix, not from y_test.
proba_full = pipe_full.predict_proba(X_test)
unique_labels = np.unique(y_test)

roc_full = None
if len(unique_labels) < 2:
    print("WARNING: y_test contains only one class:", unique_labels,
          "\nROC AUC cannot be computed for a single-class test set.")
else:
    try:
        if proba_full.shape[1] == 2:
            # binary: score with the probability of the second (greater) class label
            roc_full = roc_auc_score(y_test, proba_full[:, 1])
        else:
            # multiclass: one-vs-rest, with the column -> class mapping made explicit
            roc_full = roc_auc_score(y_test, proba_full, multi_class='ovr', labels=pipe_full.classes_)
    except ValueError as e:
        # e.g. a class that was seen in training is absent from y_test
        print("Could not compute ROC AUC. Error:", e)

print(f"Full-data training time: {train_time_full:.4f} seconds")
print(f"Full-data accuracy: {acc_full:.4f}")
if roc_full is not None:
    print(f"Full-data ROC AUC: {roc_full:.4f}")
else:
    print("Full-data ROC AUC: Not available")
print('\nClassification report (full data):')
# zero_division=0 reports 0.00 for classes that are never predicted (same numbers as
# before) without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred_full, zero_division=0))


Full-data training time: 0.7651 seconds
Full-data accuracy: 0.8455
Full-data ROC AUC: 0.7808

Classification report (full data):
              precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.51      0.17      0.26      7069

    accuracy                           0.85     50736
   macro avg       0.46      0.38      0.39     50736
weighted avg       0.80      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [7]:
# Confirm that both splits contain every class after the train/test split
print(f"Unique labels in y_train: {np.unique(y_train)}")
print(f"Unique labels in y_test: {np.unique(y_test)}")


Unique labels in y_train: [0. 1. 2.]
Unique labels in y_test: [0. 1. 2.]


In [8]:
# -------------------------
# Block 6: Random Projection + retrain (robust ROC handling)
# -------------------------
# Projects the features to k dimensions with a Gaussian random projection, retrains the
# same StandardScaler + LogisticRegression pipeline on the projected data, and evaluates
# it on the test split. Names left behind for later cells:
#   k, rp, X_train_rp, X_test_rp, pipe_rp, train_time_rp, y_pred_rp, acc_rp, roc_rp
# roc_rp is None when ROC AUC cannot be computed for this test split.

# Make sure X_train, X_test, y_train, y_test exist and target is what you expect
print("Check unique labels before RP - train:", np.unique(y_train), " test:", np.unique(y_test))

# Choose k safely (you can change this): half of d, but at least 10 and never more than d
d = X_train.shape[1]
k = min(d, max(10, d // 2))
print('Original dimension d =', d, 'Using k =', k)

# Create RP (fit on the training split only) and transform both splits.
# The projection is timed separately: it is extra work the full-data model does not
# need, so it belongs next to train_time_rp when judging the overall speed-up.
start = time.perf_counter()
rp = GaussianRandomProjection(n_components=k, random_state=42)
X_train_rp = rp.fit_transform(X_train)
X_test_rp = rp.transform(X_test)
project_time_rp = time.perf_counter() - start
print('Reduced shapes:', X_train_rp.shape, X_test_rp.shape)

# Train logistic regression on reduced data (scale then train)
pipe_rp = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, solver='lbfgs'))
])

# perf_counter() is a monotonic, high-resolution clock meant for measuring short intervals
start = time.perf_counter()
pipe_rp.fit(X_train_rp, y_train)
end = time.perf_counter()
train_time_rp = end - start

y_pred_rp = pipe_rp.predict(X_test_rp)
acc_rp = accuracy_score(y_test, y_pred_rp)

# Compute ROC AUC robustly:
unique_labels = np.unique(y_test)
roc_rp = None
if len(unique_labels) == 1:
    print("WARNING: y_test contains only one class:", unique_labels, 
          "\nROC AUC cannot be computed for a single-class test set. Consider rerunning train_test_split with different random_state or larger test_size.")
else:
    proba = pipe_rp.predict_proba(X_test_rp)
    try:
        if proba.shape[1] == 2:
            # binary case, whatever the two label values are: score with the
            # probability of the second (greater) class label
            roc_rp = roc_auc_score(y_test, proba[:, 1])
        else:
            # multiclass case: full probability matrix, one-vs-rest, with the
            # column -> class mapping made explicit
            roc_rp = roc_auc_score(y_test, proba, multi_class='ovr', labels=pipe_rp.classes_)
    except ValueError as e:
        # e.g. a class that was seen in training is absent from y_test
        print("Could not compute ROC AUC. Error:", e)

# Print results
print(f"RP projection time (train + test): {project_time_rp:.4f} seconds")
print(f"RP-data training time: {train_time_rp:.4f} seconds")
print(f"RP-data accuracy: {acc_rp:.4f}")
if roc_rp is not None:
    print(f"RP-data ROC AUC: {roc_rp:.4f}")
else:
    print("RP-data ROC AUC: Not available")

print('\nClassification report (RP data):')
# zero_division=0 reports 0.00 for classes that are never predicted (same numbers as
# before) without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred_rp, zero_division=0))


Check unique labels before RP - train: [0. 1. 2.]  test: [0. 1. 2.]
Original dimension d = 21 Using k = 10
Reduced shapes: (202944, 10) (50736, 10)
RP-data training time: 1.3665 seconds
RP-data accuracy: 0.8422
RP-data ROC AUC: 0.7353

Classification report (RP data):
              precision    recall  f1-score   support

         0.0       0.85      0.98      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.48      0.09      0.16      7069

    accuracy                           0.84     50736
   macro avg       0.44      0.36      0.36     50736
weighted avg       0.78      0.84      0.79     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
# 7) Compare and summarize results
# Relative change in training time, as a percentage of the full-data time
# (positive = RP trained faster, negative = RP trained slower).
if train_time_full > 0:
    improvement = (train_time_full - train_time_rp) / train_time_full * 100
else:
    improvement = 0

print(f"Training time full: {round(train_time_full, 4)} seconds")
print(f"Training time after RP: {round(train_time_rp, 4)} seconds")
print(f'Improvement in training time: {improvement:.2f}%')

print(f"\nAccuracy (full): {round(acc_full, 4)}")
print(f"Accuracy (RP): {round(acc_rp, 4)}")

# One record per experiment; the full-data record has no 'k', so that cell is left empty
full_record = {
    'method': 'full',
    'train_time': train_time_full,
    'accuracy': acc_full,
    'roc_auc': roc_full,
}
rp_record = {
    'method': 'random_projection',
    'k': k,
    'train_time': train_time_rp,
    'accuracy': acc_rp,
    'roc_auc': roc_rp,
}
summary = pd.DataFrame([full_record, rp_record])
summary.to_csv('training_comparison_summary.csv', index=False)
print('\nSaved summary to training_comparison_summary.csv')

Training time full: 0.7651 seconds
Training time after RP: 1.3665 seconds
Improvement in training time: -78.60%

Accuracy (full): 0.8455
Accuracy (RP): 0.8422

Saved summary to training_comparison_summary.csv


## Conclusion

- This notebook trains logistic regression on the full feature set, applies Random Projection to reduce dimensions, retrains the model, and compares training time and accuracy.
- Edit `TARGET_COLUMN` and `DATA_PATH` at the top if your dataset uses different names.

## Next steps / Notes

- Try different values of `k` in Random Projection and report the performance curve.
- You can also test other randomized scaling techniques (Feature Hashing, Sampling) if required.
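- For slides, report the asymptotic running time: each optimization pass of Logistic Regression costs ~ O(n * d); after RP it becomes O(n * k) with k << d, plus a one-time O(n * d * k) cost for the projection itself. Note that with only d = 21 features (k = 10) the measured training time did not improve in this run (0.77 s on the full data vs. 1.37 s after RP), so a speed-up should only be expected on high-dimensional data.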