# Diabetes Project — Phase 2

**Notebook:** Train Logistic Regression, measure training time, apply Random Projection, compare results.

**Instructions:** Put your CSV dataset file named `your_dataset.csv` in the same folder as this notebook before running the cells. If your target column has a different name, update the `TARGET_COLUMN` variable in the preprocessing cell.

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.pipeline import Pipeline

# Load dataset
df = pd.read_csv("E:\\DSc_project\\diabetes_012_health_indicators_BRFSS2015.csv")

# ---- Convert target to binary ----
# 0 → non-diabetic
# 1,2 → diabetic
df['Diabetes_012'] = df['Diabetes_012'].apply(lambda x: 1 if x > 0 else 0)

TARGET_COLUMN = "Diabetes_012"

# Features and target split
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [2]:
# 2) Load data
df = pd.read_csv("E:\\DSc_project\\diabetes_012_health_indicators_BRFSS2015.csv")
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (253680, 22)


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# 3) Basic preprocessing
df_clean = df.copy()

# Drop columns with too many missing values (optional threshold)
threshold = 0.5 * len(df_clean)
cols_to_drop = [c for c in df_clean.columns if df_clean[c].isna().sum() > threshold]
if cols_to_drop:
    print("Dropping columns with many missing values:", cols_to_drop)
    df_clean = df_clean.drop(columns=cols_to_drop)

# Separate X and y
if TARGET_COLUMN not in df_clean.columns:
    raise ValueError(f"Target column '{TARGET_COLUMN}' not found in the dataset columns: {list(df_clean.columns)}")

X = df_clean.drop(columns=[TARGET_COLUMN])
y = df_clean[TARGET_COLUMN]

# Simple imputation and encoding
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)

# Impute numeric
num_imputer = SimpleImputer(strategy='median')
X_num = pd.DataFrame(num_imputer.fit_transform(X[num_cols]), columns=num_cols)

# Encode categorical (one-hot)
if cat_cols:
    X_cat = pd.get_dummies(X[cat_cols].astype(str), drop_first=True)
    X_processed = pd.concat([X_num.reset_index(drop=True), X_cat.reset_index(drop=True)], axis=1)
else:
    X_processed = X_num

print('Processed feature shape:', X_processed.shape)

Numeric cols: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
Categorical cols: []


NameError: name 'SimpleImputer' is not defined

In [None]:
# 4) Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y if len(np.unique(y))>1 else None
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (202944, 21) Test shape: (50736, 21)


In [None]:
results_full = train_and_evaluate_logistic(X_train, X_test, y_train, y_test)
print(results_full['train_time_sec'], results_full['accuracy'], results_full['roc_auc'])
print(results_full['classification_report'])


0.24960756301879883 0.8480960264900662 0.817190252459295
              precision    recall  f1-score   support

           0     0.8652    0.9710    0.9150     42741
           1     0.5520    0.1911    0.2839      7995

    accuracy                         0.8481     50736
   macro avg     0.7086    0.5811    0.5995     50736
weighted avg     0.8158    0.8481    0.8156     50736



In [None]:
# 5) Train Logistic Regression on full data (with scaling in a pipeline)
from sklearn.pipeline import Pipeline

pipe_full = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, solver='lbfgs'))
])

start = time.time()
pipe_full.fit(X_train, y_train)
end = time.time()
train_time_full = end - start

y_pred_full = pipe_full.predict(X_test)
acc_full = accuracy_score(y_test, y_pred_full)

# Now works because y is binary
roc_full = roc_auc_score(y_test, pipe_full.predict_proba(X_test)[:,1])

print(f"Full-data training time: {train_time_full:.4f} seconds")
print(f"Full-data accuracy: {acc_full:.4f}")
print(f"Full-data ROC AUC: {roc_full:.4f}")

print('\nClassification report (full data):')
print(classification_report(y_test, y_pred_full))


Full-data training time: 0.2402 seconds
Full-data accuracy: 0.8481
Full-data ROC AUC: 0.8172

Classification report (full data):
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     42741
           1       0.55      0.19      0.28      7995

    accuracy                           0.85     50736
   macro avg       0.71      0.58      0.60     50736
weighted avg       0.82      0.85      0.82     50736



In [None]:
# 6) Apply Random Projection and retrain
d = X_train.shape[1]
k = min(d, max(10, int(d/2)))

print('Original dimension d =', d, 'Using k =', k)

rp = GaussianRandomProjection(n_components=k, random_state=42)
X_train_rp = rp.fit_transform(X_train)
X_test_rp = rp.transform(X_test)

print('Reduced shapes:', X_train_rp.shape, X_test_rp.shape)

pipe_rp = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression(max_iter=1000, solver='lbfgs'))])

start = time.time()
pipe_rp.fit(X_train_rp, y_train)
end = time.time()
train_time_rp = end - start

y_pred_rp = pipe_rp.predict(X_test_rp)
acc_rp = accuracy_score(y_test, y_pred_rp)
roc_rp = roc_auc_score(y_test, pipe_rp.predict_proba(X_test_rp)[:,1]) if len(np.unique(y_test))>1 else None

print(f"RP-data training time: {train_time_rp:.4f} seconds")
print(f"RP-data accuracy: {acc_rp:.4f}")
if roc_rp is not None:
    print(f"RP-data ROC AUC: {roc_rp:.4f}")
print('\nClassification report (RP data):')
print(classification_report(y_test, y_pred_rp))

Original dimension d = 21 Using k = 10
Reduced shapes: (202944, 10) (50736, 10)
RP-data training time: 0.6722 seconds
RP-data accuracy: 0.8434
RP-data ROC AUC: 0.7681

Classification report (RP data):
              precision    recall  f1-score   support

           0       0.85      0.98      0.91     42741
           1       0.52      0.11      0.18      7995

    accuracy                           0.84     50736
   macro avg       0.69      0.54      0.55     50736
weighted avg       0.80      0.84      0.80     50736



In [None]:
# 7) Compare and summarize results
improvement = (train_time_full - train_time_rp) / train_time_full * 100 if train_time_full>0 else 0
print('Training time full:', round(train_time_full,4), 'seconds')
print('Training time after RP:', round(train_time_rp,4), 'seconds')
print(f'Improvement in training time: {improvement:.2f}%')

print('\nAccuracy (full):', round(acc_full,4))
print('Accuracy (RP):', round(acc_rp,4))

summary = pd.DataFrame([{
    'method': 'full',
    'train_time': train_time_full,
    'accuracy': acc_full,
    'roc_auc': roc_full
},{
    'method': 'random_projection',
    'k': k,
    'train_time': train_time_rp,
    'accuracy': acc_rp,
    'roc_auc': roc_rp
}])
summary.to_csv('training_comparison_summary.csv', index=False)
print('\nSaved summary to training_comparison_summary.csv')

Training time full: 0.228 seconds
Training time after RP: 0.6722 seconds
Improvement in training time: -194.79%

Accuracy (full): 0.8481
Accuracy (RP): 0.8434

Saved summary to training_comparison_summary.csv


## Conclusion

- This notebook trains logistic regression on the full feature set, applies Random Projection to reduce dimensions, retrains the model, and compares training time and accuracy.
- Edit `TARGET_COLUMN` and `DATA_PATH` at the top if your dataset uses different names.

## Next steps / Notes

- Try different values of `k` in Random Projection and report the performance curve.
- You can also test other randomized scaling techniques (Feature Hashing, Sampling) if required.
- For slides, report the asymptotic running time: Logistic Regression ~ O(n * d); after RP it becomes O(n * k) with k << d.