## MSDS 7331 Mini Lab Two: Logistic Regression and Support Vector Machine

### Authors: Jaren Shead, Kristin Henderson, Tom Hines

### Setup and Data Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re

%matplotlib inline

In [2]:
df = pd.read_csv('data/diabetes+130-us+hospitals+for+years+1999-2008/diabetic_data.csv')

df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


### Data Cleaning

In [3]:
# Replace `?` with NaN for now
df_clean = df.copy()               # make a copy of df called df_clean and then clean it
df_clean = df.replace('?', np.nan)

In [4]:
# Replace NaN ('?') with 'Unknown' in the specified columns
columns_to_update_1 = ['medical_specialty', 'payer_code', 'race']
df_clean[columns_to_update_1] = df_clean[columns_to_update_1].replace(np.nan, 'Unknown')

# Replace NaN ('?') with 'Unknown' in the specified columns
columns_to_update_2 = ['diag_1', 'diag_2', 'diag_3']
df_clean[columns_to_update_2] = df_clean[columns_to_update_2].replace(np.nan, 'Unknown/None')

# Replace NaN with 'Untested' in the specified columns
columns_to_update_3 = ['max_glu_serum', 'A1Cresult']
df_clean[columns_to_update_3] = df_clean[columns_to_update_3].replace(np.nan, 'Untested')

In [5]:
# Convert categorical variables `patient_nbr`, `admission_type_id`, `discharge_disposition_id`, `admission_source_id`
# from integer to object datatype.
categoricalInt_cols = ['patient_nbr', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']
df_clean[categoricalInt_cols] = df_clean[categoricalInt_cols].astype('category')
df_clean[['patient_nbr', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']].dtypes

patient_nbr                 category
admission_type_id           category
discharge_disposition_id    category
admission_source_id         category
dtype: object

In [6]:
# Remove encounter id, examide, citoglipton and weight from the dataset.
df_clean = df_clean.drop(columns=['encounter_id'])            # id variable
df_clean = df_clean.drop(columns=['examide', 'citoglipton'])  # zero variance
df_clean = df_clean.drop(columns=['weight'])                  # high percent missing

print( df_clean.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 46 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   patient_nbr               101766 non-null  category
 1   race                      101766 non-null  object  
 2   gender                    101766 non-null  object  
 3   age                       101766 non-null  object  
 4   admission_type_id         101766 non-null  category
 5   discharge_disposition_id  101766 non-null  category
 6   admission_source_id       101766 non-null  category
 7   time_in_hospital          101766 non-null  int64   
 8   payer_code                101766 non-null  object  
 9   medical_specialty         101766 non-null  object  
 10  num_lab_procedures        101766 non-null  int64   
 11  num_procedures            101766 non-null  int64   
 12  num_medications           101766 non-null  int64   
 13  number_outpatient         101

In [7]:
# Find the proportion of each response class.
print(df_clean['readmitted'].value_counts()/df_clean['readmitted'].count())

readmitted
NO     0.539119
>30    0.349282
<30    0.111599
Name: count, dtype: float64


The response class is unbalanced with 54% patient records not being readmitted, 35% being readmitted in greater than 30 days, and 11% being readmitted in less than 30 days.

#### Data preprocessing: Explicitly set the order of ordinal variables

In [8]:
# Define the correct order for each variable
readmit_order = ['<30', '>30', 'NO']
drug_order = ['No', 'Down', 'Steady', 'Up']
max_glu_serum_order = ['Untested', 'Norm', '>200', '>300']
a1cresult_order = ['Untested', 'Norm', '>7', '>8']
age_order = ['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
             '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)'] 

# List of drug-related variables
drug_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
                'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'tolazamide', 
                'pioglitazone', 'rosiglitazone', 'troglitazone', 'acarbose', 'miglitol', 
                'insulin', 'glyburide-metformin', 'glipizide-metformin',
                'metformin-rosiglitazone', 'metformin-pioglitazone', 'glimepiride-pioglitazone']

# Reorder categories in the DataFrame
df_clean['readmitted'] = pd.Categorical(df_clean['readmitted'], categories=readmit_order, ordered=True)
df_clean['max_glu_serum'] = pd.Categorical(df_clean['max_glu_serum'], categories=max_glu_serum_order, ordered=True)
df_clean['A1Cresult'] = pd.Categorical(df_clean['A1Cresult'], categories=a1cresult_order, ordered=True)
df_clean['age'] = pd.Categorical(df_clean['age'], categories=age_order, ordered=True)

for col in drug_columns:
    if col in df_clean.columns:
        df_clean[col] = pd.Categorical(df_clean[col], categories=drug_order, ordered=True)

#### One Hot Encoding the Full Dataset for SVG Logistic Regression

In [9]:
# One Hot Encoding (drop first for logistic regression

# Remove columns with many missing values and patient IDs
columns_to_drop = ['patient_nbr'] #, 'max_glu_serum', 'A1Cresult']
df_encoded = df_clean.drop(columns= columns_to_drop)             # make a copy of df_clean called df_encoded dropping patient_nbr (id variable)

# Preprocess diag_1, diag_2, diag_3 combining all codes with decimals under their integer values 
for col in ['diag_1', 'diag_2', 'diag_3']:
    df_encoded[col] = df_encoded[col].str.split('.').str[0]  # Drop decimals and digits after
# print(df_encoded[['diag_1', 'diag_2', 'diag_3']].head(20))

# # check for missing values
# print(df_encoded.isna().sum())
# # Drop rows with NaN values
# df_encoded = df_encoded.dropna()

# Remove target (split into X and y, for plotting with y later)
X_df_encoded = df_encoded.drop(columns='readmitted')
y_df_encoded = df_encoded['readmitted']

# Handle categorical variables with one-hot encoding
categorical_columns = X_df_encoded.select_dtypes(include=['object', 'category']).columns
X_df_encoded = pd.get_dummies(X_df_encoded, columns=categorical_columns, drop_first=True) # drop_first for multicollinearity issues


In [10]:
X_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Columns: 2387 entries, time_in_hospital to diabetesMed_Yes
dtypes: bool(2379), int64(8)
memory usage: 237.1 MB


In [17]:
# Feature Selection with RFECV

from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Identify numerical features (categoricals are already one-hot encoded)
numeric_features = X_df_encoded.select_dtypes(include=['int64', 'float64']).columns
all_features = X_df_encoded.columns  # One-hot encoded features are already included

# Scale numerical features but keep categorical features unchanged
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features)  # Scale only numerical features
], remainder='passthrough')  # Keep categorical (one-hot encoded) features unchanged

# Apply transformation
X_scaled = preprocessor.fit_transform(X_df_encoded)

# Ensure `feature_names` correctly represents all transformed features
feature_names = list(all_features)

# Ensure feature names match the dataset shape
assert len(feature_names) == X_scaled.shape[1], f"Feature names length ({len(feature_names)}) does not match dataset shape ({X_scaled.shape[1]})!"

# Split into train (80%) and test (20%) - Stratified
Xrfecv_train, Xrfecv_test, yrfecv_train, yrfecv_test = train_test_split(X_scaled, y_df_encoded, test_size=0.2, stratify=y_df_encoded, random_state=1234)

# Define a fast logistic regression model with L1 regularization
sgd_model = SGDClassifier(loss='log_loss', penalty='l1', max_iter=1000, random_state=1234)

# Apply RFECV to automatically select the optimal number of features
rfecv = RFECV(estimator=sgd_model, step=0.2, cv=5, scoring='accuracy', n_jobs=-1)  # Adjust scoring if needed
X_rfecv = rfecv.fit_transform(Xrfecv_train, yrfecv_train)

# Get selected feature mask
selected_mask = rfecv.support_

# Get selected feature names
selected_features_rfecv = np.array(feature_names)[selected_mask]

print(f"\nOptimal number of features selected: {rfecv.n_features_}")
print("\nSelected Features from RFECV:")
print(selected_features_rfecv)



Selected Features from RFE:
['discharge_disposition_id_11' 'discharge_disposition_id_13'
 'discharge_disposition_id_14'
 'medical_specialty_Endocrinology-Metabolism' 'diag_1_227' 'diag_1_242'
 'diag_1_378' 'diag_1_967' 'diag_2_421' 'diag_2_513']


#### Reduce Dataset to Important Variables for Logistic Regression

In [11]:
rf_features_js = ['num_lab_procedures', 'diag_1', 'diag_2', 'diag_3', 'num_medications', 'time_in_hospital', 'age', 
                  'number_inpatient', 'medical_specialty', 'discharge_disposition_id', 'payer_code', 'num_procedures', 
                  'number_diagnoses', 'admission_type_id', 'admission_source_id']
rf_features_kh = ['num_lab_procedures', 'num_medications', 'time_in_hospital', 'number_inpatient', 'number_diagnoses', 
                  'num_procedures', 'number_outpatient', 'number_emergency', 'diag_3', 'gender', 'diag_1', 'medical_specialty', 
                  'diag_2', 'payer_code', 'race', 'discharge_disposition_id']

# Convert lists to sets
set_js = set(rf_features_js)
set_kh = set(rf_features_kh)

# List of all unique features (Union)
rf_features_all = list(set_js | set_kh)  # OR use set_js.union(set_kh)

# List of only common features (Intersection)
rf_features_common = list(set_js & set_kh)  # OR use set_js.intersection(set_kh)

# Print results
print(f"All unique features ({len(rf_features_all)}):\n", rf_features_all)
print(f"\nFeatures in both lists ({len(rf_features_common)}):\n", rf_features_common)

All unique features (19):
 ['diag_1', 'num_procedures', 'age', 'time_in_hospital', 'admission_source_id', 'num_medications', 'gender', 'number_inpatient', 'payer_code', 'admission_type_id', 'diag_2', 'diag_3', 'medical_specialty', 'number_outpatient', 'discharge_disposition_id', 'number_emergency', 'race', 'number_diagnoses', 'num_lab_procedures']

Features in both lists (12):
 ['diag_2', 'medical_specialty', 'diag_3', 'num_procedures', 'diag_1', 'discharge_disposition_id', 'time_in_hospital', 'number_diagnoses', 'num_medications', 'number_inpatient', 'num_lab_procedures', 'payer_code']


In [12]:
# Make a copy of the cleaned dataset to reduce further
df_reduced = df_clean.copy()

# Preprocess diag_1, diag_2, diag_3 combining all codes with decimals under their integer values 
for col in ['diag_1', 'diag_2', 'diag_3']:
    df_reduced[col] = df_reduced[col].str.split('.').str[0]  # Drop decimals and digits after

# split into X and y
X_reduced = df_reduced.drop(columns='readmitted')
y_reduced = df_reduced['readmitted']

# List of important features to keep in reduced dataset
top_features = rf_features_all
# top_features = rf_features_common # option for a smaller feature list if needed

# Keep only selected features BEFORE one-hot encoding
X_reduced = X_reduced[top_features]
X_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   diag_1                    101766 non-null  object  
 1   num_procedures            101766 non-null  int64   
 2   age                       101766 non-null  category
 3   time_in_hospital          101766 non-null  int64   
 4   admission_source_id       101766 non-null  category
 5   num_medications           101766 non-null  int64   
 6   gender                    101766 non-null  object  
 7   number_inpatient          101766 non-null  int64   
 8   payer_code                101766 non-null  object  
 9   admission_type_id         101766 non-null  category
 10  diag_2                    101766 non-null  object  
 11  diag_3                    101766 non-null  object  
 12  medical_specialty         101766 non-null  object  
 13  number_outpatient         101

#### Encode Reduced Dataset

In [13]:
# Select categorical columns in the reduced dataset
categorical_columns = X_reduced.select_dtypes(include=['object', 'category']).columns

# Apply One-Hot Encoding
X_reduced_encoded = pd.get_dummies(X_reduced, columns=categorical_columns, drop_first=True)  # drop_first=True avoids multicollinearity

print(f"Final shape after one-hot encoding: {X_reduced_encoded.shape}")

X_reduced_encoded.info()

Final shape after one-hot encoding: (101766, 2316)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Columns: 2316 entries, num_procedures to race_Unknown
dtypes: bool(2308), int64(8)
memory usage: 230.2 MB


#### Standardize (only numerical variables) for `saga`

In [14]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns only
num_features = X_reduced_encoded.select_dtypes(include=['int64', 'float64']).columns

sc = StandardScaler()

# # Scale all features
# X_reduced_scaled = sc.fit_transform(X_reduced_encoded)  # Scale all features (numeric + encoded)

# Scale only numerical features
X_reduced_scaled_num = sc.fit_transform(X_reduced_encoded[num_features])  # Scale only numerical features
# Convert back to DataFrame
X_reduced_scaled_num = pd.DataFrame(X_reduced_scaled_num, columns=num_features, index=X_reduced_encoded.index)
# Keep categorical (one-hot encoded) variables unchanged
X_reduced_scaled = pd.concat([X_reduced_scaled_num, X_reduced_encoded.drop(columns=num_features)], axis=1)


#### Train/Test Splits for Full Dataset and for Reduced Dataset

In [15]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
import numpy as np

# Convert DataFrames to numpy arrays, full dataset
y = y_df_encoded.values
X = X_df_encoded.values
# Split into train (80%) and test (20%) - Stratified
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1234)


# Dataset with reduced features
y_reduced = y_reduced.values # convert to np.array if only num features scaled?
# X_reduced = X_reduced_scaled # np.array if all features scaled
X_reduced = X_reduced_scaled.values # dataframe if only num features scaled
Xred_train, Xred_test, yred_train, yred_test = train_test_split(X_reduced, y_reduced, test_size=0.2, stratify=y_reduced, random_state=1234)

#### Logistic Regression

Try different solvers: saga which may need more iters and scaled data, and lbfgs.  
Also, try reducing the features to use only the important variables from random forest from EDA.

In [16]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics as mt
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [37]:
# Use a Logistic Regression SGD Classifier with reduced dataset

# Initialize SGD Classifier for Logistic Regression
sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", 
                        max_iter=1000, class_weight="balanced",
                        learning_rate="optimal",
                        n_jobs=-1, random_state=1234)

# Perform K-Fold Cross-Validation
num_folds = 5
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234)

cv_accuracies = []

for train_idx, val_idx in cv_object.split(Xred_train, yred_train):
    X_train_fold, X_val_fold = Xred_train[train_idx], Xred_train[val_idx]
    y_train_fold, y_val_fold = yred_train[train_idx], yred_train[val_idx]

    # Train on training fold
    sgd_clf.fit(X_train_fold, y_train_fold)

    # Validate on validation fold
    y_val_pred = sgd_clf.predict(X_val_fold)
    acc = mt.accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
sgd_clf.fit(Xred_train, yred_train)

# Evaluate on Independent Test Set
y_test_pred = sgd_clf.predict(Xred_test)
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print("Results from model with 'optimal' learning_rate parameter.")
print(f"Model converged in {sgd_clf.n_iter_} iterations.")
print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(classification_report(yred_test, y_test_pred, target_names=['<30', '>30', 'NO']))

Cross-Validation Mean Accuracy: 0.558
Results from model with 'optimal' learning_rate parameter.
Model converged in 23 iterations.
Final Model Test Accuracy: 0.563
Confusion Matrix:
[[ 356 1042  874]
 [ 490 3595 3024]
 [ 509 2952 7512]]
              precision    recall  f1-score   support

         <30       0.26      0.16      0.20      2272
         >30       0.47      0.51      0.49      7109
          NO       0.66      0.68      0.67     10973

    accuracy                           0.56     20354
   macro avg       0.46      0.45      0.45     20354
weighted avg       0.55      0.56      0.55     20354



In [26]:
# Define hyperparameter grid
param_grid = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'eta0': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'learning_rate': ['adaptive'],
    'max_iter': [1000],  # Adjust if needed
}

# Initialize SGDClassifier
sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", n_jobs=-1, random_state=1234)

# Grid search with cross-validation
grid_search = GridSearchCV(sgd_clf, param_grid, cv=3, scoring="accuracy", verbose=2)
grid_search.fit(Xred_train, yred_train)

# Best parameters and performance
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.3f}")


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END alpha=1e-05, eta0=0.0001, learning_rate=adaptive, max_iter=1000; total time=   8.8s
[CV] END alpha=1e-05, eta0=0.0001, learning_rate=adaptive, max_iter=1000; total time=   8.7s
[CV] END alpha=1e-05, eta0=0.0001, learning_rate=adaptive, max_iter=1000; total time=   8.7s
[CV] END alpha=1e-05, eta0=0.001, learning_rate=adaptive, max_iter=1000; total time=  11.6s
[CV] END alpha=1e-05, eta0=0.001, learning_rate=adaptive, max_iter=1000; total time=  11.2s
[CV] END alpha=1e-05, eta0=0.001, learning_rate=adaptive, max_iter=1000; total time=  11.2s
[CV] END alpha=1e-05, eta0=0.01, learning_rate=adaptive, max_iter=1000; total time=  13.5s
[CV] END alpha=1e-05, eta0=0.01, learning_rate=adaptive, max_iter=1000; total time=  12.4s
[CV] END alpha=1e-05, eta0=0.01, learning_rate=adaptive, max_iter=1000; total time=  12.9s
[CV] END alpha=1e-05, eta0=0.1, learning_rate=adaptive, max_iter=1000; total time=  15.6s
[CV] END alpha=1e-05,

In [27]:
# Checking the order of response classes
print(np.unique(yred_test))

['<30' '>30' 'NO']


In [40]:
# Use a Logistic Regression SGD Classifier with reduced dataset

# Initialize SGD Classifier for Logistic Regression
sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", 
                        max_iter=1000, class_weight="balanced", # balanced class weight decreases overall accuracy but improves performance for <30 and >30 classes
                        learning_rate="adaptive", eta0=1.0,     # adaptive seems to perform better than optimal
                        n_jobs=-1, random_state=1234)

# Perform K-Fold Cross-Validation
num_folds = 5
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234)

cv_accuracies = []

for train_idx, val_idx in cv_object.split(Xred_train, yred_train):
    X_train_fold, X_val_fold = Xred_train[train_idx], Xred_train[val_idx]
    y_train_fold, y_val_fold = yred_train[train_idx], yred_train[val_idx]

    # Train on training fold
    sgd_clf.fit(X_train_fold, y_train_fold)

    # Validate on validation fold
    y_val_pred = sgd_clf.predict(X_val_fold)
    acc = mt.accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
sgd_clf.fit(Xred_train, yred_train)

# Evaluate on Independent Test Set
y_test_pred = sgd_clf.predict(Xred_test)
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print(f"Model converged in {sgd_clf.n_iter_} iterations.")
print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(classification_report(yred_test, y_test_pred, target_names=['<30', '>30', 'NO']))


Cross-Validation Mean Accuracy: 0.562
Model converged in 73 iterations.
Final Model Test Accuracy: 0.569
Confusion Matrix:
[[ 477  823  972]
 [ 674 2989 3446]
 [ 640 2209 8124]]
              precision    recall  f1-score   support

         <30       0.27      0.21      0.23      2272
         >30       0.50      0.42      0.46      7109
          NO       0.65      0.74      0.69     10973

    accuracy                           0.57     20354
   macro avg       0.47      0.46      0.46     20354
weighted avg       0.55      0.57      0.56     20354



In [41]:
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from sklearn import metrics as mt
from sklearn.model_selection import StratifiedKFold

# Define mini-batch size
batch_size = 256
num_epochs = 10
num_folds = 3

# Perform K-Fold Cross-Validation
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234)

cv_accuracies = []

for train_idx, val_idx in cv_object.split(Xred_train, yred_train):
    X_train_fold, X_val_fold = Xred_train[train_idx], Xred_train[val_idx]
    y_train_fold, y_val_fold = yred_train[train_idx], yred_train[val_idx]

    # Reinitialize model in each fold (important!)
    sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", max_iter=1, learning_rate="optimal", n_jobs=-1, random_state=1234, warm_start=True)

    # Mini-batch SGD Training (Per Fold)
    for epoch in range(num_epochs):
        X_train_fold, y_train_fold = shuffle(X_train_fold, y_train_fold, random_state=epoch)
        for i in range(0, X_train_fold.shape[0], batch_size):
            X_batch = X_train_fold[i:i+batch_size]
            y_batch = y_train_fold[i:i+batch_size]
            sgd_clf.partial_fit(X_batch, y_batch, classes=np.unique(yred_train))

    # Validate on validation fold
    y_val_pred = sgd_clf.predict(X_val_fold)
    acc = mt.accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", max_iter=1, learning_rate="optimal", n_jobs=-1, random_state=1234, warm_start=True)

for epoch in range(num_epochs):
    Xred_train, yred_train = shuffle(Xred_train, yred_train, random_state=epoch)
    for i in range(0, Xred_train.shape[0], batch_size):
        X_batch = Xred_train[i:i+batch_size]
        y_batch = yred_train[i:i+batch_size]
        sgd_clf.partial_fit(X_batch, y_batch, classes=np.unique(yred_train))

# Evaluate on Independent Test Set
y_test_pred = sgd_clf.predict(Xred_test)
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(classification_report(yred_test, y_test_pred, target_names=['<30', '>30', 'NO']))


Cross-Validation Mean Accuracy: 0.571
Final Model Test Accuracy: 0.579
Confusion Matrix:
[[  48  876 1348]
 [  38 2798 4273]
 [  19 2010 8944]]


In [42]:
print(classification_report(yred_test, y_test_pred, target_names=['<30', '>30', 'NO']))

              precision    recall  f1-score   support

         <30       0.46      0.02      0.04      2272
         >30       0.49      0.39      0.44      7109
          NO       0.61      0.82      0.70     10973

    accuracy                           0.58     20354
   macro avg       0.52      0.41      0.39     20354
weighted avg       0.55      0.58      0.53     20354



In [17]:
# Use a Logistic Regression SGD Classifier with full dataset

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics as mt

# Initialize SGD Classifier for Logistic Regression
sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", max_iter=500, learning_rate="optimal", n_jobs=-1, random_state=1234)

# Perform K-Fold Cross-Validation
num_folds = 3
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234)

cv_accuracies = []

for train_idx, val_idx in cv_object.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # Train on training fold
    sgd_clf.fit(X_train_fold, y_train_fold)

    # Validate on validation fold
    y_val_pred = sgd_clf.predict(X_val_fold)
    acc = mt.accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
sgd_clf.fit(X_train, y_train)

# Evaluate on Independent Test Set
y_test_pred = sgd_clf.predict(X_test)
test_acc = mt.accuracy_score(y_test, y_test_pred)
conf_matrix = mt.confusion_matrix(y_test, y_test_pred)

print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(classification_report(y_test, y_test_pred, target_names=['<30', '>30', 'NO']))

Cross-Validation Mean Accuracy: 0.535
Final Model Test Accuracy: 0.589
Confusion Matrix:
[[  25  920 1327]
 [  11 2759 4339]
 [   2 1776 9195]]


In [18]:
# Mini-Batch SGD Logistic Regression

from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from sklearn import metrics as mt

# Define mini-batch size
batch_size = 256

# Initialize SGD Classifier for Logistic Regression
sgd_clf = SGDClassifier(loss="log_loss", penalty="l2", max_iter=1, learning_rate="optimal", n_jobs=-1, random_state=1234, warm_start=True)

# Number of epochs (how many times to go through the data)
num_epochs = 10

# Train using mini-batch updates
for epoch in range(num_epochs):
    Xred_train, yred_train = shuffle(Xred_train, yred_train, random_state=epoch)  # Shuffle at the start of each epoch
    for i in range(0, Xred_train.shape[0], batch_size):
        X_batch = Xred_train[i:i+batch_size]
        y_batch = yred_train[i:i+batch_size]
        
        # Perform one mini-batch update
        sgd_clf.partial_fit(X_batch, y_batch, classes=np.unique(yred_train))
    
    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs} completed.")

# Evaluate on the Test Set
y_test_pred = sgd_clf.predict(Xred_test)
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")


Epoch 1/10 completed.
Epoch 2/10 completed.
Epoch 3/10 completed.
Epoch 4/10 completed.
Epoch 5/10 completed.
Epoch 6/10 completed.
Epoch 7/10 completed.
Epoch 8/10 completed.
Epoch 9/10 completed.
Epoch 10/10 completed.
Final Model Test Accuracy: 0.583
Confusion Matrix:
[[  68  733 1471]
 [  47 2267 4795]
 [  27 1420 9526]]


In [16]:
# Perform K-Fold Cross-Validation on Train Set
num_folds = 3
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234) # stratify the response classes

# Initialize logistic regression model
# solver: try saga for very large and sparse datasets and lbfgs (default) for small/medium datasets (liblinear for binary response)
# penalty='l2' and C=1.0 are default, tune max_iter depending on convergence
lr_clf = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=5000, n_jobs=-1)

cv_accuracies = []

# Split the 80% train set into train/validation splits for K-fold CV
for train_idx, val_idx in cv_object.split(Xred_train, yred_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # Train on training fold
    lr_clf.fit(X_train_fold, y_train_fold)

    # Validate on validation fold
    y_val_pred = lr_clf.predict(X_val_fold)
    acc = mt.accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
lr_clf.fit(Xred_train, yred_train)

# Evaluate on Independent Test Set
y_test_pred = lr_clf.predict(Xred_test)
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")

KeyboardInterrupt: 

In [None]:
# Perform K-Fold Cross-Validation on Train Set
num_folds = 3
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234) # stratify the response classes

# Initialize logistic regression model
# solver: try saga for very large and sparse datasets and lbfgs (default) for small/medium datasets (liblinear for binary response)
# penalty='l2' and C=1.0 are default, tune max_iter depending on convergence
lr_clf = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=5000, n_jobs=-1)

cv_accuracies = []

# Split the 80% train set into train/validation splits for K-fold CV
for train_idx, val_idx in cv_object.split(Xred_train, yred_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # Train on training fold
    lr_clf.fit(X_train_fold, y_train_fold)

    # Validate on validation fold
    y_val_pred = lr_clf.predict(X_val_fold)
    acc = mt.accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
lr_clf.fit(Xred_train, yred_train)

# Evaluate on Independent Test Set
y_test_pred = lr_clf.predict(Xred_test)
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")

#### Support Vector Machine Model

In [None]:
from sklearn.svm import SVC

# Let's try a linear model

# Train
# Perform K-Fold Cross-Validation on Train Set
num_folds = 3
cv_object = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1234) # stratify the response classes

# Initialize SVM model
svm = SVC(kernel='linear', random_state=1, C=1.0)

cv_accuracies = []

# Split the 80% train set into train/validation splits for K-fold CV
for train_idx, val_idx in cv_object.split(Xred_train, yred_train):
    X_train_fold, X_val_fold = Xred_train[train_idx], Xred_train[val_idx]
    y_train_fold, y_val_fold = yred_train[train_idx], yred_train[val_idx]

    # Train on training fold
    svm.fit(X_train_fold, y_train_fold)

    # Validate on validation fold
    y_val_pred = svm.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(acc)

print(f"Cross-Validation Mean Accuracy: {np.mean(cv_accuracies):.3f}")

# Train Final Model on Full Training Data
svm.fit(Xred_train, yred_train)

# Predict
# Evaluate on Independent Test Set
y_test_pred = svm.predict(Xred_test)
print('Length y_pred: ', len(y_test_pred))
test_acc = mt.accuracy_score(yred_test, y_test_pred)
conf_matrix = mt.confusion_matrix(yred_test, y_test_pred)

print(f"Final Model Test Accuracy: {test_acc:.3f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(classification_report(yred_test, y_test_pred, target_names=['<30', '>30', 'NO']))