In [10]:
import pandas as pd

df = pd.read_csv('../final_data/clean_model_data.csv')

missing_summary = df.isnull().sum()
missing_percentage = (df.isnull().mean()) * 100

# Combine results into a summary dataframe
summary = pd.DataFrame({
    "Total Missing": missing_summary,
    "Percentage Missing": missing_percentage
}).sort_values(by="Total Missing", ascending=False)

summary.head()

Unnamed: 0,Total Missing,Percentage Missing
rafeduc,33700,15.105876
mapa_self,26407,11.836821
rameduc,21589,9.677174
climsa_self,21331,9.561526
mealsa_self,18080,8.10428


In [11]:
# target and features
y = df['nhmliv_self_next_wave'].to_numpy()
X = df.iloc[:, list(df.columns).index('nhmliv_self_next_wave')+1:].to_numpy()

In [12]:
### MODEL 1) VANILLA LOGISTIC REGRESSION

import numpy as np
# Cross Validation
from sklearn.model_selection import StratifiedKFold
# Imputing NaNs
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Oversampling Minority Class
from imblearn.over_sampling import RandomOverSampler
# Model
from sklearn.linear_model import LogisticRegression
# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss, f1_score
# Preprocessing continuous scales for Logistic Regression
from sklearn.preprocessing import StandardScaler

# Opting to impute missing values with the mean for Logistic Regression
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter=1000))
])

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=720)

# Results storage
fold_accuracies = []
fold_conf_matrices = []
fold_reports = []
fold_pseudo_r2 = []
fold_f1_scores = []

# Perform Stratified K-Fold CV
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")
    
    # Split the dataset into training and testing subsets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Apply oversampling to the training data only
    ros = RandomOverSampler(random_state=720)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # Fit and Predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=False)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Compute log-likelihoods for McFadden's pseudo R^2
    log_likelihood_full = -log_loss(y_train, pipeline.predict_proba(X_train), normalize=False)
    log_likelihood_null = -log_loss(y_train, np.full_like(y_train, y_train.mean()), normalize=False)
    mcfadden_r2 = 1 - (log_likelihood_full / log_likelihood_null)
    fold_pseudo_r2.append(mcfadden_r2)

    # Store results
    fold_accuracies.append(accuracy)
    fold_conf_matrices.append(conf_matrix)
    fold_reports.append(class_report)
    fold_f1_scores.append(f1)

    # Print results for this fold
    # print(f"Fold Accuracy: {accuracy:.4f}")
    # print(f"Fold F1 Score (Weighted): {f1:.4f}")
    # print("Confusion Matrix:")
    # print(conf_matrix)
    # print("Classification Report:")
    # print(class_report)
    # print(f"McFadden's Pseudo R^2: {mcfadden_r2:.4f}")

# Summarize cross-validation results
print("\n=== Cross-Validation Results ===")
print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}")
print(f"Standard Deviation of Accuracy: {np.std(fold_accuracies):.4f}")
print(f"Mean F1 Score (Weighted): {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1 Score: {np.std(fold_f1_scores):.4f}")
print(f"Mean McFadden's Pseudo R^2: {np.mean(fold_pseudo_r2):.4f}")
print(f"Standard Deviation of McFadden's Pseudo R^2: {np.std(fold_pseudo_r2):.4f}")


=== Fold 1 ===

=== Fold 2 ===

=== Fold 3 ===

=== Fold 4 ===

=== Cross-Validation Results ===
Mean Accuracy: 0.9865
Standard Deviation of Accuracy: 0.0001
Mean F1 Score (Weighted): 0.9806
Standard Deviation of F1 Score: 0.0000
Mean McFadden's Pseudo R^2: 0.2715
Standard Deviation of McFadden's Pseudo R^2: 0.0032


In [None]:
### MODEL 2) ELASTIC NET REGULARIZATION FOR LOGISTIC REGRESSION

# Opting to impute missing values with the mean for Logistic Regression
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=1000))
])

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=720)

# Results storage
fold_accuracies = []
fold_conf_matrices = []
fold_reports = []
fold_pseudo_r2 = []
fold_f1_scores = []

# Perform Stratified K-Fold CV
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")
    
    # Split the dataset into training and testing subsets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Apply oversampling to the training data only
    ros = RandomOverSampler(random_state=720)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # Fit and Predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=False)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Compute log-likelihoods for McFadden's pseudo R^2
    log_likelihood_full = -log_loss(y_train, pipeline.predict_proba(X_train), normalize=False)
    log_likelihood_null = -log_loss(y_train, np.full_like(y_train, y_train.mean()), normalize=False)
    mcfadden_r2 = 1 - (log_likelihood_full / log_likelihood_null)
    fold_pseudo_r2.append(mcfadden_r2)

    # Store results
    fold_accuracies.append(accuracy)
    fold_conf_matrices.append(conf_matrix)
    fold_reports.append(class_report)
    fold_f1_scores.append(f1)

# Summarize cross-validation results
print("\n=== Cross-Validation Results ===")
print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}")
print(f"Standard Deviation of Accuracy: {np.std(fold_accuracies):.4f}")
print(f"Mean F1 Score (Weighted): {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1 Score: {np.std(fold_f1_scores):.4f}")
print(f"Mean McFadden's Pseudo R^2: {np.mean(fold_pseudo_r2):.4f}")
print(f"Standard Deviation of McFadden's Pseudo R^2: {np.std(fold_pseudo_r2):.4f}")


=== Fold 1 ===





=== Fold 2 ===





=== Fold 3 ===





=== Fold 4 ===





=== Cross-Validation Results ===
Mean Accuracy: 0.9865
Standard Deviation of Accuracy: 0.0001
Mean F1 Score (Weighted): 0.9806
Standard Deviation of F1 Score: 0.0001
Mean McFadden's Pseudo R^2: 0.2717
Standard Deviation of McFadden's Pseudo R^2: 0.0031


In [None]:
### MODEL 3) Random Forest

# Model
from sklearn.ensemble import RandomForestClassifier
# Metrics
from sklearn.metrics import mean_squared_error

# Opting to impute missing values with the mean for Random Forest
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('random_forest', RandomForestClassifier(
        random_state=720,
        n_estimators=50,
        max_depth=5,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        oob_score=True,
    ))
])

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=720)

# Results storage
fold_accuracies = []
fold_conf_matrices = []
fold_reports = []
fold_pseudo_r2 = []
fold_f1_scores = []

# Perform Stratified K-Fold CV
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")
    
    # Split the dataset into training and testing subsets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Apply oversampling to the training data only
    ros = RandomOverSampler(random_state=720)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

    # Fit and Predict
    pipeline.fit(X_train_resampled, y_train_resampled)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=False)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Compute McFadden's pseudo R^2
    log_likelihood_full = -log_loss(y_test, y_pred_proba, normalize=False)
    y_test_baseline_prob = np.full((len(y_test), len(np.unique(y_train))), 1 / len(np.unique(y_train)))  # Null model: uniform probabilities
    log_likelihood_null = -log_loss(y_test, y_test_baseline_prob, normalize=False)
    pseudo_r2 = 1 - (log_likelihood_full / log_likelihood_null)
    fold_pseudo_r2.append(pseudo_r2)

    # Store results
    fold_accuracies.append(accuracy)
    fold_conf_matrices.append(conf_matrix)
    fold_reports.append(class_report)
    fold_f1_scores.append(f1)

# Summarize cross-validation results
print("\n=== Cross-Validation Results ===")
print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}")
print(f"Standard Deviation of Accuracy: {np.std(fold_accuracies):.4f}")
print(f"Mean F1 Score (Weighted): {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1 Score: {np.std(fold_f1_scores):.4f}")
print(f"Mean Pseudo R²: {np.mean(fold_pseudo_r2):.4f}")
print(f"Standard Deviation of Pseudo R²: {np.std(fold_pseudo_r2):.4f}")



=== Fold 1 ===

=== Fold 2 ===

=== Fold 3 ===

=== Fold 4 ===

=== Cross-Validation Results ===
Mean Accuracy: 0.8123
Standard Deviation of Accuracy: 0.0019
Mean F1 Score (Weighted): 0.8848
Standard Deviation of F1 Score: 0.0012
Mean Pseudo R²: 0.4199
Standard Deviation of Pseudo R²: 0.0040


In [17]:
### MODEL 4) XGBOOST

# Model
from xgboost import XGBClassifier

# Opting to impute missing values with the mean for XGBoost
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('xgboost', XGBClassifier(
        random_state=720,
        use_label_encoder=False,
        eval_metric='logloss',
        max_depth=5,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=1,
        learning_rate=0.05,
        n_estimators=100  # Lower for faster testing
    ))
])

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=720)

# Results storage
fold_accuracies = []
fold_conf_matrices = []
fold_reports = []
fold_pseudo_r2 = []
fold_f1_scores = []

# Perform Stratified K-Fold CV
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")
    
    # Split the dataset into training and testing subsets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Apply oversampling to the training data only
    ros = RandomOverSampler(random_state=720)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
    
    # Fit and Predict
    pipeline.fit(X_train_resampled, y_train_resampled)
    y_pred = pipeline.predict(X_test)

    # Fit and Predict
    pipeline.fit(X_train_resampled, y_train_resampled)
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=False)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Compute log-likelihoods for McFadden's pseudo R^2
    log_likelihood_full = -log_loss(y_train_resampled, pipeline.predict_proba(X_train_resampled), normalize=False)
    log_likelihood_null = -log_loss(y_train_resampled, np.full_like(y_train_resampled, y_train_resampled.mean()), normalize=False)
    mcfadden_r2 = 1 - (log_likelihood_full / log_likelihood_null)
    fold_pseudo_r2.append(mcfadden_r2)

    # Store results
    fold_accuracies.append(accuracy)
    fold_conf_matrices.append(conf_matrix)
    fold_reports.append(class_report)
    fold_f1_scores.append(f1)

# Summarize cross-validation results
print("\n=== Cross-Validation Results ===")
print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}")
print(f"Standard Deviation of Accuracy: {np.std(fold_accuracies):.4f}")
print(f"Mean F1 Score (Weighted): {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1 Score: {np.std(fold_f1_scores):.4f}")
print(f"Mean McFadden's Pseudo R^2: {np.mean(fold_pseudo_r2):.4f}")
print(f"Standard Deviation of McFadden's Pseudo R^2: {np.std(fold_pseudo_r2):.4f}")


=== Fold 1 ===


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== Fold 2 ===


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== Fold 3 ===


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== Fold 4 ===


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== Cross-Validation Results ===
Mean Accuracy: 0.8513
Standard Deviation of Accuracy: 0.0016
Mean F1 Score (Weighted): 0.9083
Standard Deviation of F1 Score: 0.0009
Mean McFadden's Pseudo R^2: 0.5561
Standard Deviation of McFadden's Pseudo R^2: 0.0045
