In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install shap

Collecting shap
  Downloading shap-0.46.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (24 kB)
Collecting tqdm>=4.27.0 (from shap)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba (from shap)
  Downloading numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->shap)
  Downloading llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading shap-0.46.0-cp311-cp311-macosx_11_0_arm64.whl (455 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.8/455.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m[31m10.2 MB/s[0m eta [36m0:00

In [5]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Load the data
df = pd.read_csv('./data/Train.csv')

# Handle missing values in the target variable
df = df.dropna(subset=['bank_account'])

# Fill missing values in other columns if necessary
df.fillna(method='ffill', inplace=True)

# One-Hot Encode categorical variables
categorical_features = ['country', 'location_type', 'cellphone_access', 'gender_of_respondent', 
                        'relationship_with_head', 'marital_status', 'education_level', 'job_type']

encoder = OneHotEncoder()
df_encoded = encoder.fit_transform(df[categorical_features]).toarray()  # Convert to dense array

# Identify numerical features
numerical_features = ['household_size', 'age_of_respondent']

# Scale numerical features using MinMaxScaler
scaler = MinMaxScaler()
scaled_numerical_features = scaler.fit_transform(df[numerical_features])

# Combine the scaled numerical features with the one-hot encoded features
X = np.hstack((df_encoded, scaled_numerical_features))

# Extract the target variable
y = df['bank_account'].map({'Yes': 1, 'No': 0})

# Verify that there are no NaNs in y
assert not np.any(pd.isna(y)), "Target variable y contains NaN values"

# Check the distribution of the target variable
print("Class distribution in target variable y:\n", y.value_counts())

# Split the data before applying SMOTE to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the distribution of the resampled target variable
print("Class distribution after applying SMOTE to training set:\n", pd.Series(y_train_resampled).value_counts())

# Model selection and training
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Evaluate models using cross-validation with additional metrics
for name, model in models.items():
    mae_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='neg_mean_absolute_error')
    accuracy_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
    precision_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='precision')
    recall_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='recall')
    f1_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='f1')
    
    print(f'{name} MAE: {-mae_scores.mean()}')
    print(f'{name} Accuracy: {accuracy_scores.mean()}')
    print(f'{name} Precision: {precision_scores.mean()}')
    print(f'{name} Recall: {recall_scores.mean()}')
    print(f'{name} F1 Score: {f1_scores.mean()}')

# Fit the best model (as an example, using RandomForest here)
best_model = RandomForestClassifier()
best_model.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate the final model
predictions = best_model.predict(X_test)

# Check if the model is making constant predictions
unique_predictions = np.unique(predictions)
print(f'Unique predictions: {unique_predictions}')

# If the model is making constant predictions, we need to re-evaluate the approach
if len(unique_predictions) == 1:
    print("Model is making constant predictions. Re-evaluate the approach.")
else:
    mae = mean_absolute_error(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    
    print(f'Random Forest MAE on Test Set: {mae}')
    print(f'Random Forest Accuracy on Test Set: {accuracy}')
    print(f'Random Forest Precision on Test Set: {precision}')
    print(f'Random Forest Recall on Test Set: {recall}')
    print(f'Random Forest F1 Score on Test Set: {f1}')

    # Model Interpretation using SHAP
    import shap

    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test)
    shap.summary_plot(shap_values, X_test)

    # Save the model
    import joblib
    joblib.dump(best_model, 'best_model.pkl')


Class distribution in target variable y:
 bank_account
0    20212
1     3312
Name: count, dtype: int64
Class distribution after applying SMOTE to training set:
 bank_account
0    16169
1    16169
Name: count, dtype: int64
Logistic Regression MAE: 0.22969884975230795
Logistic Regression Accuracy: 0.7703011502476921
Logistic Regression Precision: 0.7841685713637674
Logistic Regression Recall: 0.7459345406188232
Logistic Regression F1 Score: 0.7645539405478707
Decision Tree MAE: 0.13126820347324447
Decision Tree Accuracy: 0.8687319112808962
Decision Tree Precision: 0.8794828814564182
Decision Tree Recall: 0.8551590441873682
Decision Tree F1 Score: 0.8611954159192653
Random Forest MAE: 0.10207678696821379
Random Forest Accuracy: 0.8978304821232653
Random Forest Precision: 0.8980311642373463
Random Forest Recall: 0.8975234713293128
Random Forest F1 Score: 0.8943705397418285
Unique predictions: [0 1]
Random Forest MAE on Test Set: 0.15536663124335812
Random Forest Accuracy on Test Set: 0.844