# Modelling

In [22]:
! rm -r app

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, precision_recall_curve, auc, balanced_accuracy_score, accuracy_score
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Load dataset
data = pd.read_csv('data/bank.csv')

# Define feature groups
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
binary_features = ['deposit', 'housing', 'default']
categorical_features = ['job', 'marital', 'contact', 'poutcome']
ordinal_features = ['education', 'month']

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for binary data (using OneHotEncoder with drop='if_binary')
binary_transformer = OneHotEncoder(drop='if_binary', sparse=False)

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Preprocessing for Ordinal Features
education_categories = ['unknown', 'primary', 'secondary', 'tertiary']
month_categories = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
ordinal_transformer = OrdinalEncoder(categories=[education_categories, month_categories])

# Combine all transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('bin', binary_transformer, binary_features),
        ('cat', categorical_transformer, categorical_features),
        ('ord', ordinal_transformer, ordinal_features)
    ])

# Split the data into features (X) and target (y)
X = data.drop(columns=['loan'])
y = data['loan'].apply(lambda x: 1 if x == 'yes' else 0)

# First split: 90% for training and test, 10% for holdout (to be evaluated at the end)
X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Preprocess the 90% training set before applying SMOTE
X_train_full_preprocessed = preprocessor.fit_transform(X_train_full)

# Handle class imbalance using SMOTE on the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_full_preprocessed, y_train_full)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)

# Preprocess the holdout set
X_holdout_preprocessed = preprocessor.transform(X_holdout)

# Define pipelines for each model
pipelines = {
    'Logistic Regression': Pipeline(steps=[
        ('classifier', LogisticRegression(solver='liblinear', class_weight='balanced'))
    ]),
    'Decision Tree': Pipeline(steps=[
        ('classifier', DecisionTreeClassifier(random_state=42, max_depth=5, class_weight='balanced'))
    ]),
    'Random Forest': Pipeline(steps=[
        ('classifier', RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5, class_weight='balanced'))
    ]),
    'XGBoost': Pipeline(steps=[
        ('classifier', xgb.XGBClassifier(random_state=42, eval_metric='logloss',
                                         scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train)))
    ])
}

# Model Ensembling using Voting Classifier (soft voting)
voting_pipeline = VotingClassifier(estimators=[
    ('lr', pipelines['Logistic Regression']),
    ('dt', pipelines['Decision Tree']),
    ('rf', pipelines['Random Forest']),
    ('xgb', pipelines['XGBoost'])
], voting='soft')

# Train Voting Classifier
voting_pipeline.fit(X_train, y_train)

# Store model results for F1-Score, PR AUC, Accuracy, and Balanced Accuracy
model_reports = {}
pr_auc_scores = {}
accuracies = {}
balanced_accuracies = {}
pr_curves = {}

for model_name, model_pipeline in pipelines.items():
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]

    # Calculate F1-Score
    model_reports[model_name] = f1_score(y_test, y_pred)

    # Precision-Recall Curve and AUC
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    pr_auc_scores[model_name] = pr_auc
    pr_curves[model_name] = (precision, recall)

    # Calculate Accuracy
    accuracies[model_name] = accuracy_score(y_test, y_pred)

    # Calculate Balanced Accuracy
    balanced_accuracies[model_name] = balanced_accuracy_score(y_test, y_pred)

    # Perform cross-validation for Balanced Accuracy
    cv_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='balanced_accuracy')
    print(f"{model_name} - Cross-validation Balanced Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Add Voting Classifier Results
y_pred_voting = voting_pipeline.predict(X_test)
y_pred_proba_voting = voting_pipeline.predict_proba(X_test)[:, 1]
model_reports['Voting Classifier'] = f1_score(y_test, y_pred_voting)

# Precision-Recall for Voting Classifier
precision_voting, recall_voting, _ = precision_recall_curve(y_test, y_pred_proba_voting)
pr_auc_voting = auc(recall_voting, precision_voting)
pr_auc_scores['Voting Classifier'] = pr_auc_voting

# Accuracy for Voting Classifier
accuracies['Voting Classifier'] = accuracy_score(y_test, y_pred_voting)

# Balanced Accuracy for Voting Classifier
balanced_accuracies['Voting Classifier'] = balanced_accuracy_score(y_test, y_pred_voting)

# Create a DataFrame for the results
df_results = pd.DataFrame({
    'Model': list(model_reports.keys()),
    'F1-Score': list(model_reports.values()),
    'PR AUC': list(pr_auc_scores.values()),
    'Accuracy': list(accuracies.values()),
    'Balanced Accuracy': list(balanced_accuracies.values())
})

# Display the DataFrame for test set
print("Test Set Results:")
print(df_results)

# Evaluate on holdout set
holdout_results = {}
holdout_pr_auc_scores = {}
holdout_accuracies = {}
holdout_balanced_accuracies = {}

for model_name, model_pipeline in pipelines.items():
    y_pred_holdout = model_pipeline.predict(X_holdout_preprocessed)
    y_pred_proba_holdout = model_pipeline.predict_proba(X_holdout_preprocessed)[:, 1]

    # F1-Score on holdout
    holdout_results[model_name] = f1_score(y_holdout, y_pred_holdout)

    # Precision-Recall AUC on holdout
    precision, recall, _ = precision_recall_curve(y_holdout, y_pred_proba_holdout)
    holdout_pr_auc_scores[model_name] = auc(recall, precision)

    # Accuracy on holdout
    holdout_accuracies[model_name] = accuracy_score(y_holdout, y_pred_holdout)

    # Balanced Accuracy on holdout
    holdout_balanced_accuracies[model_name] = balanced_accuracy_score(y_holdout, y_pred_holdout)

# Add Voting Classifier Results on holdout
y_pred_voting_holdout = voting_pipeline.predict(X_holdout_preprocessed)
y_pred_proba_voting_holdout = voting_pipeline.predict_proba(X_holdout_preprocessed)[:, 1]

holdout_results['Voting Classifier'] = f1_score(y_holdout, y_pred_voting_holdout)

# Precision-Recall for Voting Classifier on holdout
precision_voting, recall_voting, _ = precision_recall_curve(y_holdout, y_pred_proba_voting_holdout)
holdout_pr_auc_scores['Voting Classifier'] = auc(recall_voting, precision_voting)

# Accuracy for Voting Classifier on holdout
holdout_accuracies['Voting Classifier'] = accuracy_score(y_holdout, y_pred_voting_holdout)

# Balanced Accuracy for Voting Classifier on holdout
holdout_balanced_accuracies['Voting Classifier'] = balanced_accuracy_score(y_holdout, y_pred_voting_holdout)

# Create a DataFrame for the holdout set results
df_holdout_results = pd.DataFrame({
    'Model': list(holdout_results.keys()),
    'F1-Score': list(holdout_results.values()),
    'PR AUC': list(holdout_pr_auc_scores.values()),
    'Accuracy': list(holdout_accuracies.values()),
    'Balanced Accuracy': list(holdout_balanced_accuracies.values())
})

# Display the DataFrame for holdout set
print("\nHoldout Set Results:")
print(df_holdout_results)

# Select the best model based on Balanced Accuracy
best_model_name = df_holdout_results.loc[df_holdout_results['Balanced Accuracy'].idxmax(), 'Model']
best_model = pipelines[best_model_name] if best_model_name != 'Voting Classifier' else voting_pipeline

print(f"\nBest model based on Balanced Accuracy: {best_model_name}")

# Feature Importance Analysis (for interpretable models)
if best_model_name in ['Logistic Regression', 'Decision Tree', 'Random Forest']:
    feature_names = (
        numerical_features +
        [f"{feat}_1" for feat in binary_features] +
        list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)) +
        ordinal_features
    )

    if best_model_name == 'Logistic Regression':
        importances = best_model.named_steps['classifier'].coef_[0]
    elif best_model_name in ['Decision Tree', 'Random Forest']:
        importances = best_model.named_steps['classifier'].feature_importances_

    feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

# Plot model performance
plt.figure(figsize=(12, 8))
x = range(len(df_results['Model']))
width = 0.2

plt.bar(x, df_results['F1-Score'], width, label='F1-Score', align='center')
plt.bar([i + width for i in x], df_results['PR AUC'], width, label='PR AUC', align='center')
plt.bar([i + 2 * width for i in x], df_results['Accuracy'], width, label='Accuracy', align='center')
plt.bar([i + 3 * width for i in x], df_results['Balanced Accuracy'], width, label='Balanced Accuracy', align='center')

plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Model Performance on Test Set')
plt.xticks([i + 1.5 * width for i in x], df_results['Model'], rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('model_performance_test.png')
plt.close()

# Plot model performance on holdout set
plt.figure(figsize=(12, 8))
x = range(len(df_holdout_results['Model']))
width = 0.2

plt.bar(x, df_holdout_results['F1-Score'], width, label='F1-Score', align='center')
plt.bar([i + width for i in x], df_holdout_results['PR AUC'], width, label='PR AUC', align='center')
plt.bar([i + 2 * width for i in x], df_holdout_results['Accuracy'], width, label='Accuracy', align='center')
plt.bar([i + 3 * width for i in x], df_holdout_results['Balanced Accuracy'], width, label='Balanced Accuracy', align='center')

plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Model Performance on Holdout Set')
plt.xticks([i + 1.5 * width for i in x], df_holdout_results['Model'], rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('model_performance_holdout.png')
plt.close()

# Prepare the holdout set for JSON export
holdout_data = X_holdout.copy()
holdout_data['loan'] = y_holdout

# Convert the holdout set to a list of dictionaries
holdout_list = holdout_data.to_dict('records')

# Export the holdout set as JSON
os.makedirs('app/models/artifacts', exist_ok=True)
with open('app/models/artifacts/validation_set.json', 'w') as f:
    json.dump(holdout_list, f)

print("Validation set exported as JSON successfully!")

# Test The Model
# Create a sample input (make sure the order matches your feature list)
sample_input = pd.DataFrame({
    'age': [40],
    'balance': [1500],
    'day': [15],
    'duration': [300],
    'campaign': [2],
    'pdays': [999],
    'previous': [0],
    'deposit': ['no'],
    'housing': ['yes'],
    'job': ['technician'],
    'marital': ['married'],
    'default': ['no'],
    'contact': ['cellular'],
    'poutcome': ['unknown'],
    'education': ['secondary'],
    'month': ['may']
})

print("\nSample input:")
print(sample_input)

# Preprocess the sample input
sample_preprocessed = preprocessor.transform(sample_input)

# Make a prediction
prediction = best_model.predict(sample_preprocessed)
prediction_proba = best_model.predict_proba(sample_preprocessed)

print(f"\nPrediction: {'Yes' if prediction[0] == 1 else 'No'}")
print(f"Probability of Yes: {prediction_proba[0][1]:.2f}")

# Deploy Model
# Create the artifacts directory if it doesn't exist
os.makedirs('app/models/artifacts', exist_ok=True)

# Export the best model
with open('app/models/artifacts/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('app/models/artifacts/preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

print("Model and preprocessor saved successfully!")

# Print feature order for API input
feature_names = (
    numerical_features +
    binary_features +
    categorical_features +
    ordinal_features
)

print("\nFeature order for API input:")
for i, feature in enumerate(feature_names):
    print(f"{i+1}. {feature}")

# Print the shape of the preprocessed input
print(f"\nShape of preprocessed input: {X_holdout_preprocessed.shape}")

print("\nArtifacts generated:")
print("1. app/models/artifacts/best_model.pkl")
print("2. app/models/artifacts/preprocessor.pkl")
print("3. app/models/artifacts/validation_set.json")
print("4. feature_importance.png (if applicable)")
print("5. model_performance_test.png")
print("6. model_performance_holdout.png")

print("\nExecution complete.")



Logistic Regression - Cross-validation Balanced Accuracy: 0.6482 (+/- 0.0089)
Decision Tree - Cross-validation Balanced Accuracy: 0.7123 (+/- 0.0113)
Random Forest - Cross-validation Balanced Accuracy: 0.7683 (+/- 0.0187)
XGBoost - Cross-validation Balanced Accuracy: 0.9148 (+/- 0.0092)
Test Set Results:
                 Model  F1-Score    PR AUC  Accuracy  Balanced Accuracy
0  Logistic Regression  0.675177  0.662233  0.650315           0.650330
1        Decision Tree  0.720044  0.819830  0.709677           0.709685
2        Random Forest  0.791074  0.884040  0.773048           0.773065
3              XGBoost  0.919352  0.972312  0.923077           0.923068
4    Voting Classifier  0.879017  0.957831  0.877839           0.877841

Holdout Set Results:
                 Model  F1-Score    PR AUC  Accuracy  Balanced Accuracy
0  Logistic Regression  0.339036  0.242799  0.619517           0.673494
1        Decision Tree  0.282098  0.138673  0.644584           0.597710
2        Random Forest  