### Objective:
The objective of this assignment is to run ML models and make publication-worthy figures or tables

We use the German Credit Risk dataset to answer the questions given in this notebook.

### German Credit Risk Data

**About dataset**\
The dataset consists of the following columns:
1. **checking_balance**           : Balance available in the customer's checking account
2. **months_loan_duration**       : Duration of the loan, in months
3. **credit_history**             : Credit history of each customer
4. **purpose**                    : Purpose why loan has been taken
5. **amount**                     : Amount of loan taken
6. **savings_balance**            : Balance in the customer's savings account
7. **employment_duration**        : Duration of employment
8. **percent_of_income**          : Percentage of monthly income
9. **years_at_residence**         : Duration of current residence
10. **age**                       : Age of customer
11. **other_credit**              : Any other credits taken
12. **housing**                   : Type of housing, rent or own
13. **existing_loans_count**      : Existing count of loans
14. **job**                       : Job type
15. **dependents**                : Any dependents on customer
16. **phone**                     : Having phone or not
17. **default**                   : Default status (Target column)

#### Install Libraries

In [16]:
#install the libraries
!pip install xgboost
import pandas as pd
import numpy as np
import matplotlib as plot
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
    VotingClassifier
)

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split



Create train test split of data

In [None]:
def create_train_test_split(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split``. The defaults reproduce the
    previously hard-coded behaviour (``test_size=0.2``, ``random_state=42``),
    so existing two-argument calls are unaffected.

    Parameters
    ----------
    X : array-like
        Feature data, forwarded unchanged to ``train_test_split``.
    y : array-like
        Target values, forwarded unchanged to ``train_test_split``.
    test_size : float or int, default=0.2
        Forwarded as the ``test_size`` argument of ``train_test_split``.
    random_state : int or None, default=42
        Forwarded as the ``random_state`` argument of ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
    

Run model and see results

In [29]:
def train_evaluate_model(model, X, y, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split of ``(X, y)`` and report hold-out metrics.

    If the fitted model exposes ``feature_importances_``, each importance is
    printed and drawn as a horizontal bar chart. The classification report
    for the test split is printed as text and returned as a dictionary.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    X : DataFrame or array-like
        Feature data. When it has a ``columns`` attribute those labels name
        the features; otherwise positional names ``feature_0, feature_1, ...``
        are used.
    y : array-like
        Target values aligned with ``X``.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=42
        Forwarded to ``train_test_split``.

    Returns
    -------
    dict
        ``classification_report(y_test, y_pred, output_dict=True)``.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train the model
    model.fit(X_train, y_train)

    # Not every estimator exposes importances, so this section is optional
    if hasattr(model, 'feature_importances_'):
        feature_importances = model.feature_importances_

        # X may be a plain array without column labels; fall back to
        # positional names instead of failing on ``X.columns``.
        if hasattr(X, 'columns'):
            feature_names = list(X.columns)
        else:
            feature_names = [f"feature_{i}" for i in range(len(feature_importances))]

        # Print and visualize feature importances
        for feature_name, importance in zip(feature_names, feature_importances):
            print(f"{feature_name}: {importance}")

        # ``plt`` is matplotlib.pyplot, imported in the notebook's import cell
        plt.barh(feature_names, feature_importances)
        plt.xlabel('Feature Importance')
        plt.ylabel('Feature')
        plt.show()

    # Make predictions on the held-out rows
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    cls_report = classification_report(y_test, y_pred, output_dict=True)
    return cls_report

Hyperparameter tuning for Gradient Boosting and XGBoost

In [30]:
def tune_boosting_hyperparameters(model, X, y, test_size=0.2, random_state=42,
                                  param_grid=None, cv=4):
    """Grid-search boosting hyperparameters and report hold-out metrics.

    The data is split once; ``GridSearchCV`` is fitted on the train part and
    the resulting search object is used to predict on the test part.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``; it must accept the keys of
        ``param_grid`` as parameters.
    X, y : array-like
        Features and target values.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=42
        Forwarded to ``train_test_split``.
    param_grid : dict or None, default=None
        Grid to search. ``None`` selects the built-in grid over
        ``learning_rate``, ``n_estimators`` and ``max_depth``.
    cv : int, default=4
        Forwarded to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    dict
        ``classification_report(y_test, y_pred, output_dict=True)``.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Default hyperparameter grid, used when the caller supplies none
    if param_grid is None:
        param_grid = {
            'learning_rate': [0.01, 0.05, 0.1],
            'n_estimators': np.arange(100, 500, 100),
            'max_depth': [2, 3, 4, 5, 6, 7]
        }

    model_cv = GridSearchCV(model, param_grid, cv=cv)
    model_cv.fit(X_train, y_train)

    y_pred = model_cv.predict(X_test)

    print("Best Parameters:", model_cv.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate,
    # not a score on the training set, so it is labelled accordingly.
    print("Best CV Score:", model_cv.best_score_)
    print("Test Score:", model_cv.score(X_test, y_test))
    print(classification_report(y_test, y_pred))
    cls_report = classification_report(y_test, y_pred, output_dict=True)
    return cls_report

In [2]:
def train_voting_classifier(model1, model2, model3, model4, model5, X, y, voting):
    """Fit a five-member ``VotingClassifier`` and report hold-out metrics.

    The five estimators are registered, in argument order, under the labels
    ``'gbc'``, ``'bc'``, ``'rf'``, ``'xgb'`` and ``'svc'``. The ensemble is
    fitted on an 80/20 split (``test_size=0.2``, ``random_state=42``) and the
    classification report for the test part is printed.

    Parameters
    ----------
    model1, model2, model3, model4, model5 : estimator
        Ensemble members, passed to ``VotingClassifier`` unchanged.
    X, y : array-like
        Features and target values.
    voting : str
        Forwarded to ``VotingClassifier`` as ``voting``:
        'hard' for majority vote or 'soft' for weighted probabilities.

    Returns
    -------
    dict
        ``classification_report(y_test, y_pred, output_dict=True)``.
    """
    # Pair each fixed label with the estimator supplied in that position
    labels = ('gbc', 'bc', 'rf', 'xgb', 'svc')
    members = (model1, model2, model3, model4, model5)
    ensemble = VotingClassifier(estimators=list(zip(labels, members)), voting=voting)

    # Hold out 20% of the rows for evaluation
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    ensemble.fit(features_fit, target_fit)
    predicted = ensemble.predict(features_eval)

    # Text report for the reader, dictionary report for the caller
    print(classification_report(target_eval, predicted))
    return classification_report(target_eval, predicted, output_dict=True)

In [32]:
def tune_bagging_classifier(X, y, base_classifier=None, random_state=1):
    """Grid-search a ``BaggingClassifier`` and report hold-out metrics.

    The data is split 80/20 (``test_size=0.2``, ``random_state=42``). A 5-fold
    accuracy-scored ``GridSearchCV`` over ``n_estimators``, ``max_samples``
    and ``max_features`` is fitted on the train part, and the best estimator
    found is evaluated on the test part.

    Parameters
    ----------
    X, y : array-like
        Features and target values.
    base_classifier : estimator or None, default=None
        Estimator to bag. ``None`` selects a default ``DecisionTreeClassifier()``.
    random_state : int or None, default=1
        Forwarded to ``BaggingClassifier`` as ``random_state``. The
        train/test split uses its own fixed seed.

    Returns
    -------
    dict
        ``classification_report(y_test, y_pred, output_dict=True)``.
    """
    if base_classifier is None:
        base_classifier = DecisionTreeClassifier()

    # scikit-learn 1.2 renamed ``base_estimator`` to ``estimator`` and 1.4
    # removed the old name. Try the current keyword first, then fall back so
    # that older installations keep working.
    try:
        bagging_clf = BaggingClassifier(estimator=base_classifier, random_state=random_state)
    except TypeError:
        bagging_clf = BaggingClassifier(base_estimator=base_classifier, random_state=random_state)

    param_grid = {
        'n_estimators': [10, 50, 100, 200],
        'max_samples': [0.5, 0.7, 0.9],
        'max_features': [0.5, 0.7, 0.9]
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    grid_search = GridSearchCV(bagging_clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    print("Best Parameters:", best_params)

    y_pred = best_estimator.predict(X_test)

    # Text report for the reader, dictionary report for the caller
    print(classification_report(y_test, y_pred))
    cls_report = classification_report(y_test, y_pred, output_dict=True)
    return cls_report
