# Bank Marketing Classification Task.

## Introduction

*Information*

The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required to assess if the product (bank term deposit) would be subscribed ('yes') or not ('no').

*Classification Goal*

The goal is to predict if the client will subscribe (yes/no) to a term deposit (variable y).

## Preparation

### Imports

In [None]:
import os
import pathlib
from functools import partial
from itertools import product
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import squarify 

from scipy.stats import chi2_contingency
from scipy import stats


from statsmodels.stats.outliers_influence import variance_inflation_factor


from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, confusion_matrix, precision_score


from aws.aws_funcs import upload_to_s3

### Useful Functions

#### AWS

In [None]:
# Local path prefix under which rendered plots are written before being uploaded.
path_to_local_plots = 'path/to/file'


def upload_to_aws_png(file_name):
    """
    Hand a locally saved PNG to ``upload_to_s3``.

    Parameters:
    - file_name (str): Plot name without extension. The local file is
      ``<path_to_local_plots><file_name>.png`` (plain string concatenation, no separator is inserted)
      and the second argument passed to ``upload_to_s3`` is ``path/png/<file_name>.png``.
    """
    local_png = f'{path_to_local_plots}{file_name}.png'
    remote_png = f'path/png/{file_name}.png'
    upload_to_s3(local_png, remote_png)

def upload_plot(plot_name):
    """
    Save the current matplotlib figure as a 300-dpi PNG and upload it.

    Parameters:
    - plot_name (str): Plot name without extension; used for both the local file name and the upload.
    """
    local_png = f'{path_to_local_plots}{plot_name}.png'
    plt.savefig(local_png, dpi=300)
    upload_to_aws_png(plot_name)

In [None]:
# Local path prefix under which exported CSV files are written before being uploaded.
path_to_local_csv = 'path/to/file'


def upload_to_aws_csv(file_name):
    """
    Hand a locally saved CSV to ``upload_to_s3``.

    Parameters:
    - file_name (str): Table name without extension. The local file is
      ``<path_to_local_csv><file_name>.csv`` (plain string concatenation, no separator is inserted)
      and the second argument passed to ``upload_to_s3`` is ``path/csv/<file_name>.csv``.
    """
    local_csv = f'{path_to_local_csv}{file_name}.csv'
    remote_csv = f'path/csv/{file_name}.csv'
    upload_to_s3(local_csv, remote_csv)

def upload_csv(df, csv_name, index=False):
    """
    Write a DataFrame to a local CSV file and upload it.

    Parameters:
    - df (pandas.DataFrame): Table to export.
    - csv_name (str): File name without extension; used for both the local file name and the upload.
    - index (bool, optional): Forwarded to ``DataFrame.to_csv``; set to True to keep the index
      (e.g. for ``describe()`` output). Defaults to False.
    """
    local_csv = f'{path_to_local_csv}{csv_name}.csv'
    df.to_csv(local_csv, index=index)
    upload_to_aws_csv(csv_name)

#### Visualization

In [None]:
def count_categorical(df, column, target=0):
    '''
    Plot the category frequencies of ``column`` and their split by the ``'y'`` column.

    The function draws a two-row figure:
    1. A treemap (``squarify``) whose tiles are sized by ``df[column].value_counts()``.
    2. A grouped (non-stacked) bar chart with the number of rows for every ``(column, 'y')`` combination.

    Parameters:
    - df (pandas.DataFrame): The data to visualize; must contain ``column`` and a column named ``'y'``.
    - column (str): The name of the categorical column to visualize.
    - target (int, optional): Accepted for backward compatibility only. The function never reads it;
      the bar chart is always split by the hard-coded ``'y'`` column. Defaults to 0.

    The function returns nothing. The figure it creates stays the current matplotlib figure, so a
    subsequent ``plt.savefig`` (e.g. through ``upload_plot``) saves it.
    '''
    _fig, (treemap_ax, bar_ax) = plt.subplots(2, 1, figsize=(17, 14), dpi=300)

    category_sizes = df[column].value_counts().reset_index()
    category_sizes.columns = [column, 'counts']

    squarify.plot(
        sizes=category_sizes['counts'],
        label=category_sizes[column],
        alpha=0.6,
        color=plt.cm.tab20c.colors,
        ax=treemap_ax,
    )
    # A treemap has no meaningful axes. With the axis switched off, axis labels and ticks are not
    # drawn at all, so only the title is set here.
    treemap_ax.axis('off')
    treemap_ax.set_title(f'Treemap of {column.capitalize()}s', pad=20)

    # Rows: categories of `column`; columns: values of 'y'; cells: number of observations.
    temp_df = df.groupby([column, 'y']).size().unstack(fill_value=0)
    # One colour per 'y' value, sampled from the coolwarm colormap.
    bar_colors = [plt.cm.coolwarm(i) for i in np.linspace(0.1, 0.85, temp_df.shape[1])]
    temp_df.plot(kind='bar', stacked=False, ax=bar_ax, color=bar_colors)
    bar_ax.tick_params(axis='x', rotation=45)

In [None]:
def summary_numerical(df, col, q_min, q_max, upload=1, engineered=0):
    """
    Plot a 2x2 distribution summary for one numerical column and optionally upload it.

    The grid contains a histogram with KDE, a normal Q-Q plot, a boxplot, and a line plot of the
    column's quantiles from ``q_min`` towards ``q_max`` in steps of 0.01, which helps to see where
    the upper tail takes off.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the data.
    - col (str): The numerical column to analyze.
    - q_min (float): First quantile of the quantile line plot.
    - q_max (float): End of the quantile range, passed to ``numpy.arange`` as the stop value.
    - upload (int, optional): If truthy, the figure is saved and uploaded via ``upload_plot``. Defaults to 1.
    - engineered (int, optional): If truthy, the uploaded file is named ``distribution_<col>_engineered``
      instead of ``distribution_<col>``. Defaults to 0.

    No return value; the function plots and optionally uploads the visualizations.
    """
    fig, ax = plt.subplots(2, 2, figsize=(15, 8))

    sns.histplot(data=df, x=col, kde=True, ax=ax[0, 0])

    stats.probplot(x=df[col], dist=stats.norm, plot=ax[0, 1])

    sns.boxplot(data=df, x=col, ax=ax[1, 0])

    pts = df[col].quantile(q=np.arange(q_min, q_max, 0.01))
    sns.lineplot(x=pts.index, y=pts, ax=ax[1, 1])

    # Titles are used verbatim: str.capitalize() lower-cases everything after the first
    # character and would render "QQ plot" as "Qq plot".
    titles_name = [["Histogram", "QQ plot"], ["Boxplot", "Outlier"]]

    for i, j in product(range(2), repeat=2):
        ax[i, j].set_title(titles_name[i][j], pad=20)

    plt.suptitle(f"Distribution of: {col.capitalize()}", fontsize=15)
    plt.tight_layout()
    if upload:
        suffix = '_engineered' if engineered else ''
        upload_plot(f'distribution_{col}{suffix}')

#### Statistical

In [None]:
def cat_to_target(df, cat, target='y'):
    """
    Run a Chi-squared test of independence between a categorical column and the target column.

    A contingency table of ``df[cat]`` against ``df[target]`` is built with ``pandas.crosstab`` and
    passed to ``scipy.stats.chi2_contingency``.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the data.
    - cat (str): Column name of the categorical variable.
    - target (str, optional): Column name of the target variable. Defaults to 'y'.

    Returns:
    - chi2 (float): The Chi-squared test statistic.
    - p_value (float): The p-value of the test.
    - dof (int): Degrees of freedom of the test.
    - expected (numpy.ndarray): Expected frequencies computed from the marginal sums of the table.
    """
    contingency = pd.crosstab(index=df[cat], columns=df[target])
    statistic, p_val, degrees, expected_freq = chi2_contingency(contingency)
    return (statistic, p_val, degrees, expected_freq)

#### Modeling

In [None]:
def train_models(features, labels, models, n_folds=5):
    """
    Cross-validates several models and then fits each of them on the full training data.

    For every model in ``models`` the function runs shuffled stratified k-fold cross-validation
    (``random_state=42``), records accuracy, recall, f1 score and precision on each validation fold
    and stores the per-model averages in a DataFrame. Afterwards each model is refitted on all of
    ``features``/``labels``, so the returned estimators have seen the whole training set rather
    than only the training part of the last fold.

    Parameters:
    - features (pandas.DataFrame): The feature variables.
    - labels (pandas.Series): The target variable (binary, as required by the default
      recall/f1/precision scorers).
    - models (dict): A dictionary of model name and model instance pairs. The instances are
      fitted in place.
    - n_folds (int, optional): Number of cross-validation folds. Defaults to 5.

    Returns:
    - results_df (pandas.DataFrame): One row per model (indexed by model name) with the average
      cross-validated accuracy, recall, f1 score and precision.
    - models (dict): The same dictionary that was passed in, with every model fitted on the full data.
    """
    metric_funcs = {
        'accuracy': accuracy_score,
        'recall': recall_score,
        'f1 score': f1_score,
        'precision': precision_score,
    }
    results = {metric: [] for metric in metric_funcs}

    s_fold = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=42
    )

    for model in models.values():
        fold_scores = {metric: [] for metric in metric_funcs}

        for train_idx, val_idx in s_fold.split(features, labels):
            X_train, X_val = features.iloc[train_idx], features.iloc[val_idx]
            y_train, y_val = labels.iloc[train_idx], labels.iloc[val_idx]

            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            for metric, score in metric_funcs.items():
                fold_scores[metric].append(score(y_val, preds))

        for metric, scores in fold_scores.items():
            results[metric].append(np.mean(scores))

        # The loop above leaves the model fitted on the last fold's training split only.
        # Refit on everything so that later evaluation on held-out data uses all training rows.
        model.fit(features, labels)

    results_df = pd.DataFrame(results)
    results_df.index = models.keys()
    return results_df, models


def test_models(features, labels, models):
    """
    Scores already fitted models on one dataset.

    Each model predicts labels for ``features``; the predictions are compared with ``labels`` using
    accuracy, recall, f1 score and precision, and the scores are collected in a DataFrame.

    Parameters:
    - features (pandas.DataFrame): The feature variables to predict on.
    - labels (pandas.Series): The true labels.
    - models (dict): A dictionary where keys are model names and values are trained model instances.

    Returns:
    - results_df (pandas.DataFrame): One row per model (indexed by model name) with the columns
      'accuracy', 'recall', 'f1 score' and 'precision'.
    """
    scorers = {
        'accuracy': accuracy_score,
        'recall': recall_score,
        'f1 score': f1_score,
        'precision': precision_score,
    }

    rows = []
    for estimator in models.values():
        predicted = estimator.predict(features)
        rows.append({metric: scorer(labels, predicted) for metric, scorer in scorers.items()})

    results_df = pd.DataFrame(rows, columns=list(scorers))
    results_df.index = models.keys()
    return results_df


def confusion_matrix_plot(features, labels, models, upload=0):
    """
    Draws one confusion-matrix heatmap per model and optionally uploads each plot.

    Parameters:
    - features (pandas.DataFrame): The feature variables to predict on.
    - labels (pandas.Series): The true labels.
    - models (dict): A dictionary of model name and fitted model instance pairs.
    - upload (optional): If falsy (default 0), nothing is uploaded. If truthy, every plot is passed
      to ``upload_plot`` under the name ``<upload>_<model name>``, i.e. the value also serves as
      the file-name prefix (for example ``upload='base'``).
    """
    for model_name in models:
        predicted = models[model_name].predict(features)
        matrix = confusion_matrix(labels, predicted)

        _, axis = plt.subplots(figsize=(8, 4), dpi=150)
        sns.heatmap(matrix, annot=True, fmt='d', cmap='coolwarm_r', ax=axis, alpha=0.5)
        axis.set_xlabel('Predictions', labelpad=20)
        axis.set_ylabel('True', labelpad=20)
        axis.set_title(model_name, pad=20)

        if not upload:
            continue
        upload_plot(f'{upload}_{model_name}')

### Get the Data and the First Impression

In [None]:
# Load the raw data (path is relative to the working directory) and preview the first rows.
path_data = pathlib.Path('data/bank_data.csv')
# read_csv accepts path-like objects directly, so no manual string conversion is needed.
df = pd.read_csv(path_data)
df.head(20)

In [None]:
# Export the first 20 raw rows as 'raw_data.csv' and upload the file.
upload_csv(df.head(20), 'raw_data')

We can immediately drop the column *Unnamed: 0*, since it only duplicates the row index and carries no useful information.

In [None]:
# Drop the redundant 'Unnamed: 0' column (when present) and preview the result.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

In [None]:
# Export a 20-row preview of the frame after 'Unnamed: 0' was removed (file name: 'without_null').
upload_csv(df.head(20), 'without_null')

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column, as a two-column table.
df_nulls = (
    df.isnull()
    .sum()
    .rename_axis('Column')
    .reset_index(name='Number of Nulls')
)

df_nulls

We can see that there are no *NULL* values in our df. Next, we separate the numerical and categorical columns to make their analysis more productive.

In [None]:
# Export the null-count table.
upload_csv(df_nulls, 'is_there_null')

In [None]:
# Keep only the columns with a numeric dtype.
numerical_df = df.select_dtypes(include='number')
numerical_df.head()

In [None]:
# Export a 20-row preview of the numerical columns.
upload_csv(numerical_df.head(20), 'numerical_df')

In [None]:
# Summary statistics of the numerical columns.
numerical_df.describe()

In [None]:
# index=True keeps the statistic names (the index of describe()) in the exported file.
upload_csv(numerical_df.describe(), 'numerical_describe', index=True)

In [None]:
# Every column that is not in numerical_df is treated as categorical.
categorical_df = df.drop(columns=numerical_df.columns)
categorical_df

In [None]:
# Export a 20-row preview of the categorical columns.
upload_csv(categorical_df.head(20), 'categorical_df')

In [None]:
# Summary statistics of the categorical columns.
categorical_df.describe()

In [None]:
# index=True keeps the statistic names (the index of describe()) in the exported file.
upload_csv(categorical_df.describe(), 'categorical_describe', index=True)

### EDA

#### Numerical

In [None]:
# One 2x2 summary figure per numerical column; the quantile plot covers the range from 0.85 to 1.
# upload keeps its default (1), so every figure is also saved and uploaded.
for column in numerical_df.columns:
    summary_numerical(numerical_df, column, 0.85, 1)

In [None]:
# %%writefile /Users/konstantinsokolovskiy/Desktop/My_Big_Project/final/projects/projects/project_3/data/code_snippets/correlation_matrix.txt
# (The line above is a disabled cell magic that would write this cell to a text file.)
# Annotated heatmap of the pairwise correlations between the numerical columns.
fig, ax = plt.subplots(figsize = (5, 5), dpi=200)

sns.heatmap(
    data=numerical_df.corr(),
    vmin=-1,  # fix the colour scale to the full correlation range [-1, 1]
    vmax=1,
    linecolor="white",
    linewidth=0.5,
    annot=True,
    fmt=".2f",
    ax=ax
)
plt.xticks(rotation=45)
plt.title('Correlations'); 
upload_plot('correlation_matrix')

##### Multicollinearity

In [None]:
# %%writefile /Users/konstantinsokolovskiy/Desktop/My_Big_Project/final/projects/projects/project_3/data/code_snippets/multicollinearity.txt
# (The line above is a disabled cell magic that would write this cell to a text file.)
# Variance inflation factor of every numerical column.
from statsmodels.tools.tools import add_constant

# variance_inflation_factor regresses one column on the others exactly as given, without adding
# an intercept. An explicit constant column is therefore prepended, so each auxiliary
# regression has an intercept.
vif_exog = add_constant(numerical_df, prepend=True, has_constant='add')

vif_data = pd.DataFrame()
vif_data["feature"] = numerical_df.columns

# Column 0 of vif_exog is the constant; the VIF is reported for the real features only.
vif_data["VIF"] = [variance_inflation_factor(vif_exog.values, i)
                   for i in range(1, vif_exog.shape[1])]

vif_data

In [None]:
# Export the VIF table.
upload_csv(vif_data, 'vif_data')

#### Categorical

In [None]:
# Pre-bind the categorical frame so that the calls below only have to name the column.
count_categorical_features = partial(count_categorical, df=categorical_df)

##### Target 

In [None]:
# Class balance of the target column itself.
# NOTE(review): count_categorical, as defined in this notebook, never reads `target`, so target=1 has no effect.
count_categorical_features(column='y', target=1)

##### Job

In [None]:
# Treemap and per-target bar chart for `job`; the figure is then saved and uploaded.
count_categorical_features(column='job')
upload_plot('distribution_job')

In [None]:
# Chi-squared test of independence between `job` and the target `y`; display the p-value.
chi2_job, p_job, _, _ = cat_to_target(categorical_df, 'job')
p_job

##### Marital

In [None]:
# Treemap and per-target bar chart for `marital`; the figure is then saved and uploaded.
count_categorical_features(column='marital')
upload_plot('distribution_marital')

In [None]:
# Chi-squared test of independence between `marital` and the target `y`; display the p-value.
chi2_marital, p_marital, _, _ = cat_to_target(categorical_df, 'marital')
p_marital

##### Education

In [None]:
# Treemap and per-target bar chart for `education`; the figure is then saved and uploaded.
count_categorical_features(column='education')
upload_plot('distribution_education')

In [None]:
# Chi-squared test of independence between `education` and the target `y`; display the p-value.
chi2_education, p_education, _, _ = cat_to_target(categorical_df, 'education')
p_education

##### Default

In [None]:
# Treemap and per-target bar chart for `default`; the figure is then saved and uploaded.
count_categorical_features(column='default')
upload_plot('distribution_default')

In [None]:
# Chi-squared test of independence between `default` and the target `y`; display the p-value.
chi2_default, p_default, _, _ = cat_to_target(categorical_df, 'default')
p_default

##### Housing

In [None]:
# Treemap and per-target bar chart for `housing`; the figure is then saved and uploaded.
count_categorical_features(column='housing')
upload_plot('distribution_housing')

In [None]:
# Chi-squared test of independence between `housing` and the target `y`; display the p-value.
chi2_housing, p_housing, _, _ = cat_to_target(categorical_df, 'housing')
p_housing

##### Loan

In [None]:
# Treemap and per-target bar chart for `loan`; the figure is then saved and uploaded.
count_categorical_features(column='loan')
upload_plot('distribution_loan')

In [None]:
# Chi-squared test of independence between `loan` and the target `y`; display the p-value.
chi2_loan, p_loan, _, _ = cat_to_target(categorical_df, 'loan')
p_loan

##### Contact

In [None]:
# Treemap and per-target bar chart for `contact`; the figure is then saved and uploaded.
count_categorical_features(column='contact')
upload_plot('distribution_contact')

In [None]:
# Chi-squared test of independence between `contact` and the target `y`; display the p-value.
chi2_contact, p_contact, _, _ = cat_to_target(categorical_df, 'contact')
p_contact

##### Month

In [None]:
# Treemap and per-target bar chart for `month`; the figure is then saved and uploaded.
count_categorical_features(column='month')
upload_plot('distribution_month')

In [None]:
# Chi-squared test of independence between `month` and the target `y`; display the p-value.
chi2_month, p_month, _, _ = cat_to_target(categorical_df, 'month')
p_month

##### Poutcome

In [None]:
# Treemap and per-target bar chart for `poutcome`; the figure is then saved and uploaded.
count_categorical_features(column='poutcome')
upload_plot('distribution_poutcome')

In [None]:
# Chi-squared test of independence between `poutcome` and the target `y`; display the p-value.
chi2_poutcome, p_poutcome, _, _ = cat_to_target(categorical_df, 'poutcome')
p_poutcome

#### Results of Statistical Tests

In [None]:
# List the categorical column names for reference.
categorical_df.columns

In [None]:
# Collect the Chi-squared p-values in one table.
# 'is_it' is 1 when the p-value is below the 0.05 significance level and 0 otherwise.
p_result = pd.DataFrame(
    {
        'category': ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome'],
        'p_value': [p_job, p_marital, p_education, p_default, p_housing, p_loan, p_contact, p_month, p_poutcome],
    }
)
p_result['is_it'] = (p_result['p_value'] < 0.05).astype('int64')
p_result

In [None]:
# Export the table of Chi-squared p-values.
upload_csv(p_result, 'chi2_test')

After conducting the Chi-squared test, we see that only one categorical feature, *default*, has no statistically significant influence on the result (p-value ≥ 0.05), so we can get rid of it.

## Modelling

### Base Model. No Feature Engineering.

#### Preparing

In [None]:
# Work on a copy so that the baseline experiment does not modify `df`.
df_for_base = df.copy()

In [None]:
# Hold out 20% of the rows as a test set, stratified by the target.
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(
    df_for_base.drop('y', axis=1),
    df_for_base['y'],
    test_size=0.2,
    random_state=42,
    stratify=df_for_base['y'],
)

# Positional indices are used by the cross-validation code, so start again from 0.
X_train_base.reset_index(drop=True, inplace=True)
X_test_base.reset_index(drop=True, inplace=True)

# Encode the target as 0/1.
y_train_base = y_train_base.map({'no': 0, 'yes': 1}).reset_index(drop=True)
y_test_base = y_test_base.map({'no': 0, 'yes': 1}).reset_index(drop=True)

# One-hot encode every categorical column except the last one of categorical_df
# (assumed to be the target 'y', which is not part of the feature frames).
X_train_base_final = pd.get_dummies(
        X_train_base,
        columns=categorical_df.columns[:-1],
        drop_first=True)

# Encode the test set with all levels and then align it to the training columns. Encoding both
# sets independently with drop_first=True produces different columns (or drops a different
# baseline level) whenever a category is missing from one of the splits.
X_test_base_final = pd.get_dummies(
        X_test_base,
        columns=categorical_df.columns[:-1],
        drop_first=False
    ).reindex(columns=X_train_base_final.columns, fill_value=0)

#### Fit the Models

In [None]:
# Baseline estimators with (almost) default settings. The dictionary keys become the row labels
# of the result tables and part of the confusion-matrix file names.
base_models = {
    'Logistic Regregression': LogisticRegression(max_iter=5000,),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'SVC': SVC()
}

In [None]:
# Module-level shuffled, stratified 5-fold splitter. train_models builds its own splitter with
# the same settings; this one is passed to cross_val_score later in the notebook.
n_folds = 5
s_fold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=42
)

In [None]:
# Cross-validated scores on the training set, then scores of the fitted models on the test set.
base_results_train, trained_base_models = train_models(X_train_base_final, y_train_base, base_models)
base_results_test = test_models(X_test_base_final, y_test_base, trained_base_models)


In [None]:
# Display the cross-validated baseline scores.
base_results_train

In [None]:
# Display the baseline scores on the test set.
base_results_test

In [None]:
# Export both baseline tables; index=True keeps the model names.
upload_csv(base_results_train, csv_name='base_results_train', index=True)
upload_csv(base_results_test, csv_name='base_results_test', index=True)

In [None]:
# Feature-importance plot of the fitted baseline XGBoost model.
fig, ax = plt.subplots(figsize=(12, 12), dpi=200)
plot_importance(trained_base_models['XGBoost'], ax=ax);
upload_plot('base_feature_importance')

In [None]:
# One confusion matrix per baseline model on the test set. upload='base' is truthy, so every
# plot is uploaded, and the value is also used as the file-name prefix ('base_<model name>').
confusion_matrix_plot(X_test_base_final, y_test_base, trained_base_models, upload='base')

### Top Features

In [None]:
# Importance scores (importance_type='weight') of the baseline XGBoost model, keyed by feature name.
feature_importance = trained_base_models['XGBoost'].get_booster().get_score(importance_type='weight')

# (feature, score) pairs, highest score first.
sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

# Plot the scores against their rank to see where the curve flattens out.
fig = plt.figure(figsize=(15, 7), dpi=200)
x = np.arange(len(sorted_importance))
y = [elem[1] for elem in sorted_importance]
plt.plot(x, y, marker='o');
plt.xlabel('Feature Number', labelpad=20)
plt.ylabel('Feature Importance', labelpad=20)
plt.grid(True)
upload_plot('base_feature_importance_1')

In [None]:
# Names of the 20 highest-scoring features.
num_features = 20
top_features = [pair[0] for pair in sorted_importance][:num_features]

In [None]:
# Fresh, unfitted estimators for the top-feature experiment (SVC is not included here).
base_models_top_features = {
    'Logistic Regregression': LogisticRegression(max_iter=5000,),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
}

In [None]:
# Same train/test evaluation as for the baseline, restricted to the selected top features.
base_results_top_features_train, trained_base_models_top_features = train_models(
    X_train_base_final[top_features], 
    y_train_base, 
    base_models_top_features
    )
base_results_top_features_test = test_models(X_test_base_final[top_features], y_test_base, trained_base_models_top_features)

In [None]:
# Display the cross-validated scores of the top-feature models.
base_results_top_features_train

In [None]:
# Display the test-set scores of the top-feature models.
base_results_top_features_test

In [None]:
# Export both top-feature tables; index=True keeps the model names.
upload_csv(base_results_top_features_train, 'base_results_top_features_train', index=True)
upload_csv(base_results_top_features_test, 'base_results_top_features_test', index=True)

### Grid Search

In [None]:
# Unfitted estimators to tune; the keys must match the keys of param_grid below.
base_models_grid = {
    'Logistic Regregression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
}

# Search space per model.
param_grid = {
    'Logistic Regregression': {
        'solver': ['liblinear', 'lbfgs'],
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'max_iter': [5_000]
    },

    'Random Forest': {
        'n_estimators': [10, 50, 100, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 4, 6]
    },

    'XGBoost': {
        'max_depth': [5, 6],
        'learning_rate': [0.001, 0.01],
        'n_estimators': [300, 1000, 2000],
        'subsample': [0.7, 0.85, 1]
        }
}
# Best parameter set per model name, filled by the loop below.
best_params = {}
for name, model in base_models_grid.items():

    # Exhaustive search with 3-fold cross-validation, selecting by precision, on the top features only.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid[name], scoring='precision', cv=3)
    grid_search.fit(X_train_base_final[top_features], y_train_base)
    best_params[name] = grid_search.best_params_
    print(f'For model {name} the best params: {grid_search.best_params_} \n\n')

In [None]:
# One column per model, one row per parameter; parameters a model does not have are shown as '-'.
base_best_params = pd.DataFrame(best_params).fillna('-')
upload_csv(base_best_params, 'base_best_params', index=True)

In [None]:
# Rebuild every estimator with the best parameters found by the grid search.
base_models_best = {
    'Logistic Regregression': LogisticRegression(**best_params['Logistic Regregression']),
    'Random Forest': RandomForestClassifier(**best_params['Random Forest']),
    'XGBoost': XGBClassifier(**best_params['XGBoost']),
}

base_models_best_results_train, trained_base_models_best = train_models(
    X_train_base_final[top_features],
    y_train_base,
    base_models_best
    )

# Evaluate the models returned by train_models, as the other experiments in this notebook do
# (train_models returns the dictionary it was given, now holding fitted estimators).
base_models_best_results_test = test_models(X_test_base_final[top_features], y_test_base, trained_base_models_best)

In [None]:
# Display the cross-validated scores of the tuned models.
base_models_best_results_train

In [None]:
# Display the test-set scores of the tuned models.
base_models_best_results_test

In [None]:
# Export both tables of the tuned models; index=True keeps the model names.
upload_csv(base_models_best_results_train, 'base_models_best_results_train', index=True)
upload_csv(base_models_best_results_test, 'base_models_best_results_test', index=True)

# Feature Engineering

In [None]:
# Copy of the numerical columns; the column names are listed as a reminder of what to engineer.
numerical_df_engineered = numerical_df.copy()
numerical_df_engineered.columns

In [None]:
# All feature engineering below is applied to this copy, leaving `df` untouched.
df_engineered = df.copy()

### Age

In [None]:
# Distribution of `age` before the transformation (upload=0: the figure is only displayed).
summary_numerical(df_engineered, 'age', 0.55, 1, 0)

In [None]:
# Replace `age` by log(1 + age) and plot it again; upload=1 and engineered=1 store the figure
# as 'distribution_age_engineered'.
df_engineered['age'] = np.log1p(df_engineered['age'])
summary_numerical(df_engineered, 'age', 0.55, 1, 1, 1)

### Balance

In [None]:
# Distribution of `balance` before the transformation (upload=0: the figure is only displayed).
summary_numerical(df_engineered, 'balance', 0.55, 1, 0)

In [None]:
# Keep only the rows with a balance below 30,000 (larger balances are presumably treated as outliers).
df_engineered = df_engineered[df_engineered['balance'] < 30_000]

In [None]:
# %%writefile /Users/konstantinsokolovskiy/Desktop/My_Big_Project/final/projects/projects/project_3/data/code_snippets/balance_engineered.txt
# (The line above is a disabled cell magic that would write this cell to a text file.)
# Shift the balances so that they are strictly positive, then take the logarithm.
min_balance = abs(df_engineered['balance'].min())  # magnitude of the smallest balance
# Drop the rows whose balance equals -min_balance (the minimum itself when it is negative).
df_engineered = df_engineered[df_engineered['balance'] != -min_balance]
df_engineered['balance'] = df_engineered['balance'] + min_balance + 1
# log1p adds another 1 before taking the logarithm.
df_engineered['balance'] = np.log1p(df_engineered['balance'])

In [None]:
# Distribution of the transformed `balance`; uploaded as 'distribution_balance_engineered'.
summary_numerical(df_engineered, 'balance', 0.95, 1, 1, 1)

### Duration

In [None]:
# Distribution of `duration` before the transformation (upload=0: the figure is only displayed).
summary_numerical(df_engineered, 'duration', 0.55, 1, 0)

In [None]:
# Target distribution among the rows with a duration above 2,000, to judge what filtering them out costs.
df_engineered[df_engineered['duration'] > 2_000]['y'].value_counts()

In [None]:
# Keep only the rows with a duration of at most 2,000.
df_engineered = df_engineered[df_engineered['duration'] <= 2_000]

In [None]:
# Replace `duration` by log(1 + duration) and plot it; uploaded as 'distribution_duration_engineered'.
df_engineered['duration'] = np.log1p(df_engineered['duration'])
summary_numerical(df_engineered, 'duration', 0.95, 1, 1, 1)

### Campaign

In [None]:
# Distribution of `campaign` before binning (upload=0: the figure is only displayed).
summary_numerical(df_engineered, 'campaign', 0.55, 1, 0)

In [None]:
# Frequency of every campaign value in the frame that is actually being engineered (rows removed
# by the earlier balance/duration filters are excluded), matching the `pdays`/`previous` cells.
df_engineered['campaign'].value_counts()

In [None]:
def define_campaign(x):
    """
    Map a `campaign` value to its bucket label.

    Returns '1' for 1, '2' for 2, '3-4' for 3 or 4 and '5+' for any value of 5 or more.
    Every other value (for example 0, 3.5 or NaN) yields None.
    """
    if x >= 5:
        return '5+'
    if x in (3, 4):
        return '3-4'
    for exact, label in ((1, '1'), (2, '2')):
        if x == exact:
            return label
    return None

# Replace the numeric `campaign` value by its bucket label ...
df_engineered['campaign'] = df_engineered['campaign'].apply(define_campaign)

# ... and test the binned feature against the target `y`.
_, p_value, _, _ = cat_to_target(df_engineered, 'campaign')

In [None]:
# Display the p-value of the Chi-squared test for the binned `campaign`.
p_value

### Pdays

In [None]:
# Distribution of `pdays` before binning (upload=0: the figure is only displayed).
summary_numerical(df_engineered, 'pdays', 0.55, 1, 0)

In [None]:
# Frequency of every `pdays` value, used to choose the bucket boundaries.
df_engineered['pdays'].value_counts()

In [None]:
def define_pdays(x):
    """
    Map a `pdays` value to its bucket label.

    Returns:
    - 'no' for the value -1 (presumably the marker for "no previous contact" - verify against
      the data description),
    - '< 200' for values strictly between -1 and 200,
    - '>= 200' for everything else. This includes 200 itself, so that the boundary agrees with
      the labels; values below -1 and NaN also fall through to this label.
    """
    if x == -1:
        return 'no'
    if -1 < x < 200:
        return '< 200'
    return '>= 200'

# Replace the numeric `pdays` value by its bucket label and test the binned feature against `y`.
df_engineered['pdays'] = df_engineered['pdays'].apply(define_pdays)
_, p_value, _, _ = cat_to_target(df_engineered, 'pdays')

In [None]:
# Display the p-value of the Chi-squared test for the binned `pdays`.
p_value

### Previous

In [None]:
# Distribution of `previous` before binning (upload=0: the figure is only displayed).
summary_numerical(df_engineered, 'previous', 0.55, 1, 0)

In [None]:
# Frequency of every `previous` value, used to choose the bucket boundaries.
df_engineered['previous'].value_counts()

In [None]:
def define_previous(x):
    """
    Map a `previous` value to its bucket label.

    Returns '0' for 0, '1-3' for 1, 2 or 3 and '>3' for every other value
    (which also covers negative and non-integer inputs).
    """
    if x == 0:
        return '0'
    return '1-3' if x in (1, 2, 3) else '>3'

# Replace the numeric `previous` value by its bucket label ...
df_engineered['previous'] = df_engineered['previous'].apply(define_previous)

# ... and test the binned feature against the target `y`.
_, p_value, _, _ = cat_to_target(df_engineered, 'previous')

In [None]:
# Display the p-value of the Chi-squared test for the binned `previous`.
p_value

# Re-Train with fixed Parameters

#### Preparing

In [None]:
# Hold out 20% of the engineered rows as a test set, stratified by the target.
df_for_engineered = df_engineered.copy()
X_train_engineered, X_test_engineered, y_train_engineered, y_test_engineered = train_test_split(
    df_for_engineered.drop('y', axis=1),
    df_for_engineered['y'],
    test_size=0.2,
    random_state=42,
    stratify=df_for_engineered['y'],
)

# Positional indices are used by the cross-validation code, so start again from 0.
X_train_engineered.reset_index(drop=True, inplace=True)
X_test_engineered.reset_index(drop=True, inplace=True)

# Encode the target as 0/1.
y_train_engineered_final = y_train_engineered.map({'no': 0, 'yes': 1}).reset_index(drop=True)
y_test_engineered_final = y_test_engineered.map({'no': 0, 'yes': 1}).reset_index(drop=True)

# The original categorical features plus the three numerical features that were binned above.
columns_cat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'campaign', 'pdays', 'previous', 'poutcome']

X_train_engineered_final = pd.get_dummies(
        X_train_engineered,
        columns=columns_cat,
        drop_first=True)

# Encode the test set with all levels and then align it to the training columns. Encoding both
# sets independently with drop_first=True produces different columns (or drops a different
# baseline level) whenever a category is missing from one of the splits.
X_test_engineered_final = pd.get_dummies(
        X_test_engineered,
        columns=columns_cat,
        drop_first=False
    ).reindex(columns=X_train_engineered_final.columns, fill_value=0)

#### Fit the Models

In [None]:
# Estimators for the engineered data set; XGBoost uses fixed hyper-parameters.
engineered_models = {
    'Logistic Regregression': LogisticRegression(max_iter=5000),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(
        n_estimators=600,
        subsample=0.85,
        learning_rate=0.001530897464421364,
        max_depth=4,
    ),
}

In [None]:
# Cross-validated scores on the engineered training set, then scores on the engineered test set.
engineered_results_train, trained_engineered_models = train_models(X_train_engineered_final, y_train_engineered_final, engineered_models)
engineered_results_test = test_models(X_test_engineered_final, y_test_engineered_final, trained_engineered_models)

In [None]:
# Display the cross-validated scores on the engineered data.
engineered_results_train

In [None]:
# Display the test-set scores on the engineered data.
engineered_results_test

In [None]:
# Export both tables for the engineered data; index=True keeps the model names.
upload_csv(engineered_results_train, 'engineered_results_train', index=True)
upload_csv(engineered_results_test, 'engineered_results_test', index=True)

## Optuna Optimisation

In [None]:
import optuna
from optuna.samplers import TPESampler

In [None]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    """
    Optuna objective: mean cross-validated precision of an XGBoost classifier.

    The trial suggests ``n_estimators`` (300-1500, step 100), ``subsample`` (0.75-0.9, step 0.05),
    ``learning_rate`` (0.001-0.05, log scale) and ``max_depth`` (3-7). A classifier with these
    values is scored on the engineered training data with ``cross_val_score`` (no ``cv`` argument
    is passed, so its default splitting is used) and ``scoring='precision'``.

    Parameters:
    - trial (optuna.trial.Trial): The trial that supplies the hyper-parameter values.

    Returns:
    - float: The mean precision over the cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', low=300, high=1500, step=100),
        'subsample': trial.suggest_float('subsample', low=0.75, high=0.9, step=0.05),
        'learning_rate': trial.suggest_float('learning_rate', low=0.001, high=0.05, log=True),
        'max_depth': trial.suggest_int('max_depth', low=3, high=7, step=1),
    }

    fold_precisions = cross_val_score(
        estimator=XGBClassifier(**params),
        X=X_train_engineered_final,
        y=y_train_engineered_final,
        scoring='precision',
    )
    return fold_precisions.mean()

# Maximise the objective with the TPE sampler. The sampler is seeded so that re-running the
# cell explores the same sequence of hyper-parameters (42 matches the random_state used for
# the data splits in this notebook).
study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='maximize'
)

study.optimize(objective, n_trials=100)

In [None]:
# Summary of the finished study. The number of iterations is read from the study itself
# instead of repeating the n_trials constant.
print("---Bayesian Optimization---")
print(f'Number of iterations: {len(study.trials)}')
print(f"Best trial index: {study.best_trial.number}")
print(f"Best score: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_params}")

In [None]:
# Final XGBoost model with hard-coded hyper-parameters.
# NOTE(review): the values are presumably copied from an earlier Optuna run; they are not read
# from study.best_params, so re-running the study does not update them.
xgb_clf = XGBClassifier(
        n_estimators=300,
        subsample=0.9,
        learning_rate=0.0024,
        max_depth=6
    )
xgb_clf.fit(X_train_engineered_final, y_train_engineered_final)

In [None]:
# Mean precision of the final model over the shuffled stratified folds defined by `s_fold`.
scores = cross_val_score(xgb_clf, X_train_engineered_final, y_train_engineered_final, cv=s_fold, scoring='precision')
print(scores.mean())

In [None]:
# Accuracy of the final model on the training set (share of correctly predicted labels).
preds = xgb_clf.predict(X_train_engineered_final)
accuracy_score(y_train_engineered_final, preds)

In [None]:
# Accuracy of the final model on the held-out test set (share of correctly predicted labels).
preds = xgb_clf.predict(X_test_engineered_final)
accuracy_score(y_test_engineered_final, preds)