In [62]:
# Import necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # For Logistic Regression model
from sklearn.tree import DecisionTreeClassifier      # For Decision Tree model
from sklearn.model_selection import GridSearchCV      # For hyperparameter tuning
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, f1_score, recall_score  # For evaluation metrics
import numpy as np                                   # For numerical operations
import pandas as pd

In [23]:
df = pd.read_csv("dataset/bank-additional-full.csv", sep = ";")
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [52]:
y = df['y']
X = df.drop(columns='y')


In [53]:
CatCols = X.select_dtypes(include=['object','category']).columns
NumCols = X.select_dtypes(include=['number']).columns

In [54]:
ohe = pd.get_dummies(X[CatCols], drop_first=True)
X = pd.concat([X, ohe], axis= 1)
X = X.drop(columns=CatCols)
X

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,True,False,False,False,True,False,False,False,True,False
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,True,False,False,False,True,False,False,False,True,False
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,True,False,False,False,True,False,False,False,True,False
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,True,False,False,False,True,False,False,False,True,False
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,True,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,False,True,False,False,False,False,False,False,True,False
41184,46,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,False,True,False,False,False,False,False,False,True,False
41185,56,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,False,True,False,False,False,False,False,False,True,False
41186,44,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,False,True,False,False,False,False,False,False,True,False


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [56]:
scale = StandardScaler()

X_train[NumCols] = scale.fit_transform(X_train[NumCols])
X_test[NumCols] = scale.fit_transform(X_test[NumCols])

## Placeholders to Replace

### Finalized Datasets:
- Replace `X_train_final` with the finalized training feature set.
- Replace `X_train_sample1_final`, `X_train_sample2_final`, `X_train_sample3_final` with the finalized feature sets for the sample datasets.
- Replace `y_train_final` with the finalized target variable for the training dataset.
- Replace `y_train_sample1_final`, `y_train_sample2_final`, `y_train_sample3_final` with the finalized target variables for the sample datasets.

In [63]:
def test_model(X_train, X_test, y_train, y_test):
    # Initialize models
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier()
    }

    # Store results
    results = {}

    for model_name, model in models.items():
        # Train Model
        model.fit(X_train, y_train)

        # Predictions
        y_pred = model.predict(X_test)

        # Calculate Metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label='yes')  # Change 'yes' if your positive class is different
        recall = recall_score(y_test, y_pred, pos_label='yes')
        f1 = f1_score(y_test, y_pred, pos_label='yes')
        auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

        # Store results in a dictionary
        results[model_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'auc': auc
        }

        results = pd.DataFrame(results)

    return results, 

# Example of how to call the function
# results = test_model(X_train, X_test, y_train, y_test)
# print(results)

In [68]:
def test_model2(X_train, X_test, y_train, y_test):
    # Initialize models
    models = {
        "Logistic Regression": LogisticRegression(class_weight='balanced'),
        "Decision Tree": DecisionTreeClassifier(class_weight='balanced')
    }

    # Store results
    results = {}

    for model_name, model in models.items():
        # Train Model
        model.fit(X_train, y_train)

        # Predictions
        y_pred = model.predict(X_test)

        # Calculate Metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label='yes')  # Change 'yes' if your positive class is different
        recall = recall_score(y_test, y_pred, pos_label='yes')
        f1 = f1_score(y_test, y_pred, pos_label='yes')
        auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

        # Store results in a dictionary
        results[model_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'auc': auc
        }

        results = pd.DataFrame(results)

    return results, 

# Example of how to call the function
# results = test_model(X_train, X_test, y_train, y_test)
# print(results)

In [76]:
baseline_model1_model2 = test_model(X_train, X_test, y_train, y_test)

In [77]:
baseline_model1_model2

Unnamed: 0,Logistic Regression,Decision Tree
accuracy,0.9126,0.878935
precision,0.680139,0.465287
recall,0.423132,0.500718
f1_score,0.521701,0.482353
auc,0.938359,0.713834


In [78]:
model7 = test_model2(X_train, X_test, y_train, y_test)

In [79]:
model8 = test_model2(X_train, X_test, y_train, y_test)