**Naive Bayes**

In [7]:
import os

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Load the dataset.
# The location can be overridden through the BANKCHURNERS_CSV environment variable so the
# notebook is not tied to one user's home directory; when the variable is unset the
# original path is used, so existing setups behave exactly as before.
file_path = os.environ.get(
    'BANKCHURNERS_CSV',
    '/Users/jean-paulhendriksen/Documents/Data Driven Decision Making in Business/DataMining/BankChurners.csv',
)
data = pd.read_csv(file_path)

# Drop irrelevant columns: the client identifier and the two 'Naive_Bayes_Classifier_...'
# columns are removed so they are not used as features.
data = data.drop(columns=['CLIENTNUM', 
                          'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                          'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'])

# Define features (X) and target (y)
X = data.drop(['Attrition_Flag'], axis=1)
y = data['Attrition_Flag']

# Encode categorical (object-dtype) features as integer codes.
# Each fitted encoder is kept in `label_encoders`, keyed by column name, so the codes
# can be mapped back to the original categories later.
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# Encode the target variable.
# The fitted encoder is kept (instead of being discarded) so the integer classes shown
# in the report and confusion matrix can be translated back:
# target_encoder.classes_[i] is the original label of class i.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Split data into train and test sets (80% / 20%, fixed seed for reproducibility).
# NOTE(review): the split is not stratified on y; with imbalanced classes consider
# passing stratify=y. Left unchanged here because it would alter every downstream result.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance classes.
# Only the training portion is resampled; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Create and train the Naive Bayes model on the balanced training data
nb_model = GaussianNB()
nb_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the (original, unbalanced) test set
y_pred = nb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.7655478775913129

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.72      0.50       327
           1       0.93      0.77      0.85      1699

    accuracy                           0.77      2026
   macro avg       0.66      0.75      0.67      2026
weighted avg       0.85      0.77      0.79      2026


Confusion Matrix:
 [[ 235   92]
 [ 383 1316]]


**Logistic Regression Model**

In [9]:
# LogisticRegression is imported here because no other visible cell brings it into
# scope; without it this cell raises NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardize the features.
# The scaler is fitted on the balanced training data only and then applied to the test
# set, so no test-set statistics are used for scaling.
# NOTE: this replaces X_train_balanced and X_test with their standardized versions, so
# any cell executed after this one sees scaled data. Re-running the Naive Bayes cell
# rebuilds the unscaled versions.
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test)

# Fit the Logistic Regression model; max_iter is set to 2000 to give the solver more
# iterations to converge.
log_reg = LogisticRegression(max_iter=2000, random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model (these names overwrite the Naive Bayes results of the same name)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.8553800592300099

Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.75      0.63       327
           1       0.95      0.88      0.91      1699

    accuracy                           0.86      2026
   macro avg       0.74      0.81      0.77      2026
weighted avg       0.88      0.86      0.86      2026


Confusion Matrix:
 [[ 245   82]
 [ 211 1488]]
