In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

def _classification_scores(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for binary predictions.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned with ``y_true``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }


# Load the dataset
file_path = 'bankadditional.csv'
data = pd.read_csv(file_path, delimiter=';')

# Display basic information about the dataset.
# DataFrame.info() writes the summary to stdout itself and returns None, so
# it is called directly instead of being wrapped in print() (which would
# emit a stray "None" line after the summary).
print("Dataset Information:")
data.info()

# Display the first few rows of the dataset
print("First Few Rows of the Dataset:")
print(data.head())

# Check for missing values
print("Missing Values in the Dataset:")
print(data.isnull().sum())

# Drop duplicate rows if any
data = data.drop_duplicates()

# Identify categorical and numerical columns
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
numerical_features = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Separate the raw features from the binary target (1 for 'yes', 0 otherwise)
features = data.drop(columns=['y'])
target = (data['y'] == 'yes').astype(int)

# Split the RAW data first, so that nothing fitted below (scaler statistics,
# one-hot categories, feature ranking) ever sees the held-out rows.
# Same test_size and random_state as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessing on the training rows only, then apply the fitted
# transformation to the test rows. Categories that only occur in the test
# rows are encoded as all-zero columns because of handle_unknown='ignore'.
X_train_full = preprocessor.fit_transform(X_train_raw)
X_test_full = preprocessor.transform(X_test_raw)

# Whole dataset transformed with the train-fitted preprocessor. Not used for
# model fitting below; kept so code that reads `data_preprocessed` still works.
data_preprocessed = preprocessor.transform(features)

# Train a Random Forest on the training rows to get feature importances
importance_rf = RandomForestClassifier(random_state=42)
importance_rf.fit(X_train_full, y_train)
importances = importance_rf.feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

# Select top 20 features (column indices into the preprocessor's output)
top_k = 20
top_features = indices[:top_k]

# Restrict both splits to the selected columns
X_train = X_train_full[:, top_features]
X_test = X_test_full[:, top_features]

# Initialize the models
logreg = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
mlp = MLPClassifier(random_state=42, max_iter=1000, learning_rate_init=0.001)

# Train and evaluate Logistic Regression
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
logreg_scores = _classification_scores(y_test, y_pred_logreg)
print("Logistic Regression Scores:")
print(logreg_scores)

# Train and evaluate Random Forest
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_scores = _classification_scores(y_test, y_pred_rf)
print("Random Forest Scores:")
print(rf_scores)

# Train and evaluate Neural Network
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
mlp_scores = _classification_scores(y_test, y_pred_mlp)
print("Neural Network Scores:")
print(mlp_scores)

# Grid Search for Random Forest (3-fold CV on the training split, F1 scoring)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='f1')
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best_rf = best_rf.predict(X_test)
best_rf_scores = _classification_scores(y_test, y_pred_best_rf)
print("Best Random Forest Scores after Grid Search:")
print(best_rf_scores)

# Save the best Random Forest model, preprocessor, and top features.
# To score new data: preprocessor.transform(...) -> select top_features
# columns -> best_rf.predict(...).
joblib.dump(best_rf, 'best_rf_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(top_features, 'top_features.pkl')

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.id

['top_features.pkl']