In [5]:
import pandas as pd
import seaborn as sns
import numpy as np
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# Load dataset
df = pd.read_csv('bank-additional-full.csv', sep=';')

df.dropna()
# Replace 'unknown' with NaN
df.replace('unknown', pd.NA, inplace=True)
# Drop rows with NaN values in any column
df.dropna(inplace=True)
# Replace 'nonexistent' with NaN
df.replace('nonexistent', pd.NA, inplace=True)
# Drop rows with NaN values in any column
df.dropna(inplace=True)

# Split the data
X=df.drop(columns=['y'])
y=df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sample data
#categorical_columns = {'job': ['housemaid', 'services', 'admin.', 'blue-collar', 'technician', 'management', 'entrepreneur', 'student', 'retired', 'unemployed'],
#        'marital': ['married', 'single', 'divorced'],
#        'education' : ['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course', 'university.degree', 'illeterate'],
#        'default' : ['no'],
#        'housing' : ['no', 'yes'],
#        'loan' : ['no', 'yes'],
#        'contact' : ['telephone', 'cellular'],
#        'day_of_week' : ['mon', 'tue', 'wed', 'thu', 'fri'],
#        'month' : ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
#        'poutcome' : ['failure', 'success'],
#        'y' : ['yes', 'no']
#           }

# Identify categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

#df['y_num'] = df['y'].replace({'yes': 1, 'no': 0})
categorical_columns_y = df['y'].unique().tolist()

# Print the identified columns
print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)
print("Target variable categories:", categorical_columns_y)

# Create transformers
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_columns),
        ('cat', cat_transformer, categorical_columns)
    ])

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Define the hyperparameters to tune
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}

# Create a grid search object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', error_score='raise')

# Fit the model and time it
start_time = time.time()
grid_search.fit(X_train, y_train)
fit_time = (time.time() - start_time) / len(grid_search.cv_results_['mean_fit_time'])

# Get the best estimator
best_model = grid_search.best_estimator_

# Evaluate on training and test sets
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

# Make predictions on the test set
y_pred = best_model.predict(X_test) # This line is added to assign y_pred

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)
print(grid_search.cv_results_)
print(grid_search.scorer_)
print(grid_search.n_splits_)

print(fit_time)     #LR=36s;
print(train_score)  #LR=0.84;
print(test_score)   #LR=0.85;



Categorical columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
Numerical columns: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Target variable categories: ['yes', 'no']
0.8539205155746509
[[629  55]
 [ 81 166]]
              precision    recall  f1-score   support

          no       0.89      0.92      0.90       684
         yes       0.75      0.67      0.71       247

    accuracy                           0.85       931
   macro avg       0.82      0.80      0.81       931
weighted avg       0.85      0.85      0.85       931

{'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
0.8341830843616945
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                       

ValueError: X has 56 features, but ColumnTransformer is expecting 20 features as input.