In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting numpy<1.27,>=1.21 (from pycaret)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m504.0 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<2.2.0 (from pycaret)
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.4.tar.gz (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from pycaret.classification import *
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score

In [3]:
df = pd.read_csv('bank marketing campaign.csv')
df.head()

Unnamed: 0,Age,Job,Marital Status,Education,Has credit in default,Avg. credit balance,Has housing loan,Has personal loan,Contact type,Last contact day,Last contact month,Last contact duration (sec),Number of contacts,Days passed,Previous contacts,Outcome previous campaign,Subscribed deposit
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
# Rename the column called "Subscribed deposit" in df to "target"

df = df.rename(columns={"Subscribed deposit": "target"})

In [5]:
df.columns

Index(['Age', 'Job', 'Marital Status', 'Education', 'Has credit in default',
       'Avg. credit balance', 'Has housing loan', 'Has personal loan',
       'Contact type', 'Last contact day', 'Last contact month',
       'Last contact duration (sec)', 'Number of contacts', 'Days passed',
       'Previous contacts', 'Outcome previous campaign', 'target'],
      dtype='object')

In [6]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# Initialize PyCaret setup
clf = setup(data=train_data, target='target',
            fix_imbalance = True, train_size=0.8, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,target
2,Target type,Binary
3,Target mapping,"no: 0, yes: 1"
4,Original data shape,"(3616, 17)"
5,Transformed data shape,"(5832, 49)"
6,Transformed train set shape,"(5108, 49)"
7,Transformed test set shape,"(724, 49)"
8,Numeric features,7
9,Categorical features,9


In [18]:
# Compare multiple models
best_models = compare_models(include=['lr', 'rf', 'xgboost', 'lightgbm', 'svm'], n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.898,0.889,0.898,0.8809,0.8786,0.3359,0.3767,0.981
lightgbm,Light Gradient Boosting Machine,0.8963,0.8982,0.8963,0.8858,0.8885,0.4267,0.4367,4.684
xgboost,Extreme Gradient Boosting,0.8887,0.886,0.8887,0.8771,0.8804,0.3856,0.3951,0.778
lr,Logistic Regression,0.8406,0.8876,0.8406,0.8984,0.8599,0.443,0.474,1.507
svm,SVM - Linear Kernel,0.5394,0.6719,0.5394,0.8595,0.5913,0.0998,0.1701,0.235


Processing:   0%|          | 0/29 [00:00<?, ?it/s]

In [19]:
# Create a list to store results
results = []

# Evaluate each model
for model in best_models:
	# Get the model name
	model_name = model.__class__.__name__
	print(model_name)

	# Make predictions on test data
	predictions = predict_model(model, data=test_data)

	# Calculate confusion matrix
	cm = confusion_matrix(test_data['target'], predictions['prediction_label'])
	# Calculate metrics
	accuracy = accuracy_score(test_data['target'], predictions['prediction_label'])
	balanced_acc = balanced_accuracy_score(test_data['target'], predictions['prediction_label'])

	# Calculate sensitivity and specificity for each class
	sensitivities = []
	specificities = []

	for i in range(len(cm)):
		tp = cm[i, i]
		fn = np.sum(cm[i, :]) - tp
		fp = np.sum(cm[:, i]) - tp

	tn = np.sum(cm) - (tp + fn + fp)

	sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
	specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

	# Store results
	results.append({
			'Model': model_name,
			'Accuracy': accuracy,
			'Balanced Accuracy': balanced_acc,
			'Avg Sensitivity': sensitivity,
			'Avg Specificity': specificity
	})

RandomForestClassifier


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9006,0.9099,0.9006,0.8798,0.8783,0.2779,0.3239


LGBMClassifier


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8961,0.9211,0.8961,0.883,0.8877,0.3786,0.3863


XGBClassifier


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9028,0.9107,0.9028,0.8938,0.8973,0.4414,0.4458


LogisticRegression


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8298,0.881,0.8298,0.8991,0.8535,0.399,0.4366


SGDClassifier


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.4431,0,0.4431,0.9003,0.5187,0.1078,0.2284


In [20]:
# Convert results to a DataFrame and display
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False, float_format='{:.4f}'.format))

                 Model  Accuracy  Balanced Accuracy  Avg Sensitivity  Avg Specificity
RandomForestClassifier    0.9006             0.5991           0.2143           0.9839
        LGBMClassifier    0.8961             0.6638           0.3673           0.9603
         XGBClassifier    0.9028             0.6989           0.4388           0.9591
    LogisticRegression    0.8298             0.7925           0.7449           0.8401
         SGDClassifier    0.4431             0.6743           0.9694           0.3792
