In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import pickle
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

Load the dataset from a CSV file



In [2]:
# Read the semicolon-delimited dataset into a DataFrame.
# NOTE(review): the file carries a saved row index ("Unnamed: 0") that stays in
# the frame and ends up among the model features -- confirm this is intended.
data = pd.read_csv('dataset.csv', delimiter=';')

In [3]:
# Preview the first five raw rows (head() defaults to n=5).
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


 Preprocessing: Convert categorical variables to dummy variables
 

In [4]:
# One-hot encode every categorical column, dropping the first level of each.
# The yes/no target `y` is encoded too and becomes the single column `y_yes`.
data = pd.get_dummies(data=data, drop_first=True)

In [5]:
# Preview the first five rows after one-hot encoding (head() defaults to n=5).
data.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,y_yes
0,58,2143,5,261,1,-1,0,False,False,False,...,False,False,True,False,False,False,False,False,True,False
1,44,29,5,151,1,-1,0,False,False,False,...,False,False,True,False,False,False,False,False,True,False
2,33,2,5,76,1,-1,0,False,True,False,...,False,False,True,False,False,False,False,False,True,False
3,47,1506,5,92,1,-1,0,True,False,False,...,False,False,True,False,False,False,False,False,True,False
4,33,1,5,198,1,-1,0,False,False,False,...,False,False,True,False,False,False,False,False,True,False


Separate the features (independent variables) from the target (dependent variable)

In [6]:
# Feature matrix: every column except the encoded target.
X = data.drop(columns=['y_yes'])
# Target vector: the encoded target column `y_yes`.
y = data.loc[:, 'y_yes']

Split the data into training and test sets

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Standardize the features so that no feature dominates the others because of its scale

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Show the scaled training matrix (NumPy abbreviates large arrays when printing).
print(X_train_scaled)

[[ 0.00651495 -0.16938112 -0.09821817 ... -0.20779147 -0.1860456
   0.47280581]
 [ 0.75993671  0.01784754  1.70342239 ... -0.20779147 -0.1860456
   0.47280581]
 [ 0.10069267  0.8206814   1.82353176 ... -0.20779147 -0.1860456
   0.47280581]
 ...
 [-0.65272909 -0.01457014 -0.09821817 ... -0.20779147 -0.1860456
   0.47280581]
 [-0.74690681 -0.39564332 -1.05909314 ... -0.20779147 -0.1860456
   0.47280581]
 [-0.27601821 -0.46378662  0.62243805 ... -0.20779147 -0.1860456
   0.47280581]]


In [10]:
# Candidate classifiers, keyed by the display name that is also used as the
# pickle file stem when the fitted models are saved.
# Estimators with internal randomness get a fixed seed (the same value used for
# the train/test split) so accuracies and saved models are reproducible on a
# fresh "Restart & Run All". LogisticRegression and KNeighborsClassifier are
# left at their defaults, which do not need a seed.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Neural Network": MLPClassifier(random_state=42)
}


In [11]:
# Fit each candidate on the scaled training split, report its accuracy on the
# held-out test split, and persist the fitted estimator as "<name>.pkl".
for name in models:
    model = models[name]
    # fit() returns the estimator itself, so prediction can be chained onto it.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy:.2f}')
    with open(f'{name}.pkl', 'wb') as file:
        pickle.dump(model, file)

Logistic Regression Accuracy: 0.90
Decision Tree Accuracy: 0.88
Random Forest Accuracy: 0.90
Gradient Boosting Accuracy: 0.90
KNN Accuracy: 0.89
Neural Network Accuracy: 0.90




Predict the outcome for a new input record

In [12]:
# A single hand-written record to score. Each value is a one-element list so the
# mapping can be passed straight to pd.DataFrame as a one-row frame.
input_data = dict(
    age=[40],
    job=["technician"],
    marital=["single"],
    education=["secondary"],
    default=["no"],
    balance=[1500],
    housing=["yes"],
    loan=["no"],
    contact=["cellular"],
    day=[15],
    month=["may"],
    duration=[120],
    campaign=[2],
    pdays=[100],
    previous=[1],
    poutcome=["success"],
)

Preprocess the input record (one-hot encode and scale)

In [13]:

# Build a one-row frame from the raw input record.
input_df = pd.DataFrame(input_data)

# One-hot encode WITHOUT drop_first. On a single row every categorical column
# has exactly one level, so drop_first=True would discard all of the dummy
# columns and the record would silently be scored as the baseline category
# for every categorical feature.
input_encoded = pd.get_dummies(input_df)

# Align to the exact feature columns (and order) the models were trained on.
# Dummies for baseline levels that training dropped are discarded here, and
# training columns absent from the input are filled with 0.
# NOTE(review): this includes the "Unnamed: 0" row-index column that is part of
# X; the input has no value for it, so it is 0 -- confirm that column should be
# a feature at all.
input_df = input_encoded.reindex(columns=X.columns, fill_value=0)

# Reuse the scaler that was fitted on X_train so the input is scaled with the
# same statistics the models saw during training. Refitting a new scaler on the
# full dataset (test rows and target included) would scale the input
# inconsistently with the training data.
input_scaled = scaler.transform(input_df)
print('done')

done


In [14]:

# Reload each persisted estimator from "<name>.pkl" and score the prepared input row.
# pickle.load can execute arbitrary code: only load files this notebook wrote itself.
for name in models:
    with open(name+'.pkl', 'rb') as file:
        model = pickle.load(file)
    predictions = model.predict(input_scaled)
    print("Predicted outcome of algorithm "+name+":", predictions)
        


Predicted outcome of algorithm Logistic Regression: [False]
Predicted outcome of algorithm Decision Tree: [False]
Predicted outcome of algorithm Random Forest: [False]
Predicted outcome of algorithm Gradient Boosting: [False]
Predicted outcome of algorithm KNN: [False]
Predicted outcome of algorithm Neural Network: [False]
