Loading the Libraries

In [None]:
import pandas as pd
import urllib.request
import zipfile
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

Downloading the dataset

In [12]:
# Location of the bank-additional dataset archive on the UCI repository and the
# local file name it is saved under.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
filename = "bank-additional.zip"

# Download only when no valid archive is already on disk, so that re-running
# the notebook does not hit the network every time.
# zipfile.is_zipfile() returns False for a missing file as well as for a
# truncated/corrupt one, so an interrupted earlier download is fetched again.
if not zipfile.is_zipfile(filename):
    urllib.request.urlretrieve(url, filename)

# Unzip into the current working directory (the CSV below is read from the
# extracted "bank-additional/" folder).
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall(".")

# The file is semicolon-delimited, hence sep=";".
data = pd.read_csv("bank-additional/bank-additional-full.csv", sep=";")

In [13]:
# Preview the first five rows of the dataset (five is also head()'s default).
print(data.head(n=5))



   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

In [14]:
# Print every column name: the input features plus the target column 'y'.
print(list(data.columns))

['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']


In [15]:
# Basic info about the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

Data Preprocessing

In [16]:
# One-hot encode the categorical columns (job, marital, education, ...) so the
# model only receives numeric/indicator inputs.
categorical_features = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'month', 'day_of_week', 'poutcome']
data = pd.get_dummies(data, columns=categorical_features)

# Convert the target variable y to binary numerical format: 'yes' -> 1, 'no' -> 0.
# Series.map() silently turns any value that is missing from the mapping into
# NaN, so validate the result before overwriting the column instead of handing
# a corrupted target to the model later on.
target_encoded = data['y'].map({'yes': 1, 'no': 0})
if target_encoded.isna().any():
    raise ValueError(
        "Column 'y' contains values other than 'yes'/'no'; "
        "it cannot be mapped to a binary target."
    )
data['y'] = target_encoded

# Standardise the numeric features so that they all contribute on a comparable
# scale to the model's learning process (particularly done for the SVM).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in this notebook, so test-set statistics leak into the
# training features. Prefer fitting the scaler on the training rows only
# (e.g. inside a scikit-learn Pipeline) once the cells are restructured.
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous',
                    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                    'euribor3m', 'nr.employed']
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

Splitting Data into the Training and Test Sets

In [None]:
# Separate the features (X) from the target (y).
X = data.drop(columns='y')  # every column except the target
y = data['y']               # target

# Hold out 20% of the rows as the test set (80% train); the fixed random_state
# keeps the split reproducible across runs.
# NOTE(review): the split is not stratified; consider stratify=y if the classes
# turn out to be imbalanced - confirm against the target distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
