# Data understanding and data preperation according to crisp model.

### <ins> Importing modules and data </ins>

In [1]:
# import libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

get_ipython().run_line_magic('matplotlib', 'qt')

In [2]:
# import dataset
df = pd.read_csv('banking_classification/new_train.csv')
df.shape

(32950, 16)

### <ins> Exploring the data </ins>

In [3]:
# summary statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32950.0,40.014112,10.403636,17.0,32.0,38.0,47.0,98.0
duration,32950.0,258.127466,258.975917,0.0,103.0,180.0,319.0,4918.0
campaign,32950.0,2.560607,2.752326,1.0,1.0,2.0,3.0,56.0
pdays,32950.0,962.052413,187.951096,0.0,999.0,999.0,999.0,999.0
previous,32950.0,0.174719,0.499025,0.0,0.0,0.0,0.0,7.0


In [4]:
# checking null values
df.isnull().sum()

age            0
job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64

In [5]:
# check duplicetes
df.duplicated().sum()

8

In [6]:
# remove dublicated rows
df.drop_duplicates(inplace= True)
df.duplicated().sum()

0

In [7]:
# check data types
df.dtypes

age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y              object
dtype: object

In [8]:
#seperating df into two dataframes, one for dategorical data and the other for numerical data.
df_categorical = pd.DataFrame()
df_numeric = pd.DataFrame()
columns = df.columns.values
for column in columns:
    if df[column].dtype != np.int64 and df[column].dtype != np.float64: 
        df_categorical[column] = df[column]
    else:
        df_numeric[column] = df[column]

In [9]:
df_numeric.head()

Unnamed: 0,age,duration,campaign,pdays,previous
0,49,227,4,999,0
1,37,202,2,999,1
2,78,1148,1,999,0
3,36,120,2,999,0
4,59,368,2,999,0


In [10]:
# droping pdays column
df_numeric.drop(columns= 'pdays', axis= 1, inplace= True)

In [11]:
# number of unique values for eac categorical feature
df_categorical.nunique()

job            12
marital         4
education       8
default         3
housing         3
loan            3
contact         2
month          10
day_of_week     5
poutcome        3
y               2
dtype: int64

In [12]:
# ploting bar charts for each categorical feature
for i in df_categorical.columns:
    plt.figure()
    plt.title('{}'.format(i))
    df.groupby(i)[i].count().plot(kind= 'bar')
    plt.xticks(rotation= 45);

###  <ins> Creating dummy variables </ins>

In [13]:
# dummy variables
columns = df_categorical.columns
df_encoded = pd.get_dummies(df_categorical[columns])
df_encoded.drop(columns= 'y_no', inplace= True)
df_encoded.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_yes
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [14]:
df_encoded.columns

Index(['job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',

In [15]:
# droping redundat columns
redundant_cols = ['job_unknown', 'marital_unknown', 'education_unknown', 'default_unknown', 'housing_unknown',
                 'loan_unknown', 'contact_telephone', 'poutcome_nonexistent']
df_encoded.drop(columns= redundant_cols, axis= 1, inplace= True)

# Modeling naives bayes using the script above as base

To filter out all warnings and import the necessary libaries

In [16]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

Loading in the dataset

In [18]:
df = pd.read_csv('banking_classification/new_train.csv')

Separating numerical and categorical features is necessary for preventing errors when applying the naives bayes model

In [19]:
df_categorical = df.select_dtypes(include=['object'])
df_numeric = df.select_dtypes(exclude=['object'])

Dropping the 'pdays' column from the numeric features is necessary as it's being removed due to its high correlation and redundancy in the dataset.

In [20]:
df_numeric.drop(columns='pdays', axis=1, inplace=True)

Creating dummy variables for categorical features is essential to convert categorical data into a numerical format

In [21]:
df_encoded = pd.get_dummies(df_categorical, drop_first=True)

Dropping redundant columns is necessary to eliminate features that provide little or no additional information to improve model performance and reduce computational complexity

In [22]:
redundant_cols = ['job_unknown', 'marital_unknown', 'education_unknown', 'default_unknown', 'housing_unknown',
                 'loan_unknown', 'contact_telephone', 'poutcome_nonexistent']
df_encoded.drop(columns=redundant_cols, axis=1, inplace=True)

Concatenating numerical and categorical dataframes is essential to combine both types of features into a single dataset for modeling

In [23]:
df = pd.concat([df_numeric, df_encoded], axis=1)

Normalizing numerical features is crucial to scale them to a consistent range, preventing features with larger magnitudes from dominating the model

In [24]:
scaler = MinMaxScaler()
columns_to_normalize = ['age', 'duration', 'campaign', 'previous']
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

In this code, PCA is applied to reduce the dimensionality of the dataset while retaining 90% of the explained variance. I choose 90% because from tests in my python envoirment it appeared the accuracy was the highest like this.

In [25]:
pca = PCA(n_components=0.90)
X = pca.fit_transform(df.drop(columns='y_yes'))
y = df['y_yes']

code creates a Gaussian Naive Bayes classifier a machine learning model to perform classification tasks based on the assumption of feature independence and normal distribution of data.

In [26]:
nb_classifier = GaussianNB()

Performing cross-validation to estimate accuracy is important for assessing the performance of a machine learning model by splitting the dataset into multiple sets, training and testing the model on different sets, and computing the average accuracy to obtain a more robust performance estimate.

In [27]:
accuracy_scores = cross_val_score(nb_classifier, X, y, cv=5, scoring='accuracy')

to get the mean accuracy score involves averaging the accuracy scores obtained from multiple cross-validation runs. the outcome of this results in a singel more accurate value

In [28]:
mean_accuracy = np.mean(accuracy_scores)

prints the actual single store stated above

In [29]:
print("Mean Accuracy:", mean_accuracy)

Mean Accuracy: 0.8872230652503793
