# Data undrstanding and data preperation according to crisp model.

### <ins> Importing modules and data </ins>

In [1]:
import warnings

# To filter out all warnings
warnings.filterwarnings("ignore")

In [2]:
# import libraries
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'qt')

In [3]:
# import dataset
df = pd.read_csv('banking_classification/new_train.csv')
df.shape

(32950, 16)

In [4]:
(df['y'] == 'yes').sum() / df.y.count()

0.11265553869499241

### <ins> Exploring the data </ins>

In [5]:
# summary statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32950.0,40.014112,10.403636,17.0,32.0,38.0,47.0,98.0
duration,32950.0,258.127466,258.975917,0.0,103.0,180.0,319.0,4918.0
campaign,32950.0,2.560607,2.752326,1.0,1.0,2.0,3.0,56.0
pdays,32950.0,962.052413,187.951096,0.0,999.0,999.0,999.0,999.0
previous,32950.0,0.174719,0.499025,0.0,0.0,0.0,0.0,7.0


In [6]:
# checking null values
df.isnull().sum()

age            0
job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64

In [7]:
# check duplicetes
df.duplicated().sum()

8

In [8]:
# remove dublicated rows
df.drop_duplicates(inplace= True)
df.duplicated().sum()

0

In [9]:
# check data types
df.dtypes

age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y              object
dtype: object

In [10]:
#seperating df into two dataframes, one for dategorical data and the other for numerical data.
df_categorical = pd.DataFrame()
df_numeric = pd.DataFrame()
columns = df.columns.values
for column in columns:
    if df[column].dtype != np.int64 and df[column].dtype != np.float64: 
        df_categorical[column] = df[column]
    else:
        df_numeric[column] = df[column]

In [11]:
df_numeric.head()

Unnamed: 0,age,duration,campaign,pdays,previous
0,49,227,4,999,0
1,37,202,2,999,1
2,78,1148,1,999,0
3,36,120,2,999,0
4,59,368,2,999,0


In [12]:
df_numeric.hist();

#### <span style="color:red"> Based on the data describtion and the destribution of pdays, it is more effiecient to drop it.</span>


In [13]:
# droping pdays column
df_numeric.drop(columns= 'pdays', axis= 1, inplace= True)

In [14]:
df_numeric.head(10)

Unnamed: 0,age,duration,campaign,previous
0,49,227,4,0
1,37,202,2,1
2,78,1148,1,0
3,36,120,2,0
4,59,368,2,0
5,29,256,2,0
6,26,449,1,0
7,30,126,2,0
8,50,574,1,0
9,33,498,5,0


In [15]:
# number of unique values for eac categorical feature
df_categorical.nunique()

job            12
marital         4
education       8
default         3
housing         3
loan            3
contact         2
month          10
day_of_week     5
poutcome        3
y               2
dtype: int64

In [16]:
# ploting bar charts for each categorical feature
for i in df_categorical.columns:
    plt.figure()
    plt.title('{}'.format(i))
    df.groupby(i)[i].count().plot(kind= 'bar')
    plt.xticks(rotation= 45);

###  <ins> Creating dummy variables </ins>

In [17]:
# dummy variables
columns = df_categorical.columns
df_encoded = pd.get_dummies(df_categorical[columns])
df_encoded.drop(columns= 'y_no', inplace= True)
df_encoded.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_yes
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [18]:
df_encoded.columns

Index(['job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',

In [19]:
# droping redundat columns
redundant_cols = ['job_unknown', 'marital_unknown', 'education_unknown', 'default_unknown', 'housing_unknown',
                 'loan_unknown', 'contact_telephone', 'poutcome_nonexistent']
df_encoded.drop(columns= redundant_cols, axis= 1, inplace= True)

In [20]:
df_encoded.shape

(32942, 46)

In [21]:
# cocatinate the numeric and categorigal dataframes in on dataframe
df = pd.concat([df_numeric, df_encoded], axis= 1)
df.head()

Unnamed: 0,age,duration,campaign,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_success,y_yes
0,49,227,4,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,37,202,2,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,78,1148,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,36,120,2,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,59,368,2,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [22]:
# correlation coesfficients 
sns.heatmap(df.corr());

#### <span style="color:red"> As shown from the heat map, there are some higly correlated features, which could create some noise.</span>

### <ins> Seperating Variables from our target variable. </ins>

In [23]:
X = df.iloc[: , : -1].values
y = df.iloc[:, -1].values

In [24]:
X.shape, y.shape

((32942, 49), (32942,))

In [25]:
X

array([[  49,  227,    4, ...,    1,    0,    0],
       [  37,  202,    2, ...,    1,    1,    0],
       [  78, 1148,    1, ...,    0,    0,    0],
       ...,
       [  54,  131,    4, ...,    0,    0,    0],
       [  29,  165,    1, ...,    0,    0,    0],
       [  35,  544,    3, ...,    0,    0,    0]])

### <ins> Scaling numerical values. </ins>

In [26]:
# It is useful in classification problems to normalize numerical features
# standard scalling based on min. and max. values because most of the features we have are hot encoded.
from sklearn.preprocessing import MinMaxScaler 
columns_to_normalize = [0, 1, 2]
# creating scalar
scaler = MinMaxScaler()
# tranforming data
X[:, columns_to_normalize] = scaler.fit_transform(X[:, columns_to_normalize])

In [27]:
X

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### <ins> Applying Gaussian PCA to reduce dimensions. </ins>

In [28]:
A = X.copy()

from sklearn.decomposition import KernelPCA
###### Define the RBF kernel PCA with the desired gamma (kernel width)
gamma = 15
kpca = KernelPCA(kernel='rbf', gamma=gamma, n_components=None)

###### Fit the kernel PCA model to the data
X_kpca = kpca.fit_transform(A)

###### Compute the explained variance ratio
explained_variance_ratio = kpca.explained_variance_ratio_

In [None]:
# import PCA class
from sklearn.decomposition import PCA
# creating pca object 
pca = PCA(n_components= None)
A = pca.fit_transform(A)
explained_variance = pca.explained_variance_ratio_

In [None]:
explained_variance

In [None]:
cumulative_variance = np.cumsum(explained_variance)
# for noise reduction purpose
# Find the number of components needed to explain at least 95% of the variance
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

In [None]:
print(f"Number of components to explain 95% of variance: {n_components_95}")

In [None]:
# show explained variance vs # of components
plt.figure()
x= np.arange(len(explained_variance))
plt.plot(explained_variance)
fill_value= n_components_95
plt.fill_between(x, explained_variance, where=(x <= fill_value), alpha=0.5, color='grey', label='Fill Area')
plt.xlabel('n-components')
plt.ylabel('explined variance');

In [None]:
B = X.copy()
# for data Visualization purpose
pca = PCA(n_components= 2)
B = pca.fit_transform(B)

In [None]:
B

In [None]:
# plot PC1 vs PC2
plt.scatter(B[:, 0], B[:, 1])
plt.xlabel('PC1')
plt.ylabel('PC2');

#### <span style="color:red"> Based on the graph, we think in SVC as a model .</span>

In [None]:
# reduce the dimensions to n_components that can explain 95% of the data variance
# creating pca object 
pca = PCA(n_components= n_components_95)
X = pca.fit_transform(X)

In [None]:
X.shape

In [None]:
X

# Modeling

In [None]:
# splitting the data into train-set and test-set
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets (e.g., 80% train and 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 0)

In [None]:
X_train_copy1 = X_train.copy()
y_train_copy1 = y_train.copy()

In [None]:
X_train_copy1

### <ins> Logistic Regression Model </ins>

In [None]:
import statsmodels.api as sm

# building the model and fitting the data
log_reg = sm.Logit(y_train_copy1, X_train_copy1).fit()

In [None]:
log_reg.summary()

In [None]:
# predicting x_test values 
# cm between y_predicted and y_test
# drop nonsignificant features 
# then fit, predict, cm , compare

### <ins> Applying logistic Regression before eliminating non-significant variables. </ins>

In [None]:
# performing predictions on the test dataset
yhat = log_reg.predict(X_test)

In [None]:
# Building LogisticRegression model using sklearn package
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# making classifier object from a class
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_copy1, y_train_copy1)

In [None]:
# predict x_test values
y_pred = logistic_regression.predict(X_test)

In [None]:
# compare y_pred, y_test
y_pred, y_test

In [None]:
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Classification Report:\n{classification_rep}')

### <ins> Applying logistic Regression after eliminating non-significant variables. </ins>

In [None]:
X_train_copy2 = X_train.copy()
y_train_copy2 = y_train.copy()

In [None]:
X_train_copy2

In [None]:
# indices for variables with low significant effect on the target variable
non_significant_variables_indices = [5,7,10,13,14]

In [None]:
# remove those variables
X_train_del = np.delete(X_train_copy2, non_significant_variables_indices, 1)

In [None]:
X_train_copy2.shape, X_train_del.shape

In [None]:
# making classifier object from a class
logistic_regression2 = LogisticRegression()
logistic_regression2.fit(X_train_del, y_train_copy1)

In [None]:
# predict x_test values
y_pred2 = logistic_regression.predict(X_test)

In [None]:
# compare y_pred, y_test
y_pred2, y_test

In [None]:
accuracy2 = accuracy_score(y_test, y_pred2)
confusion2 = confusion_matrix(y_test, y_pred2)
classification_rep2 = classification_report(y_test, y_pred2)

print(f'Accuracy: {accuracy2}')
print(f'Confusion Matrix:\n{confusion2}')
print(f'Classification Report:\n{classification_rep2}')

### <ins> Artificial NuralNetwork(ANN) </ins>

In [None]:
X_train.shape

In [None]:
X_train_copy3 = X_train.copy()
y_train_copy3 = y_train.copy()
X_train_copy3.shape, y_train_copy3.shape

In [None]:
# required libraries
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.callbacks import History
from tensorflow.keras.utils import plot_model 

#### <span style="color:red"> Try without eliminating nonsignificant values..</span>

In [None]:
# Initializing ANN model
model = Sequential()

In [None]:
# Input layer (specify input_dim as the number of features)
model.add(Dense(units=64, activation='relu', input_dim=X_train_copy3.shape[1]))

# Hidden layers
model.add(Dense(units=32, activation='relu'))

# Hidden layers
#model.add(Dense(units=16, activation='relu'))

# Output layer (use 'sigmoid' activation for binary classification)
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define a history object to store training metrics
history = History()

# Train the model
model.fit(X_train_copy3, y_train_copy3, epochs=10, batch_size= 100,validation_data=(X_test, y_test), callbacks=[history])

In [None]:
# Plot training and validation loss over epochs
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend();

In [None]:
# Make predictions on the test set
ANN_y_pred = model.predict(X_test)
ANN_y_pred = (y_pred > 0.5)  # Convert probabilities to binary predictions

# Evaluate the model
ANN_confusion = confusion_matrix(y_test, ANN_y_pred)
ANN_accuracy = accuracy_score(y_test, ANN_y_pred)
print(f'Accuracy: {ANN_accuracy}')
print("Confusion Matrix:\n", ANN_confusion)

### <ins> Support Vector Machine Model </ins>

In [None]:
X_train_copy4 = X_train.copy()
y_train_copy4 = y_train.copy()
X_train_copy4.shape, y_train_copy4.shape

In [None]:
from sklearn.svm import SVC

# Create an SVC model with an RBF kernel (you can choose a different kernel)
svc_model = SVC(kernel='rbf', C=1.0)

# Train the model
svc_model.fit(X_train_copy4, y_train_copy4)

# Make predictions on the test set
SVC_y_pred = svc_model.predict(X_test)

# Evaluate the model
SVC_accuracy = accuracy_score(y_test, SVC_y_pred)
SVC_confusion = confusion_matrix(y_test, SVC_y_pred)

print("Accuracy:", SVC_accuracy)
print("Confusion Matrix:\n", SVC_confusion)

In [None]:
# comparing confusion matrix
plt.figure()
plt.subplot(2,2,1)
plt.title('Logistic Regression_1')
sns.heatmap(confusion, annot= True)

plt.subplot(2,2,2)
plt.title('Logistic Regression_2')
sns.heatmap(confusion2, annot= True)

plt.subplot(2,2,3)
plt.title('ANN')
sns.heatmap(ANN_confusion, annot= True)

plt.subplot(2,2,4)
plt.title('SVC')
sns.heatmap(SVC_confusion, annot= True);

In [None]:
# comparing accuracy between models
plt.figure()
models_name = ["LogisticRegression_1", "LogisticRegression_2", "ANN", "SVC"]
values = [accuracy, accuracy2, ANN_accuracy, SVC_accuracy]
plt.bar(models_name, values)
plt.ylabel('Accuracy')
plt.xticks(rotation= 0)