In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

import warnings
# Install a global "ignore" filter: every warning raised after this point is suppressed.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point at real
# problems — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [None]:
# Read the semicolon-delimited CSV; `bank_user` stays as the untouched raw frame.
bank_user = pd.read_csv('bank-full.csv', delimiter=';')

# All subsequent cleaning is done on an independent copy of the raw data.
df = bank_user.copy()
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the working frame.
df.info()

In [None]:
# (number of rows, number of columns) of the working frame.
df.shape

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# Number of missing (NaN/None) entries per column.
df.isna().sum()

### Effect of each numeric variable on the dependent variable 'y', assessed with the correlation matrix

In [7]:
# Encode the target as 0/1: 'yes' -> 1, anything else -> 0.
# The values are derived from the untouched raw frame (`bank_user`) rather than
# from df['y'] itself, so re-running this cell is safe: recoding df['y'] in place
# would compare the already-encoded integers with 'yes' on a second run and
# silently turn every label into 0. Assignment aligns on the index, so this also
# works after rows have been filtered out of `df`.
# The explicit 'int64' keeps the column visible to the int64-based column
# selection used for the correlation matrix.
df['y'] = bank_user['y'].eq('yes').astype('int64')

In [8]:
# Keep only the 64-bit integer columns; after the 0/1 recoding of 'y' this
# should include the target as well.
df_numeric = df.select_dtypes(include=['int64'])

In [None]:
# Preview the first rows of the integer-only frame.
df_numeric.head()

In [10]:
# Pairwise correlations between the integer columns (pandas' default method, Pearson).
correlation_matrix = df_numeric.corr()

# Display each column's correlation with the target so the reader can see the
# evidence behind the decision about which variables to drop; previously the
# matrix was computed but never shown.
correlation_matrix['y'].sort_values(ascending=False)

#### Remove the variables judged to have no effect on the dependent variable (`campaign` and `day`).

In [11]:
# Remove the two columns that the note above describes as having no effect on 'y'.
columns_to_drop = ['campaign', 'day']
df = df.drop(columns=columns_to_drop)

# Examine each variable separately

## Age

In [None]:
# Distinct values present in the 'age' column.
df['age'].unique()

In [None]:
# Number of rows per 'age' value, most frequent first.
df['age'].value_counts()

## Job

In [None]:
# Distinct categories present in the 'job' column.
df['job'].unique()

In [None]:
# Number of rows per 'job' category, most frequent first.
df['job'].value_counts()

In [16]:
# Discard the rows whose job is recorded as 'unknown'.
job_is_known = df['job'].ne('unknown')
df = df.loc[job_is_known]

## Marital

In [None]:
# Distinct categories present in the 'marital' column.
df['marital'].unique()

## Education

In [None]:
# Distinct categories present in the 'education' column.
df['education'].unique()

In [None]:
# Number of rows per 'education' category, most frequent first.
df['education'].value_counts()

In [20]:
# Discard the rows whose education level is recorded as 'unknown'.
education_is_known = df['education'].ne('unknown')
df = df.loc[education_is_known]

## Default

In [None]:
# Distinct values present in the 'default' column.
df['default'].unique()

In [None]:
# Number of rows per 'default' value, most frequent first.
df['default'].value_counts()

## Housing

In [None]:
# Distinct values present in the 'housing' column.
df['housing'].unique()

In [None]:
# Number of rows per 'housing' value, most frequent first.
df['housing'].value_counts()

## Loan

In [None]:
# Distinct values present in the 'loan' column.
df['loan'].unique()

In [None]:
# Number of rows per 'loan' value, most frequent first.
df['loan'].value_counts()

## Contact

In [None]:
# Distinct categories present in the 'contact' column.
df['contact'].unique()

In [None]:
# Number of rows per 'contact' category, most frequent first.
df['contact'].value_counts()

The rows in the `unknown` contact category are reassigned to `cellular` and `telephone` in proportion to those two categories' current counts in the dataset, and the dataset is then updated with the new values.

In [None]:
# Reassign rows whose contact type is 'unknown' to 'cellular' / 'telephone' in
# proportion to the current counts of those two known categories.
contact_counts = df['contact'].value_counts()

# .get(..., 0) keeps the cell re-runnable: after the first run no 'unknown'
# category is left, and plain indexing would raise KeyError.
unknown_count = contact_counts.get('unknown', 0)
cellular_count = contact_counts.get('cellular', 0)
telephone_count = contact_counts.get('telephone', 0)

total = cellular_count + telephone_count
# Guard against a frame with no known contact types (avoids dividing by zero).
cellular_ratio = cellular_count / total if total else 0.0
telephone_ratio = telephone_count / total if total else 0.0

# The telephone share is truncated towards zero; every remaining unknown row
# goes to 'cellular', so the two shares always add up to unknown_count.
telephone_add = int(unknown_count * telephone_ratio)
cellular_add = unknown_count - telephone_add

if total:
    # Capture the labels of the unknown rows BEFORE relabelling anything, so
    # that only those rows are touched. Previously all unknowns were first set
    # to 'cellular' and the telephone share was then taken from the first
    # cellular rows overall, which could overwrite genuinely cellular rows.
    unknown_index = df.index[df['contact'] == 'unknown']
    df.loc[unknown_index[:telephone_add], 'contact'] = 'telephone'
    df.loc[unknown_index[telephone_add:], 'contact'] = 'cellular'

new_contact_counts = df['contact'].value_counts()
print(new_contact_counts)


## Month

In [None]:
# Distinct values present in the 'month' column.
df['month'].unique()

In [None]:
# Number of rows per 'month' value, most frequent first.
df['month'].value_counts()

## Duration

In [None]:
# Distinct values present in the 'duration' column.
df['duration'].unique()

In [None]:
# Number of rows per 'duration' value, most frequent first.
df['duration'].value_counts()

## Pdays

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for 'pdays'.
df['pdays'].describe()

## Previous

In [None]:
# Summary statistics for 'previous', the variable this section is about.
# The cell used to repeat the 'pdays' summary from the section above.
df['previous'].describe()

## Poutcome

In [None]:
# Distinct categories present in the 'poutcome' column.
df['poutcome'].unique()

In [None]:
# Number of rows per 'poutcome' category, most frequent first.
df['poutcome'].value_counts()

In [None]:
# Relabel the 'unknown' outcome as 'other'; every other label is left as it is.
poutcome_recode = {'unknown': 'other'}
df['poutcome'] = df['poutcome'].replace(poutcome_recode)

# Show the category counts after the relabelling.
poutcome_counts = df['poutcome'].value_counts()
print(poutcome_counts)

## y

In [None]:
# Distinct values present in the target column 'y'.
df['y'].unique()

In [None]:
# Number of rows per target value, most frequent first.
df['y'].value_counts()

# Label Encoder

In [None]:
# Integer-encode every object-dtype column in place and remember, per column,
# which integer code was assigned to which original label.
label_mappings = {}
encoder = LabelEncoder()

# The same encoder object is refit for each column, so the label -> code
# mapping is captured right after each fit, before the next refit replaces it.
text_columns = df.select_dtypes('object').columns
for col in text_columns:
    df[col] = encoder.fit_transform(df[col])
    fitted_labels = encoder.classes_
    fitted_codes = encoder.transform(fitted_labels)
    label_mappings[col] = dict(zip(fitted_labels, fitted_codes))

# Print a "code -> label" legend for every encoded column.
for column_name, code_by_label in label_mappings.items():
    print(f"Column: {column_name}")
    for original_label, numeric_code in code_by_label.items():
        print(f"{numeric_code} -> {original_label}")
    print()