In [31]:
import pandas as pd        # Data manipulation
import numpy as np         # Numerical operations
import matplotlib.pyplot as plt  # Visualization
import seaborn as sns      # Advanced visualization
from sklearn.preprocessing import LabelEncoder    # For encoding categorical variables
from sklearn.model_selection import train_test_split  # Data splitting
from sklearn.tree import DecisionTreeClassifier       # Decision Tree model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Evaluation metrics

In [32]:
# Read the bank dataset from the CSV file in the working directory
Bank = pd.read_csv(filepath_or_buffer='bank.csv')

In [33]:
# Step 1: First and third quartiles of 'balance', computed in a single call
Q1, Q3 = np.percentile(Bank['balance'], [25, 75])

# Step 2: Interquartile range (IQR) is the spread between the two quartiles
IQR = Q3 - Q1

In [34]:
# Step 3: Outlier fences sit 1.5 * IQR below Q1 and above Q3
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [35]:
# Step 4: Keep only the rows whose balance lies within
# [lower_bound, upper_bound] (both ends inclusive).
# .copy() makes Bank_data an independent DataFrame instead of a filtered
# slice of Bank, so the column assignments in later cells do not trip
# pandas' chained-assignment check (SettingWithCopyWarning).
Bank_data = Bank[Bank['balance'].between(lower_bound, upper_bound)].copy()

# Step 5: Preview the data without outliers
print("Data after removing outliers:")
Bank_data.head()

Data after removing outliers:


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [36]:
# Columns that hold categorical (text) values, target 'deposit' included
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome', 'deposit',
]

# Show the distinct values each of those columns takes
for col in categorical_columns:
    unique_values = pd.unique(Bank_data[col])
    print(f"Unique values in '{col}': {unique_values}")

Unique values in 'job': ['admin.' 'technician' 'services' 'management' 'retired' 'blue-collar'
 'unemployed' 'entrepreneur' 'housemaid' 'unknown' 'self-employed'
 'student']
Unique values in 'marital': ['married' 'single' 'divorced']
Unique values in 'education': ['secondary' 'tertiary' 'primary' 'unknown']
Unique values in 'default': ['no' 'yes']
Unique values in 'housing': ['yes' 'no']
Unique values in 'loan': ['no' 'yes']
Unique values in 'contact': ['unknown' 'cellular' 'telephone']
Unique values in 'month': ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']
Unique values in 'poutcome': ['unknown' 'other' 'failure' 'success']
Unique values in 'deposit': ['yes' 'no']


In [37]:
# Turn off pandas' chained-assignment check (the source of
# SettingWithCopyWarning) for the rest of the session
pd.options.mode.chained_assignment = None

In [38]:
# A single encoder instance, re-fitted on each column in the loop below
label_encoder = LabelEncoder()

# Columns replaced by integer codes (the rest are one-hot encoded later)
categorical_columns = [
    'education',
    'default',
    'housing',
    'loan',
    'deposit',
]

# fit_transform re-learns the classes on every call, so after the loop
# label_encoder only holds the mapping for the last column
for col in categorical_columns:
    Bank_data[col] = label_encoder.fit_transform(Bank_data[col])

In [39]:
# One-hot encode the remaining categorical columns; drop_first=False keeps
# a dummy column for every category level
Bank_data = pd.get_dummies(
    Bank_data,
    columns=['job', 'marital', 'contact', 'month', 'poutcome'],
    drop_first=False,
)

In [40]:
# Encode the Data
def encode (Bank_data):
    """Label-encode every text or categorical column of a DataFrame.

    Each column whose dtype is object, a pandas string dtype, or categorical
    is replaced by the integer codes produced by a fresh ``LabelEncoder``.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    Bank_data : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after encoding.
    """
    for col in Bank_data.columns:
        dtype = Bank_data[col].dtype
        # A plain `dtype == 'object'` comparison misses pandas' dedicated
        # string dtypes, so text columns stored that way would be skipped.
        is_text = (pd.api.types.is_object_dtype(dtype)
                   or pd.api.types.is_string_dtype(dtype))
        if is_text or isinstance(dtype, pd.CategoricalDtype):
            # A new encoder per column: each column gets its own class mapping
            le = LabelEncoder()
            Bank_data[col] = le.fit_transform(Bank_data[col])
    return Bank_data
# Run encode() over the frame and rebind Bank_data to the frame it returns
Bank_data = encode(Bank_data)

In [41]:
# Target is the 'deposit' column; every other column is a feature
y = Bank_data['deposit']
X = Bank_data.drop(columns=['deposit'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each feature matrix
print(f'Training Features Shape: {X_train.shape}')
print(f'Testing Features Shape: {X_test.shape}')

Training Features Shape: (8085, 45)
Testing Features Shape: (2022, 45)


In [42]:
# Hyperparameters for the Decision Tree classifier
dt_params = dict(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=6,
    min_samples_split=100,
)

# Build the Decision Tree classifier with a fixed seed
dt = DecisionTreeClassifier(random_state=0, **dt_params)

# Train the tree on the training split
dt.fit(X_train, y_train)

In [43]:
# Predict a class label for every row of the test set
y_pred = dt.predict(X_test)

# Peek at the first ten predicted labels
print(y_pred[0:10])

[0 1 1 0 1 1 1 1 0 1]


In [44]:
# Score the test set with the fitted tree
y_pred_dt = dt.predict(X_test)

# Accuracy of the predicted labels against the true test labels
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_dt))

Accuracy: 0.8358061325420376
