# UCI Bank Marketing Dataset - Using XGBoost

## 1. Dataset Overview
The dataset contains information about marketing campaign contacts and the demographic details of the clients contacted. Each row represents one contact with a client, and the target variable `y` indicates whether that client subscribed to a term deposit (`yes`) or not (`no`).

In [1]:
import pandas as pd

# Read the semicolon-delimited bank marketing file. `bank-full.csv` is expected in the
# same directory as this notebook; adjust the path otherwise.
# Link left by the original author (presumably where the file can be downloaded):
# https://drive.google.com/file/d/16xDiX8aiRsrimQBxaKrF7iGtZl2cifzI/view?usp=sharing
df = pd.read_csv(filepath_or_buffer='bank-full.csv', delimiter=';')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [2]:
# (number of rows, number of columns) of the loaded frame
df.shape

(45211, 17)

In [3]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [4]:
# Fraction of missing entries in each column (0.0 means the column is complete)
df.isna().mean()

age          0.0
job          0.0
marital      0.0
education    0.0
default      0.0
balance      0.0
housing      0.0
loan         0.0
contact      0.0
day          0.0
month        0.0
duration     0.0
campaign     0.0
pdays        0.0
previous     0.0
poutcome     0.0
y            0.0
dtype: float64

In [5]:
# Collapse `pdays` into a two-level flag: the value -1 becomes "non-contacted",
# every other value becomes "contacted".
# The already-converted label "non-contacted" is matched as well, so running this cell
# a second time leaves the column unchanged instead of relabelling every row as
# "contacted" (the strings would never compare equal to -1).
pdays_not_contacted = (df["pdays"] == -1) | (df["pdays"] == "non-contacted")
# Vectorised mapping of the boolean mask to the two labels (no per-row Python lambda).
df["pdays"] = pdays_not_contacted.map({True: "non-contacted", False: "contacted"})

## Data Preprocessing

### Key Points:
- Scaling is not mandatory in tree models.
- Missing Value Imputation is not mandatory for tree models that handle missing values natively (e.g. XGBoost); other implementations may still require it.
- Outlier Treatment is not mandatory in tree models.
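- Dummy (one-hot) variable creation is not required in tree models; categorical columns still need a numeric encoding, such as the label encoding used below.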

In [6]:
# Check the first two rows after recoding `pdays`
df.head(n=2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,non-contacted,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,non-contacted,0,unknown,no


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
# Names of all object-dtype (string) columns; these are the ones that get label-encoded
label_enc_cols = df.select_dtypes(include='object').columns
label_enc_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'pdays', 'poutcome', 'y'],
      dtype='object')

In [9]:
# Integer-encode every object column listed in `label_enc_cols` (this includes the target `y`).
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# code -> category mapping of every column stays available afterwards through
# `label_encoders[col].classes_` or `label_encoders[col].inverse_transform(...)`.
# Refitting a single shared encoder would keep only the mapping of the last column.
label_encoders = {}
for col in label_enc_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
# As before, `label_encoder` ends up being the encoder fitted on the last column.

In [10]:
# Check the first two rows after label encoding
df.head(n=2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,1,0,3,0


In [11]:
# Target vector: the encoded `y` column
y = df['y']
# Feature matrix: every column except the target
X = df.drop(columns='y')

In [12]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Report (rows, columns) of the training features, then of the test features
for feature_split in (X_train, X_test):
    print(feature_split.shape)

(36168, 16)
(9043, 16)


In [14]:
# Prerequisite: install xgboost once if it is missing, e.g. `%pip install xgboost`

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [16]:
from sklearn.metrics import accuracy_score

## Modelling

In [17]:
# Baseline model: a single decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes the candidate features at every split, so an unseeded tree can differ
# between runs and the reported accuracies would not be reproducible.
dt_model = DecisionTreeClassifier(random_state=42)

In [18]:
%%time
# Fit the baseline decision tree on the training split (timed by the cell magic above)
dt_model.fit(X=X_train, y=y_train)

CPU times: total: 609 ms
Wall time: 680 ms


In [19]:
# Decision-tree class predictions for the training rows and for the held-out rows
y_pred_train, y_pred_test = (
    dt_model.predict(feature_split) for feature_split in (X_train, X_test)
)

In [20]:
# Share of correctly classified rows on each split
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)
print(f'Initial Model Train Accuracy: {train_accuracy}')
print(f'Initial Model Test Accuracy: {test_accuracy}')

Initial Model Train Accuracy: 1.0
Initial Model Test Accuracy: 0.8691805816653765


In [21]:
# Base model: XGBoost classifier with all hyper-parameters left at their defaults
xgb_model = XGBClassifier()

In [22]:
%%time
# Fit the default XGBoost model on the training split (timed by the cell magic above)
xgb_model.fit(X=X_train, y=y_train)

CPU times: total: 3.16 s
Wall time: 585 ms


In [23]:
# XGBoost class predictions for the training rows and for the held-out rows
y_pred_train, y_pred_test = map(xgb_model.predict, (X_train, X_test))

In [24]:
# Share of correctly classified rows on each split for the XGBoost model
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f'Initial Model Train Accuracy: {train_accuracy}')
print(f'Initial Model Test Accuracy: {test_accuracy}')

Initial Model Train Accuracy: 0.954683698296837
Initial Model Test Accuracy: 0.9035718235098972


In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
# Hyper-parameter search space used by the grid searches below.
# 3 candidate values for each of 4 parameters -> 3**4 = 81 combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],     # number of trees
    max_depth=[3, 5, 7],             # maximum depth allowed for a tree
    learning_rate=[0.01, 0.1, 0.2],  # pace of learning
    subsample=[0.6, 0.8, 1.0],       # fraction of the training rows used for each new tree
)

In [27]:
%%time
# Exhaustive 5-fold cross-validated search over `param_grid` for XGBoost, scored on accuracy.
# - A fresh XGBClassifier is passed straight to the search, so the fitted baseline
#   `xgb_model` from the earlier cells is not replaced by an unfitted estimator.
# - The search is stored as `xgb_grid_search` because the generic name `grid_search`
#   is assigned again by the GradientBoosting search further down; without a dedicated
#   name the XGBoost result (best_params_, best_estimator_, ...) would be lost.
#   `grid_search` is still assigned here so existing references keep working.
grid_search = xgb_grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
    verbose=1,
)
xgb_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
CPU times: total: 24.6 s
Wall time: 1min 30s


In [28]:
# 81 parameter combinations x 5 CV folds = 405 fits (wall time of the run recorded above: 1min 30s)

In [29]:
%%time
# Same 5-fold grid search, this time for scikit-learn's GradientBoostingClassifier.
# random_state is pinned because `param_grid` contains subsample values below 1.0,
# which makes every boosting stage train on a random sample of the rows; without a
# seed the cross-validation scores (and the selected parameters) can change between runs.
gbc_model = GradientBoostingClassifier(random_state=42)
# The fitted search is kept under a model-specific name in addition to `grid_search`,
# so it can be told apart from any other search stored under the generic name.
grid_search = gbc_grid_search = GridSearchCV(
    estimator=gbc_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
    verbose=1,
)
gbc_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
CPU times: total: 29.1 s
Wall time: 15min 6s


In [31]:
# `grid_search` is the search most recently fitted above (the GradientBoostingClassifier one),
# so this displays the best GradientBoosting estimator it found.
grid_search.best_estimator_

In [None]:
# Homework: take the best estimator from each of the two grid searches (XGBoost and GradientBoosting) and compare their performance