In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer

In [2]:
train_data = pd.read_csv('traininingdata.txt', sep=';')
test_data = pd.read_csv('testdata.txt', sep=';')

In [3]:
train_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,divorced,secondary,no,-78,yes,yes,cellular,29,jul,10,23,-1,0,unknown,no
1,50,management,married,tertiary,no,106,yes,no,unknown,15,may,388,2,-1,0,unknown,no
2,47,admin.,married,secondary,no,407,yes,no,unknown,5,jun,67,12,-1,0,unknown,no
3,39,admin.,divorced,secondary,no,952,yes,no,unknown,16,jun,62,1,-1,0,unknown,no
4,30,management,single,tertiary,no,364,no,no,cellular,30,apr,306,2,-1,0,unknown,yes


# 1. Preprcessing Data

## 1.1 split job into 4 groups based on label

In [4]:
sort_job=train_data.groupby('job').apply(lambda x:sum(x.y=='yes')/len(x)).sort_values(axis=0)
low_level=tuple(sort_job[:4].index.values)
mid_level=tuple(sort_job[4:8].index.values)
advanced_level=tuple(sort_job[8:10].index.values)
high_level=tuple(sort_job[10:].index.values)

In [5]:
train_data.replace({'job': {**{i:1 for i in low_level}, **{i:2 for i in mid_level}, **{i:3 for i in advanced_level}, **{i:4 for i in high_level}}}, inplace=True)
test_data.replace({'job': {**{i:1 for i in low_level}, **{i:2 for i in mid_level}, **{i:3 for i in advanced_level}, **{i:4 for i in high_level}}}, inplace=True)

## 1.2 fill 999 for -1 in pdays

In [6]:
train_data.replace({'pdays':{-1:999}}, inplace=True)

## 1.3 Label Encode education, month, poutcome as well as label

In [8]:
train_data.poutcome.unique()

array(['unknown', 'other', 'failure', 'success'], dtype=object)

In [9]:
train_data.groupby('poutcome').apply(lambda x: sum(x.y == 'yes') / len(x))

poutcome
failure    0.123765
other      0.169685
success    0.653146
unknown    0.091584
dtype: float64

In [10]:
month_dict = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
label_dict = {'yes': 1, 'no': 0}
education_dict = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}
poutcome_dict = {'unknown': 0, 'other': 2, 'failure': 1, 'success': 3}

train_data['month'] = train_data.month.apply(lambda x: month_dict[x])
train_data['y'] = train_data.y.apply(lambda x: label_dict[x])
train_data['education'] = train_data.education.apply(lambda x: education_dict[x])
train_data['poutcome'] = train_data.poutcome.apply(lambda x: poutcome_dict[x])

test_data['month'] = test_data.month.apply(lambda x: month_dict[x])
test_data['y'] = test_data.y.apply(lambda x: label_dict[x])
test_data['education'] = test_data.education.apply(lambda x: education_dict[x])
test_data['poutcome'] = test_data.poutcome.apply(lambda x: poutcome_dict[x])


In [11]:
train_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,2,divorced,2,no,-78,yes,yes,cellular,29,7,10,23,999,0,0,0
1,50,3,married,3,no,106,yes,no,unknown,15,5,388,2,999,0,0,0
2,47,2,married,2,no,407,yes,no,unknown,5,6,67,12,999,0,0,0
3,39,2,divorced,2,no,952,yes,no,unknown,16,6,62,1,999,0,0,0
4,30,3,single,3,no,364,no,no,cellular,30,4,306,2,999,0,0,1


## 1.4 one-hot encode marital, default, housing, loan and contact

In [12]:
# one-hot encode job, default, housing, loan, contact
transformer = make_column_transformer((OneHotEncoder(drop='if_binary'), ['marital', 'default', 'housing', 'loan', 'contact']), remainder = 'passthrough').fit(train_data)
train_transformed = transformer.transform(train_data)
test_transformed = transformer.transform(test_data)

train_transformed = pd.DataFrame(train_transformed, columns = transformer.get_feature_names_out())
train_transformed.columns = [i.split('__')[-1] for i in train_transformed.columns]
test_transformed = pd.DataFrame(test_transformed, columns = transformer.get_feature_names_out())
test_transformed.columns = [i.split('__')[-1] for i in test_transformed.columns]

In [13]:
test_transformed.head()

Unnamed: 0,marital_divorced,marital_married,marital_single,default_yes,housing_yes,loan_yes,contact_cellular,contact_telephone,contact_unknown,age,...,education,balance,day,month,duration,campaign,pdays,previous,poutcome,y
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,48.0,...,3.0,468.0,14.0,5.0,220.0,1.0,-1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,31.0,...,3.0,10215.0,22.0,8.0,139.0,2.0,-1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,45.0,...,1.0,900.0,9.0,6.0,213.0,1.0,-1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,58.0,...,1.0,1231.0,20.0,6.0,21.0,3.0,-1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,35.0,...,3.0,5301.0,21.0,11.0,937.0,2.0,-1.0,0.0,0.0,1.0


## 1.5 standard-scale all numerical columns

In [14]:
numerical_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
# for all other columns, cast to int after scaling
int_features = [i for i in train_transformed.columns if i not in numerical_features]

scaler = make_column_transformer((MinMaxScaler(), numerical_features), remainder='passthrough').fit(train_transformed)

train_transformed = pd.DataFrame(scaler.transform(train_transformed), columns = scaler.get_feature_names_out())
train_transformed.columns = [i.split('__')[-1] for i in train_transformed.columns]

test_transformed = pd.DataFrame(scaler.transform(test_transformed), columns = scaler.get_feature_names_out())
test_transformed.columns = [i.split('__')[-1] for i in test_transformed.columns]

## 1.6 cast remaining columns to int

In [15]:
train_transformed = train_transformed.astype({i:'int' for i in int_features})
test_transformed = test_transformed.astype({i:'int' for i in int_features})

In [16]:
train_transformed.head()

Unnamed: 0,marital_divorced,marital_married,marital_single,default_yes,housing_yes,loan_yes,contact_cellular,contact_telephone,contact_unknown,age,...,education,balance,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,0,0,0,1,1,1,0,0,42.0,...,2,-78.0,29,7,10.0,23.0,999.0,0.0,0,0
1,0,1,0,0,1,0,0,0,1,50.0,...,3,106.0,15,5,388.0,2.0,999.0,0.0,0,0
2,0,1,0,0,1,0,0,0,1,47.0,...,2,407.0,5,6,67.0,12.0,999.0,0.0,0,0
3,1,0,0,0,1,0,0,0,1,39.0,...,2,952.0,16,6,62.0,1.0,999.0,0.0,0,0
4,0,0,1,0,0,0,1,0,0,30.0,...,3,364.0,30,4,306.0,2.0,999.0,0.0,0,1


## 1.7 split data

In [17]:
xtrain, xval, ytrain, yval = train_test_split(train_transformed.drop('y', axis = 1), train_transformed['y'], test_size=0.2)
xtest, ytest = test_transformed.drop('y', axis = 1), test_transformed['y']

## 1.8 Downsample Train data to n2p = 5

In [18]:
positive_count = sum(ytrain == 1)
negative_indexes = ytrain[ytrain == 0].sample(n = positive_count * 5).index
all_indexes = negative_indexes.append(ytrain[ytrain == 1].index)


In [19]:
xtrain_sampled = xtrain.loc[all_indexes,:]
ytrain_sampled = ytrain.loc[all_indexes]

# 1.9 Over Sample traning data using SMOTE

In [20]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy=0.6,random_state=5101)
xtrain_SMOTE, ytrain_SMOTE = smote.fit_resample(xtrain, ytrain)

# 2 Fit Models

## 2.1 Random Forest

### 2.1.1 train on unsampled data

In [22]:
classifier = RandomForestClassifier(random_state=5101, n_estimators=400, oob_score=True)
classifier.fit(xtrain, ytrain)

In [23]:
new_proba = classifier.predict_proba(xtest)
new_prediction = (new_proba[:, 1] >= 0.4).astype('int')

In [24]:
print('precision:', precision_score(ytest, new_prediction))
print('recall:', recall_score(ytest, new_prediction))
print('f1:', f1_score(ytest, new_prediction))

precision: 0.5774647887323944
recall: 0.5425330812854442
f1: 0.5594541910331384


### 2.1.2 Train on Downsampled Data

In [25]:
classifier_sampled = RandomForestClassifier(random_state=5101, n_estimators=400, oob_score=True)
classifier_sampled.fit(xtrain_sampled, ytrain_sampled)

In [26]:
new_proba = classifier_sampled.predict_proba(xtest)
new_prediction = (new_proba[:, 1] >= 0.4).astype('int')

In [27]:
print('precision:', precision_score(ytest, new_prediction))
print('recall:', recall_score(ytest, new_prediction))
print('f1:', f1_score(ytest, new_prediction))

precision: 0.5055555555555555
recall: 0.6880907372400756
f1: 0.5828662930344276


### 2.1.3 Train on SMOTE Data

In [28]:
classifier_SMOTE = RandomForestClassifier(random_state=5101, n_estimators=400, oob_score=True)
classifier_SMOTE.fit(xtrain_SMOTE, ytrain_SMOTE)

In [29]:
new_proba = classifier_SMOTE.predict_proba(xtest)
new_prediction = (new_proba[:, 1] >= 0.4).astype('int')

In [30]:
print('precision:', precision_score(ytest, new_prediction))
print('recall:', recall_score(ytest, new_prediction))
print('f1:', f1_score(ytest, new_prediction))

precision: 0.49793388429752067
recall: 0.6833648393194707
f1: 0.5760956175298805
