In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

In [77]:
#loading the data
df = pd.read_csv('./bank/bank-additional-full.csv',sep=";")
df.shape

(41188, 21)

In [78]:
df.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [79]:
df.groupby('loan').y.value_counts()

loan     y  
no       no     30100
         yes     3850
unknown  no       883
         yes      107
yes      no      5565
         yes      683
Name: y, dtype: int64

In [80]:
df.groupby(['loan','education','marital']).y.value_counts()

loan  education            marital   y  
no    basic.4y             divorced  no      340
                                     yes      68
                           married   no     2404
                                     yes     255
                           single    no      338
                                     yes      25
                           unknown   no        4
                                     yes       1
      basic.6y             divorced  no      138
                                     yes      12
                           married   no     1355
                                     yes     121
                           single    no      242
                                     yes      29
                           unknown   no        5
      basic.9y             divorced  no      452
                                     yes      27
                           married   no     3197
                                     yes     252
                           s

In [81]:
#The first and most important step in using TPOT on any data set is to rename the target class/response variable to class.
df.rename(columns={'y': 'class'}, inplace=True)


In [82]:
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
class              object
dtype: object

In [83]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'class'],
      dtype='object')

In [84]:
df['day_of_week']

0        mon
1        mon
2        mon
3        mon
4        mon
5        mon
6        mon
7        mon
8        mon
9        mon
10       mon
11       mon
12       mon
13       mon
14       mon
15       mon
16       mon
17       mon
18       mon
19       mon
20       mon
21       mon
22       mon
23       mon
24       mon
25       mon
26       mon
27       mon
28       mon
29       mon
        ... 
41158    tue
41159    tue
41160    tue
41161    tue
41162    tue
41163    tue
41164    tue
41165    wed
41166    wed
41167    wed
41168    wed
41169    wed
41170    wed
41171    thu
41172    thu
41173    thu
41174    thu
41175    thu
41176    thu
41177    thu
41178    thu
41179    fri
41180    fri
41181    fri
41182    fri
41183    fri
41184    fri
41185    fri
41186    fri
41187    fri
Name: day_of_week, Length: 41188, dtype: object

In [85]:
type(df.columns)

pandas.core.indexes.base.Index

In [86]:
for cat in df.columns:
    if( df[cat].dtypes == 'object'):
        print("Category has",cat,"********************",df[cat].unique().size,"\r") ;
        print("Category has unique value",cat,"********************",df[cat].unique(),"\r")

Category has job ******************** 12 
Category has unique value job ******************** ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student'] 
Category has marital ******************** 4 
Category has unique value marital ******************** ['married' 'single' 'divorced' 'unknown'] 
Category has education ******************** 8 
Category has unique value education ******************** ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate'] 
Category has default ******************** 3 
Category has unique value default ******************** ['no' 'unknown' 'yes'] 
Category has housing ******************** 3 
Category has unique value housing ******************** ['no' 'yes' 'unknown'] 
Category has loan ******************** 3 
Category has unique value loan ******************** ['no' 'yes' 'unknown'] 
Category has contact ******

In [87]:
df['age'].dtypes

dtype('int64')

In [88]:
df['marital'] = df['marital'].map({'married':0,'single':1,'divorced':2})
df['default'] = df['default'].map({'no':0,'yes':1,'unknown':2})
df['housing'] = df['housing'].map({'no':0,'yes':1,'unknown':2})
df['loan'] = df['loan'].map({'no':0,'yes':1,'unknown':2})
df['contact'] = df['contact'].map({'telephone':0,'cellular':1})
df['poutcome'] = df['poutcome'].map({'nonexistent':0,'failure':1,'success':2})
df['class'] = df['class'].map({'no':0,'yes':1})

In [89]:
pd.isnull(df).any()

age               False
job               False
marital            True
education         False
default           False
housing           False
loan              False
contact           False
month             False
day_of_week       False
duration          False
campaign          False
pdays             False
previous          False
poutcome          False
emp.var.rate      False
cons.price.idx    False
cons.conf.idx     False
euribor3m         False
nr.employed       False
class             False
dtype: bool

In [90]:
df = df.fillna(-999)
pd.isnull(df).any()

age               False
job               False
marital           False
education         False
default           False
housing           False
loan              False
contact           False
month             False
day_of_week       False
duration          False
campaign          False
pdays             False
previous          False
poutcome          False
emp.var.rate      False
cons.price.idx    False
cons.conf.idx     False
euribor3m         False
nr.employed       False
class             False
dtype: bool

In [91]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

job_Trans = mlb.fit_transform([{str(val)} for val in df['job'].values])
education_Trans = mlb.fit_transform([{str(val)} for val in df['education'].values])
month_Trans = mlb.fit_transform([{str(val)} for val in df['month'].values])
day_of_week_Trans = mlb.fit_transform([{str(val)} for val in df['day_of_week'].values])

job_Trans

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [92]:
education_Trans

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [93]:
month_Trans

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [94]:
df_new = df.drop(['marital','default','housing','loan','contact','poutcome','class','job','education','month','day_of_week'], axis=1)

In [95]:
assert (len(df['day_of_week'].unique()) == len(mlb.classes_)), "Not Equal"
df['day_of_week']

0        mon
1        mon
2        mon
3        mon
4        mon
5        mon
6        mon
7        mon
8        mon
9        mon
10       mon
11       mon
12       mon
13       mon
14       mon
15       mon
16       mon
17       mon
18       mon
19       mon
20       mon
21       mon
22       mon
23       mon
24       mon
25       mon
26       mon
27       mon
28       mon
29       mon
        ... 
41158    tue
41159    tue
41160    tue
41161    tue
41162    tue
41163    tue
41164    tue
41165    wed
41166    wed
41167    wed
41168    wed
41169    wed
41170    wed
41171    thu
41172    thu
41173    thu
41174    thu
41175    thu
41176    thu
41177    thu
41178    thu
41179    fri
41180    fri
41181    fri
41182    fri
41183    fri
41184    fri
41185    fri
41186    fri
41187    fri
Name: day_of_week, Length: 41188, dtype: object

In [96]:
df['day_of_week'].unique(),mlb.classes_


(array(['mon', 'tue', 'wed', 'thu', 'fri'], dtype=object),
 array(['fri', 'mon', 'thu', 'tue', 'wed'], dtype=object))

In [97]:
df_new = np.hstack((df_new.values, job_Trans, education_Trans, month_Trans, day_of_week_Trans))

In [98]:
np.isnan(df_new).any()


False

In [99]:
df_new[0].size


45

In [100]:
df_class = df['class'].values


In [101]:
training_indices, validation_indices = training_indices, testing_indices = train_test_split(df.index, stratify = df_class, train_size=0.75, test_size=0.25)
training_indices.size, validation_indices.size

(30891, 10297)

In [102]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=2, max_eval_time_mins=0.04, population_size=15)
tpot.fit(df_new[training_indices], df_class[training_indices])


Generation 1 - Current best internal CV score: 0.9130491434165083
Generation 2 - Current best internal CV score: 0.91327571205659
Generation 3 - Current best internal CV score: 0.91327571205659

2.0440169166666666 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: DecisionTreeClassifier(VarianceThreshold(input_matrix, threshold=0.001), criterion=gini, max_depth=8, min_samples_leaf=16, min_samples_split=13)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
        disable_update_check=False, early_stop=None, generations=1000000,
        max_eval_time_mins=0.04, max_time_mins=2, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=15,
        random_state=None, scoring=None, subsample=1.0, template=None,
        use_dask=False, verbosity=2, warm_start=False)

In [103]:
tpot.score(df_new[validation_indices], df.loc[validation_indices, 'class'].values)


0.9111391667475964

In [104]:
tpot.export('tpot_loan_pipeline.py')