In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the bank dataset from 'Bank.csv' (path relative to the working
# directory) and preview its first two rows.
df = pd.read_csv('Bank.csv')
df.head(n=2)

Unnamed: 0,id,age,job,marital,education,default,amount,housing,loan,contact,day,month,duration,campaign,previous,y
0,1,39,blue-collar,married,secondary,no,1756.0,yes,no,cellular,3,apr,370.055237,1,0,1
1,2,51,entrepreneur,married,primary,no,1443.0,no,no,cellular,18,feb,233.998933,10,0,1


In [3]:
# Count how many rows fall into each education category.
df.loc[:, 'education'].value_counts()

secondary    13882
tertiary      7959
primary       4150
unknown       1137
Name: education, dtype: int64

In [4]:
# The string-valued columns need to be dummy-encoded, and there are several
# of them, so pull them out into their own frame first.
str_col_name = [
    'job', 'default', 'marital', 'education',
    'housing', 'loan', 'contact', 'month',
]
str_df = df[str_col_name]
# Dummy-encode all of those columns in one call; drop_first=True omits the
# first level of each column.
str_df2 = pd.get_dummies(data=str_df, drop_first=True)

In [5]:
# Preview the first two rows of the dummy-encoded columns.
str_df2.head(n=2)

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [6]:
# Keep only the non-string columns by dropping every column listed in str_col_name.
num_df = df.drop(columns=str_col_name)

In [7]:
# Preview the first two rows of the numeric-only frame.
num_df.head(n=2)

Unnamed: 0,id,age,amount,day,duration,campaign,previous,y
0,1,39,1756.0,3,370.055237,1,0,1
1,2,51,1443.0,18,233.998933,10,0,1


In [8]:
# Join the numeric columns, the dummy columns and the original string columns
# side by side. The raw string columns are kept as well for convenience in
# later aggregation.
df2 = pd.concat(objs=[num_df, str_df2, str_df], axis='columns')

In [9]:
# Preview the first two rows of the combined frame.
df2.head(n=2)

Unnamed: 0,id,age,amount,day,duration,campaign,previous,y,job_blue-collar,job_entrepreneur,...,month_oct,month_sep,job,default,marital,education,housing,loan,contact,month
0,1,39,1756.0,3,370.055237,1,0,1,1,0,...,0,0,blue-collar,no,married,secondary,yes,no,cellular,apr
1,2,51,1443.0,18,233.998933,10,0,1,0,1,...,0,0,entrepreneur,no,married,primary,no,no,cellular,feb


In [10]:
# Split into train+validation data and test data: 10% of the rows are held
# out as the test set, with a fixed seed so the split is repeatable.
train_val, test = train_test_split(df2, random_state=9, test_size=0.1)
train_val.head(n=5)

Unnamed: 0,id,age,amount,day,duration,campaign,previous,y,job_blue-collar,job_entrepreneur,...,month_oct,month_sep,job,default,marital,education,housing,loan,contact,month
13378,13379,46,0.0,10,378.293875,1,0,1,0,0,...,0,0,unemployed,no,married,tertiary,yes,no,cellular,jul
3800,3801,41,-333.0,30,300.845752,3,2,0,1,0,...,0,0,blue-collar,no,divorced,secondary,yes,no,cellular,jan
10398,10399,30,5389.0,6,159.053623,3,0,1,0,0,...,0,0,management,no,single,tertiary,no,no,cellular,aug
23401,23402,39,255.0,5,334.802583,1,0,0,0,0,...,0,0,management,no,single,tertiary,yes,no,sending _document,may
26223,26224,47,33.0,5,,1,0,1,0,0,...,0,0,services,no,single,secondary,yes,no,cellular,may


In [11]:
# Number of missing values in each column of the train+validation frame.
train_val.isna().sum()

id                              0
age                             0
amount                          0
day                             0
duration                     6331
campaign                        0
previous                        0
y                               0
job_blue-collar                 0
job_entrepreneur                0
job_housemaid                   0
job_management                  0
job_retired                     0
job_self-employed               0
job_services                    0
job_student                     0
job_technician                  0
job_unemployed                  0
job_unknown                     0
default_yes                     0
marital_married                 0
marital_single                  0
education_secondary             0
education_tertiary              0
education_unknown               0
housing_yes                     0
loan_yes                        0
contact_sending _document       0
contact_telephone               0
month_aug     

In [22]:
# Median of the 'duration' column in the train+validation frame.
train_val.loc[:, 'duration'].median()

314.89135074820206

In [23]:
# Fill missing values with the per-column median of the numeric columns.
# NOTE(review): the medians are computed over the whole train_val frame, i.e.
# before it is later split again into training and validation parts.
column_medians = train_val.median(numeric_only=True)
train_val2 = train_val.fillna(value=column_medians)

In [25]:
# Re-count missing values per column after the median fill.
train_val2.isna().sum()

id                           0
age                          0
amount                       0
day                          0
duration                     0
campaign                     0
previous                     0
y                            0
job_blue-collar              0
job_entrepreneur             0
job_housemaid                0
job_management               0
job_retired                  0
job_self-employed            0
job_services                 0
job_student                  0
job_technician               0
job_unemployed               0
job_unknown                  0
default_yes                  0
marital_married              0
marital_single               0
education_secondary          0
education_tertiary           0
education_unknown            0
housing_yes                  0
loan_yes                     0
contact_sending _document    0
contact_telephone            0
month_aug                    0
month_dec                    0
month_feb                    0
month_ja

In [27]:
# Check whether the data is imbalanced by counting rows per value of the target 'y'.
train_val2.loc[:, 'y'].value_counts()

0    16601
1     7814
Name: y, dtype: int64

In [31]:
from sklearn import tree

# For now, build a first model that uses all of the available features.
t = train_val2['y']
# Features: drop the raw string columns together with the id, target and day
# columns in a single call.
x = train_val2.drop(columns=str_col_name + ['id', 'y', 'day'])

# Hold out 20% of the train+validation rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    x, t, random_state=13, test_size=0.2
)

# class_weight is also set so the model can cope with the imbalanced data.
model = tree.DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=3,
    random_state=3,
)

# For reference: without class_weight the accuracy is about 0.7
# (note that the variant below also uses max_depth=5).
#model = tree.DecisionTreeClassifier(random_state=3,max_depth=5)
model.fit(x_train, y_train)
# Mean accuracy on the validation split.
model.score(x_val, y_val)

0.6493958631988531