In [42]:
import pandas as pd

bank_df = pd.read_csv('bank.csv', sep = ',')
bank_df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no


In [90]:
bank_df_new = pd.read_csv('bank-prep.csv', sep = ',')
bank_df_new

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,dec,feb,jan,jul,jun,mar,may,nov,oct,sep
0,58,0,2143,1,0,5,261,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
1,36,0,265,1,1,5,348,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
2,25,0,-7,1,0,5,365,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,0,-3,0,0,5,1666,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
4,24,0,-103,1,1,5,145,1,-1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6928,25,0,199,0,0,16,173,1,92,5,...,0,0,0,0,0,0,0,1,0,0
6929,28,0,159,0,0,16,449,2,33,4,...,0,0,0,0,0,0,0,1,0,0
6930,59,0,138,1,1,16,162,2,187,5,...,0,0,0,0,0,0,0,1,0,0
6931,37,0,1428,0,0,16,333,2,-1,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
! pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable


In [4]:
!pip list


Package                       Version
----------------------------- --------------------
alabaster                     0.7.12
anaconda-client               1.11.0
anaconda-navigator            2.3.1
anaconda-project              0.11.1
anyio                         3.5.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arrow                         1.2.2
astroid                       2.11.7
astropy                       5.1
atomicwrites                  1.4.0
attrs                         21.4.0
Automat                       20.2.0
autopep8                      1.6.0
Babel                         2.9.1
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
backports.tempfile            1.0
backports.weakref             1.0.post1
bcrypt                        3.2.0
beautifulsoup4                4.11.1
binaryornot                   0.4.4
bitarray                      2.5.1
bkcharts                      0.2
blac

In [5]:
# 언더 샘플링 
# 데이터 이해 부분을 보면 yes 와 no 의 비율이 확연히 차이나는것을 볼수 있음 
# 데이터 불균형이 일어나면 안되기 떄문에 언더 샘플링을 하여 데이터의 균형을 맞춰줌

import numpy as np
from imblearn.under_sampling import RandomUnderSampler

X = np.array(bank_df_new.drop('y', axis = 1))
Y = np.array(bank_df_new[['y', ]])
print(np.sum(Y==1), np.sum(Y==0))

# 샘플링 조건 설정(랜덤)
sampler = RandomUnderSampler(random_state=42)
# x 와 y 를 조건에 맞춰서 샘플링 함 
X, Y = sampler.fit_resample(X,Y)
print(np.sum(Y==1), np.sum(Y==0))

820 6113
820 820


In [22]:
# sklearn 는 머신러닝 알고리즘, 모델검증, 최적화에 도움이 되는 기능 제공
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import accuracy_score

#k-폴드를 사용해 교차검증 실시 k = 10
kf = KFold(n_splits = 10, shuffle = True)
score = []

# 훈련 데이터와 테스트 데이터의 조합을 변경해 가며 모델 작성
for train_id, test_id in kf.split(X):
    x = X[train_id] # 훈련 데이터
    y = Y[train_id]
    clf = tree.DecisionTreeClassifier(criterion='gini')
#분류를 위해 결정트리 인스턴스를 생성함 
    clf.fit(x, y)
# predict 를 이용해 결과 취득
    pred_y = clf.predict(X[test_id]) # 테스트 데이터에 적용한 결과 취득
# accuract_score을 사용해 결과와 답변의 정답, 오답수를 계산함
    scores = accuracy_score(Y[test_id],pred_y)
    score.append(score)
    
scores = np.array(scores)
print(scores.mean(), scores.std())

# 평균 정확도, 표준편차 

0.75 0.0


In [7]:
#재현율과 적합도 확인
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import precision_score #정밀도

print(recall_score(Y[test_id], pred_y))
print(precision_score(Y[test_id], pred_y))

#주어진 값에 따라 재현율을 높일지 적합도를 높일지 판단해야함 현 데이터는 재현율이 높아야함 

0.75
0.7682926829268293


In [56]:
# 기존 직업들을 job2, worker로 
bank_df.loc[(bank_df['job'] == 'management') |
            (bank_df['job'] == 'technician') |
            (bank_df['job'] == 'blue-collar') |
            (bank_df['job'] == 'admin.') |
            (bank_df['job'] == 'services') |
            (bank_df['job'] == 'self-employed') |
            (bank_df['job'] == 'entrepreneur') |
            (bank_df['job'] == 'housemaid'), 'job2'] = 'worker'
bank_df.loc[(bank_df['job'] == 'retired') |
            (bank_df['job'] == 'unemployed') |
            (bank_df['job'] == 'student'), 'job2'] = 'non-worker'
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job2,month2
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,worker,
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no,worker,
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no,worker,
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no,worker,
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no,worker,


In [58]:
#날짜별로 1쿼터,2쿼터,, 나눠줌

bank_df.loc[(bank_df['month'] == 'jan') |
            (bank_df['month'] == 'feb') |
            (bank_df['month'] == 'mar'), 'month2'] = '1Q'

bank_df.loc[(bank_df['month'] == 'apr') |
            (bank_df['month'] == 'may') |
            (bank_df['month'] == 'jun'), 'month2'] = '2Q'

bank_df.loc[(bank_df['month'] == 'jul') |
            (bank_df['month'] == 'aug') |
            (bank_df['month'] == 'sep'), 'month2'] = '3Q'

bank_df.loc[(bank_df['month'] == 'oct') |
            (bank_df['month'] == 'nov') |
            (bank_df['month'] == 'dec'), 'month2'] = '4Q'


bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job2,month2
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,worker,2Q
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no,worker,2Q
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no,worker,2Q
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no,worker,2Q
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no,worker,2Q


In [66]:
# day 가 10 보다 작으면 early 10보다 크고 20보다 작으면 middle 20 보다 크면 late

bank_df.loc[bank_df['day'] <= 10, 'day2'] = 'early'
bank_df.loc[(bank_df['day'] > 10) & (bank_df['day'] <= 20), 'day2'] = 'middle'
bank_df.loc[bank_df['day'] > 20, 'day2'] = 'late'


bank_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job2,month2,day2
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,worker,2Q,early
1,36,technician,single,secondary,no,265,yes,yes,,5,may,348,1,-1,0,,no,worker,2Q,early
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,may,365,1,-1,0,,no,worker,2Q,early
3,53,technician,married,secondary,no,-3,no,no,,5,may,1666,1,-1,0,,no,worker,2Q,early
4,24,technician,single,secondary,no,-103,yes,yes,,5,may,145,1,-1,0,,no,worker,2Q,early
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7229,25,services,single,secondary,no,199,no,no,cellular,16,nov,173,1,92,5,failure,no,worker,4Q,middle
7230,28,self-employed,single,tertiary,no,159,no,no,cellular,16,nov,449,2,33,4,success,yes,worker,4Q,middle
7231,59,management,married,tertiary,no,138,yes,yes,cellular,16,nov,162,2,187,5,failure,no,worker,4Q,middle
7232,37,management,married,tertiary,no,1428,no,no,cellular,16,nov,333,2,-1,0,,no,worker,4Q,middle


In [70]:
#duration = 접촉한 시간
# 300 이상이면 long 아니면 short

bank_df.loc[bank_df['duration'] < 300, 'duration2'] = 'short'
bank_df.loc[bank_df['duration'] >= 300, 'duration2'] = 'long'
bank_df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,duration,campaign,pdays,previous,poutcome,y,job2,month2,day2,duration2
0,58,management,married,tertiary,no,2143,yes,no,,5,...,261,1,-1,0,,no,worker,2Q,early,short
1,36,technician,single,secondary,no,265,yes,yes,,5,...,348,1,-1,0,,no,worker,2Q,early,long
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,...,365,1,-1,0,,no,worker,2Q,early,long
3,53,technician,married,secondary,no,-3,no,no,,5,...,1666,1,-1,0,,no,worker,2Q,early,long
4,24,technician,single,secondary,no,-103,yes,yes,,5,...,145,1,-1,0,,no,worker,2Q,early,short


In [74]:
#previous = 연락횟수 

bank_df.loc[bank_df['previous'] < 1, 'previous2'] = 'zero' 
bank_df.loc[bank_df['previous'] >= 1, 'previous2'] = 'one-more'
bank_df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,campaign,pdays,previous,poutcome,y,job2,month2,day2,duration2,previous2
0,58,management,married,tertiary,no,2143,yes,no,,5,...,1,-1,0,,no,worker,2Q,early,short,zero
1,36,technician,single,secondary,no,265,yes,yes,,5,...,1,-1,0,,no,worker,2Q,early,long,zero
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,...,1,-1,0,,no,worker,2Q,early,long,zero
3,53,technician,married,secondary,no,-3,no,no,,5,...,1,-1,0,,no,worker,2Q,early,long,zero
4,24,technician,single,secondary,no,-103,yes,yes,,5,...,1,-1,0,,no,worker,2Q,early,short,zero


In [83]:
#pdats = 지난번 캠페인에서 마지막으로 접촉한 후의 경과시간

bank_df.loc[bank_df['pdays'] >= 0, 'pdays2'] = 'less'
bank_df.loc[bank_df['pdays'] < 0, 'pdays2'] = 'more'

bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,pdays,previous,poutcome,y,job2,month2,day2,duration2,previous2,pdays2
0,58,management,married,tertiary,no,2143,yes,no,,5,...,-1,0,,no,worker,2Q,early,short,zero,less
1,36,technician,single,secondary,no,265,yes,yes,,5,...,-1,0,,no,worker,2Q,early,long,zero,less
2,25,blue-collar,married,secondary,no,-7,yes,no,,5,...,-1,0,,no,worker,2Q,early,long,zero,less
3,53,technician,married,secondary,no,-3,no,no,,5,...,-1,0,,no,worker,2Q,early,long,zero,less
4,24,technician,single,secondary,no,-103,yes,yes,,5,...,-1,0,,no,worker,2Q,early,short,zero,less
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,38,admin.,single,secondary,no,221,yes,no,,6,...,-1,0,,no,worker,2Q,early,short,zero,less
95,41,services,married,,no,4,no,no,,6,...,-1,0,,no,worker,2Q,early,short,zero,less
96,29,admin.,married,secondary,no,-150,yes,no,,6,...,-1,0,,no,worker,2Q,early,long,zero,less
97,41,blue-collar,married,secondary,no,140,yes,no,,6,...,-1,0,,no,worker,2Q,early,long,zero,less


In [84]:
#더미변수로 바꿔주기

bank_df_job2 = pd.get_dummies(bank_df['job2'])
bank_df_month2 = pd.get_dummies(bank_df['month2'])
bank_df_day2 = pd.get_dummies(bank_df['day2'])
bank_df_duration2 = pd.get_dummies(bank_df['duration2'])
bank_df_previous2 = pd.get_dummies(bank_df['previous2'])
bank_df_pdays2 = pd.get_dummies(bank_df['pdays2'])

In [93]:
#concat 을 이용하여 합치기

tmp5 = pd.concat([bank_df_new, bank_df_job2], axis = 1)
tmp6 = pd.concat([tmp5, bank_df_month2], axis = 1)
tmp7 = pd.concat([tmp6, bank_df_day2], axis = 1)
tmp8 = pd.concat([tmp7, bank_df_duration2], axis = 1)
tmp9 = pd.concat([tmp8, bank_df_previous2], axis = 1)

bank_df_new2 = pd.concat([tmp9, bank_df_pdays2], axis = 1)
pd.set_option('display.max_columns', None)
bank_df_new2.head()


Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,y,divorced,married,single,primary,secondary,tertiary,cellular,telephone,unknown,apr,aug,dec,feb,jan,jul,jun,mar,may,nov,oct,sep,non-worker,worker,1Q,2Q,3Q,4Q,early,late,middle,long,short,one-more,zero,less,more
0,58.0,0.0,2143.0,1.0,0.0,5.0,261.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,0,1,0,1,1,0
1,36.0,0.0,265.0,1.0,1.0,5.0,348.0,1.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,1,0,0,1,1,0
2,25.0,0.0,-7.0,1.0,0.0,5.0,365.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,1,0,0,1,1,0
3,53.0,0.0,-3.0,0.0,0.0,5.0,1666.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,1,0,0,1,1,0
4,24.0,0.0,-103.0,1.0,1.0,5.0,145.0,1.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0,1,0,0,0,1,0,1,1,0


In [94]:
#파일에 저장 

bank_df_new2.to_csv('bank-prep2.csv', index = False)