# 회원 탈퇴 예측
* Decision Tree

## 1. 데이터 읽기 및 수정

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

customer = pd.read_csv('customer_join.csv')
uselog_months = pd.read_csv('use_log_months.csv')

In [2]:
customer.head()

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,OA832399,XXXX,C01,F,2015-05-01,,CA1,0,종일,10500,일반,4.833333,5.0,8,2,1,2019-04-30,47
1,PL270116,XXXXX,C01,M,2015-05-01,,CA1,0,종일,10500,일반,5.083333,5.0,7,3,1,2019-04-30,47
2,OA974876,XXXXX,C01,M,2015-05-01,,CA1,0,종일,10500,일반,4.583333,5.0,6,3,1,2019-04-30,47
3,HD024127,XXXXX,C01,F,2015-05-01,,CA1,0,종일,10500,일반,4.833333,4.5,7,2,1,2019-04-30,47
4,HD661448,XXXXX,C03,F,2015-05-01,,CA1,0,야간,6000,일반,3.916667,4.0,6,1,1,2019-04-30,47


In [3]:
uselog_months.head()

Unnamed: 0,연월,customer_id,count
0,201804,AS002855,4
1,201804,AS009013,2
2,201804,AS009373,3
3,201804,AS015315,6
4,201804,AS015739,7


* 이달과 지난 달의 이용횟수 집계

In [4]:
year_months = list(uselog_months['연월'].unique())
uselog = pd.DataFrame()

for i in range(1, len(year_months)):
    tmp = uselog_months.loc[uselog_months['연월']==year_months[i]]
    tmp.rename(columns={'count':'count_0'}, inplace=True)
    tmp_before = uselog_months.loc[uselog_months['연월']==year_months[i-1]]
    del tmp_before['연월']
    
    tmp_before.rename(columns={'count':'count_1'}, inplace=True)
    tmp = pd.merge(tmp, tmp_before, on='customer_id', how='left')
    uselog = pd.concat([uselog, tmp], ignore_index=True)
    
uselog.head()

Unnamed: 0,연월,customer_id,count_0,count_1
0,201805,AS002855,5,4.0
1,201805,AS009373,4,3.0
2,201805,AS015233,7,
3,201805,AS015315,3,6.0
4,201805,AS015739,5,7.0


## 2. 탈퇴 전월의 탈퇴 고객 데이터
* 탈퇴한 회원 추출, end_date의 1개월 전을 계산 -> 연월에 저장 후 uselog와 customer_id, 연월로 결합

In [5]:
from dateutil.relativedelta import relativedelta

exit_customer = customer.loc[customer['is_deleted']==1]
exit_customer['exit_date'] = None
exit_customer['end_date'] = pd.to_datetime(exit_customer['end_date'])

for i in range(len(exit_customer)):
    exit_customer['exit_date'].iloc[i] = exit_customer['end_date'].iloc[i] - relativedelta(months=1)

exit_customer['연월'] = exit_customer['exit_date'].dt.strftime('%Y%m')
uselog['연월'] = uselog['연월'].astype(str)
exit_uselog = pd.merge(uselog, exit_customer, on=['customer_id', '연월'], how='left')
print(len(uselog))
exit_uselog.head()

33851


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,


* 결측치 제거

In [6]:
exit_uselog = exit_uselog.dropna(subset=['name'])
print(len(exit_uselog))
print(len(exit_uselog['customer_id'].unique()))
exit_uselog.head()

1104
1104


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
19,201805,AS055680,3,3.0,XXXXX,C01,M,2018-03-01,2018-06-30,CA1,...,10500.0,일반,3.0,3.0,3.0,3.0,0.0,2018-06-30,3.0,2018-05-30 00:00:00
57,201805,AS169823,2,3.0,XX,C01,M,2017-11-01,2018-06-30,CA1,...,10500.0,일반,3.0,3.0,4.0,2.0,1.0,2018-06-30,7.0,2018-05-30 00:00:00
110,201805,AS305860,5,3.0,XXXX,C01,M,2017-06-01,2018-06-30,CA1,...,10500.0,일반,3.333333,3.0,5.0,2.0,0.0,2018-06-30,12.0,2018-05-30 00:00:00
128,201805,AS363699,5,3.0,XXXXX,C01,M,2018-02-01,2018-06-30,CA1,...,10500.0,일반,3.333333,3.0,5.0,2.0,0.0,2018-06-30,4.0,2018-05-30 00:00:00
147,201805,AS417696,1,4.0,XX,C03,F,2017-09-01,2018-06-30,CA1,...,6000.0,일반,2.0,1.0,4.0,1.0,0.0,2018-06-30,9.0,2018-05-30 00:00:00


## 3. 지속 회원의 데이터 작성

In [7]:
conti_customer = customer.loc[customer['is_deleted']==0]
conti_uselog = pd.merge(uselog, conti_customer, on=['customer_id'], how='left')
print(len(conti_uselog))

conti_uselog = conti_uselog.dropna(subset=['name'])
print(len(conti_uselog))

33851
27422


In [8]:
conti_uselog = conti_uselog.sample(frac=1).reset_index(drop=True)
conti_uselog = conti_uselog.drop_duplicates(subset="customer_id")
print(len(conti_uselog))
conti_uselog.head()

2842


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201902,GD001777,3,7.0,XX,C03,M,2015-12-01,,CA1,...,야간,6000.0,일반,3.916667,3.0,7.0,2.0,1.0,2019-04-30,40.0
1,201805,GD083655,4,5.0,XXXX,C01,M,2016-04-01,,CA1,...,종일,10500.0,일반,5.166667,5.0,9.0,2.0,1.0,2019-04-30,36.0
2,201902,IK611614,7,8.0,XXX,C03,M,2018-05-09,,CA2,...,야간,6000.0,입회비반액할인,7.909091,8.0,11.0,7.0,1.0,2019-04-30,11.0
3,201903,PL652437,2,2.0,XXXX,C02,F,2015-08-01,,CA1,...,주간,7500.0,일반,4.833333,5.0,7.0,2.0,1.0,2019-04-30,44.0
4,201805,AS772036,7,8.0,XX,C03,F,2016-06-01,,CA1,...,야간,6000.0,일반,4.5,4.0,8.0,3.0,1.0,2019-04-30,34.0


In [9]:
# 지속 회원 데이터와 탈퇴 회원 데이터를 세로로 결합
predict_data = pd.concat([conti_uselog, exit_uselog], ignore_index=True)
print(len(predict_data))
predict_data.head()

3946


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201902,GD001777,3,7.0,XX,C03,M,2015-12-01,,CA1,...,6000.0,일반,3.916667,3.0,7.0,2.0,1.0,2019-04-30,40.0,
1,201805,GD083655,4,5.0,XXXX,C01,M,2016-04-01,,CA1,...,10500.0,일반,5.166667,5.0,9.0,2.0,1.0,2019-04-30,36.0,
2,201902,IK611614,7,8.0,XXX,C03,M,2018-05-09,,CA2,...,6000.0,입회비반액할인,7.909091,8.0,11.0,7.0,1.0,2019-04-30,11.0,
3,201903,PL652437,2,2.0,XXXX,C02,F,2015-08-01,,CA1,...,7500.0,일반,4.833333,5.0,7.0,2.0,1.0,2019-04-30,44.0,
4,201805,AS772036,7,8.0,XX,C03,F,2016-06-01,,CA1,...,6000.0,일반,4.5,4.0,8.0,3.0,1.0,2019-04-30,34.0,


## 4. 예측할 달의 재적 기간을 작성

In [10]:
predict_data['period'] = 0
predict_data['now_date'] = pd.to_datetime(predict_data['연월'], format="%Y%m")
predict_data['start_date'] = pd.to_datetime(predict_data['start_date'])

for i in range(len(predict_data)):
    delta = relativedelta(predict_data['now_date'][i], predict_data['start_date'][i])
    predict_data['period'][i] = int(delta.years * 12 + delta.months)
    
predict_data.head()

Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,period,now_date
0,201902,GD001777,3,7.0,XX,C03,M,2015-12-01,,CA1,...,3.916667,3.0,7.0,2.0,1.0,2019-04-30,40.0,,38,2019-02-01
1,201805,GD083655,4,5.0,XXXX,C01,M,2016-04-01,,CA1,...,5.166667,5.0,9.0,2.0,1.0,2019-04-30,36.0,,25,2018-05-01
2,201902,IK611614,7,8.0,XXX,C03,M,2018-05-09,,CA2,...,7.909091,8.0,11.0,7.0,1.0,2019-04-30,11.0,,8,2019-02-01
3,201903,PL652437,2,2.0,XXXX,C02,F,2015-08-01,,CA1,...,4.833333,5.0,7.0,2.0,1.0,2019-04-30,44.0,,43,2019-03-01
4,201805,AS772036,7,8.0,XX,C03,F,2016-06-01,,CA1,...,4.5,4.0,8.0,3.0,1.0,2019-04-30,34.0,,23,2018-05-01


## 5. 결측치 제거

In [11]:
predict_data.isna().sum()

연월                      0
customer_id             0
count_0                 0
count_1               246
name                    0
class                   0
gender                  0
start_date              0
end_date             2842
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2842
period                  0
now_date                0
dtype: int64

In [12]:
predict_data = predict_data.dropna(subset=['count_1'])
predict_data.isna().sum()

연월                      0
customer_id             0
count_0                 0
count_1                 0
name                    0
class                   0
gender                  0
start_date              0
end_date             2648
campaign_id             0
is_deleted              0
class_name              0
price                   0
campaign_name           0
mean                    0
median                  0
max                     0
min                     0
routine_flg             0
calc_date               0
membership_period       0
exit_date            2648
period                  0
now_date                0
dtype: int64

## 6. 문자열 변수 처리를 위한 가공

In [13]:
target_col = ['campaign_name', 'class_name', 'gender', 'count_1', 'routine_flg', 'period', 'is_deleted']
predict_data = predict_data[target_col]
predict_data.head()

Unnamed: 0,campaign_name,class_name,gender,count_1,routine_flg,period,is_deleted
0,일반,야간,M,7.0,1.0,38,0.0
1,일반,종일,M,5.0,1.0,25,0.0
2,입회비반액할인,야간,M,8.0,1.0,8,0.0
3,일반,주간,F,2.0,1.0,43,0.0
4,일반,야간,F,8.0,1.0,23,0.0


In [14]:
predict_data = pd.get_dummies(predict_data)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_일반,campaign_name_입회비무료,campaign_name_입회비반액할인,class_name_야간,class_name_종일,class_name_주간,gender_F,gender_M
0,7.0,1.0,38,0.0,1,0,0,1,0,0,0,1
1,5.0,1.0,25,0.0,1,0,0,0,1,0,0,1
2,8.0,1.0,8,0.0,0,0,1,1,0,0,0,1
3,2.0,1.0,43,0.0,1,0,0,0,0,1,1,0
4,8.0,1.0,23,0.0,1,0,0,1,0,0,1,0


In [15]:
del predict_data["campaign_name_일반"]
del predict_data["class_name_야간"]
del predict_data["gender_M"]
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_입회비무료,campaign_name_입회비반액할인,class_name_종일,class_name_주간,gender_F
0,7.0,1.0,38,0.0,0,0,0,0,0
1,5.0,1.0,25,0.0,0,0,1,0,0
2,8.0,1.0,8,0.0,0,1,0,0,0
3,2.0,1.0,43,0.0,0,0,0,1,1
4,8.0,1.0,23,0.0,0,0,0,0,1


## 7. 의사결정트리로 탈퇴 예측 모델 구축

In [16]:
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

exit = predict_data.loc[predict_data['is_deleted']==1]
conti = predict_data.loc[predict_data['is_deleted']==0].sample(len(exit))

X = pd.concat([exit, conti], ignore_index=True)
y = X['is_deleted']
del X['is_deleted']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print(y_test_pred)

[1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0.
 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0.
 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1.
 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1.

In [17]:
results_test = pd.DataFrame({'y_test' : y_test, 'y_pred' : y_test_pred})
results_test.head()

Unnamed: 0,y_test,y_pred
492,1.0,1.0
0,1.0,1.0
829,1.0,1.0
1233,0.0,0.0
1994,0.0,0.0


## 8. 예측 모델 평가

In [18]:
correct = len(results_test.loc[results_test['y_test']==results_test['y_pred']])
data_count = len(results_test)
score_test = correct / data_count
print(score_test)

0.8935361216730038


In [19]:
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.8935361216730038
0.9752851711026616


In [22]:
X = pd.concat([exit, conti], ignore_index=True)
y = X['is_deleted']
del X['is_deleted']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y)

model = DecisionTreeClassifier(random_state=0, max_depth=5)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9182509505703422
0.926489226869455
