In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
def dtype_compressor(df):
  """Return a copy of ``df`` with object and numeric columns stored more compactly.

  * object columns are converted to ``category``;
  * numeric columns go through ``pd.to_numeric(..., downcast='signed')``, which
    shrinks integers (and floats holding only whole numbers) to the smallest
    signed integer dtype and leaves other floats unchanged;
  * columns of any other dtype (bool, datetime, ...) are carried over untouched
    instead of being silently dropped.

  Column order of the result: object columns, then numeric columns, then the
  remaining columns, each group in its original relative order.

  Parameters
  ----------
  df : pandas.DataFrame
      Input frame; it is not modified.

  Returns
  -------
  pandas.DataFrame
      A new frame with the same rows and index as ``df``.
  """
  obj_cols = list(df.select_dtypes(include=['object']).columns)
  num_cols = list(df.select_dtypes(include=['number']).columns)
  handled = set(obj_cols) | set(num_cols)
  other_cols = [col for col in df.columns if col not in handled]

  # Work on a copy so the caller's frame keeps its original dtypes.
  compressed = df[obj_cols + num_cols + other_cols].copy()

  # NOTE: every object column is converted; categoricals generally only save
  # memory when a column has relatively few distinct values.
  for col in obj_cols:
    compressed[col] = compressed[col].astype('category')

  for col in num_cols:
    compressed[col] = pd.to_numeric(compressed[col], downcast='signed')

  return compressed

In [3]:
def fast_datetime(s, date_format='%Y%m%d'):
    """Convert a Series of date values to timestamps, parsing each distinct value once.

    The distinct values of ``s`` are parsed together in a single
    ``pd.to_datetime`` call and the result is mapped back onto ``s``, which is
    much cheaper than parsing every row when dates repeat often.

    Parameters
    ----------
    s : pandas.Series
        Values to convert (e.g. ``'20131223'``).
    date_format : str, default '%Y%m%d'
        strptime-style format passed to ``pd.to_datetime``.

    Returns
    -------
    pandas.Series
        Parsed timestamps, aligned with the index of ``s``.
    """
    unique_values = s.unique()
    parsed = pd.to_datetime(unique_values, format=date_format)
    lookup = dict(zip(unique_values, parsed))
    return s.map(lookup)

In [4]:
# Render floats with exactly three decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.3f}'.format)

---

In [5]:
# Load the combined train + members table
KKboxTrainMembers = pd.read_csv('KKboxTrainMembers.csv')

In [6]:
# Keep only the rows whose 'bd' value is greater than 10
KKboxTrainMembers = KKboxTrainMembers.loc[KKboxTrainMembers['bd'].gt(10)]

In [7]:
# Preview the first rows after the 'bd' filter
KKboxTrainMembers.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,1.0,3.0,2013-12-23
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13.0,20.0,1.0,3.0,2013-12-23
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13.0,18.0,1.0,3.0,2013-12-27
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13.0,35.0,0.0,7.0,2014-01-25
7,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,1,9.0,28.0,0.0,3.0,2014-02-02


---

## 資料探勘—關聯性分析

### 根據交易資料取得不同方案做關聯性分析-依合約短中長期製作欄位

In [8]:
# Load the two program-transaction extracts
ProgramTransac_v1 = pd.read_csv('ProgramTransaction_v1.csv')
ProgramTransac_v2 = pd.read_csv('ProgramTransaction_v2.csv')
# Stack both extracts, keep msno plus the five program columns,
# and add up the rows that share the same msno
ProgramTransac = (
    pd.concat([ProgramTransac_v1, ProgramTransac_v2], axis=0)
    .loc[:, ['msno', 'ProgramA_Short', 'ProgramB_Month', 'ProgramC_Mid', 'ProgramD_Season', 'ProgramE_Long']]
    .groupby(by=['msno'], as_index=False)
    .sum()
)

In [9]:
# Binarise the per-program totals: 1 if the member ever used the program, else 0.
# A single vectorised comparison replaces the per-element Python lambda.
ProgramTransac_Col = list(ProgramTransac.columns[1:])
ProgramTransac[ProgramTransac_Col] = (ProgramTransac[ProgramTransac_Col] > 0).astype(int)

In [10]:
# Check the size of the per-member program table and preview it
print(ProgramTransac.shape)
ProgramTransac.head()

(2426143, 6)


Unnamed: 0,msno,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,1,0,0,0,0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,0,0,0,1
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,1,0,0,0
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1,1,0,0,0
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,1,1,0,0,0


In [11]:
# Save as CSV (disabled)
# ProgramTransac.to_csv('ProgramTransac.csv',index=False,header=True)

In [12]:
# Attach the program flags to every member row (left join on msno)
KKboxMembersProgram = KKboxTrainMembers.merge(ProgramTransac, how='left', on=['msno'])

##### 針對流失與非流失用戶進行隨機抽樣，以解決數據不平衡問題

In [13]:
def imbalance_random_sampling(df, n=52000, random_state=None):
    """Balance churn classes by randomly under-sampling the non-churn rows.

    All rows with ``is_churn == 1`` are kept; at most ``n`` rows with
    ``is_churn == 0`` are drawn at random without replacement. The combined
    frame is returned sorted by ``msno`` and ``registration_init_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``is_churn``, ``msno`` and ``registration_init_time``.
        It is not modified.
    n : int, default 52000
        Number of non-churn rows to draw. If fewer non-churn rows exist, all of
        them are kept instead of raising ``ValueError``.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample``; pass a fixed value to make the
        sample reproducible. ``None`` keeps the draw random on every call.

    Returns
    -------
    pandas.DataFrame
        Churn rows plus the sampled non-churn rows, sorted as described above.
    """
    # Split by class: churned (1) and retained (0)
    df_churn = df[df['is_churn'] == 1]
    df_notchurn = df[df['is_churn'] == 0]

    # Never request more rows than exist in the majority class
    sample_size = min(n, len(df_notchurn))
    df_notchurn_sampled = df_notchurn.sample(n=sample_size, random_state=random_state)

    df_after = pd.concat([df_churn, df_notchurn_sampled], axis=0)
    return df_after.sort_values(by=['msno', 'registration_init_time'])

In [14]:
# Balance churn / non-churn rows. The result overwrites KKboxMembersProgram and the
# sample is random, so re-running this cell yields a different (re-sampled) frame.
KKboxMembersProgram = imbalance_random_sampling(KKboxMembersProgram)

In [15]:
# Size and preview of the balanced member/program table
print(KKboxMembersProgram.shape)
KKboxMembersProgram.head()

(103378, 12)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long
303850,++/ZHqwUNa7U21Qz+zqteiXlZapxey86l6eEorrak/g=,0,13.0,39.0,1.0,3.0,2014-04-21,1,1,0,0,0
173734,++4cUL0b9CfW8cj0A/wfSxQc4k4fcVtWcLqk2UOdpKs=,0,15.0,28.0,1.0,7.0,2012-10-09,1,1,0,0,0
239969,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,1,0,0,0
93345,++A8p4GrsTnMjI6hAZEtlRsaz6s6O9ddUoH0fmS4s7s=,0,5.0,43.0,0.0,9.0,2014-11-18,0,1,0,0,0
28091,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,1,0,1,1


##### 套入其他交易特徵

In [60]:
# Load the member table that already carries the derived transaction features
KKboxTrainMemTransac = pd.read_csv('KKboxTrainMemTransac.csv')

In [61]:
# Preview the transaction-feature table
KKboxTrainMemTransac.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,client_level_code,age_under26,last_last_churn,last_auto_renew,age_level_code,is_discount,amount_per_day,longtime_user
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,1,3.0,2013-12-23,1,0.0,0.0,1,1,6,4.967,0
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13.0,20.0,1,3.0,2013-12-23,1,1.0,0.0,0,0,6,4.483,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13.0,18.0,1,3.0,2013-12-27,3,1.0,0.0,0,0,20,4.637,0
3,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13.0,35.0,0,7.0,2014-01-25,4,0.0,0.0,1,1,32,3.817,1
4,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,1,9.0,28.0,0,3.0,2014-02-02,1,0.0,0.0,0,1,3,4.436,1


In [62]:
# Keep only the join key and the transaction features that will be merged in
KKboxTrainMemTransac = KKboxTrainMemTransac.loc[
    :, ['msno', 'last_last_churn', 'last_auto_renew', 'is_discount', 'amount_per_day', 'longtime_user']
]

In [63]:
# Add the transaction features to the balanced member/program table (left join on msno)
KKboxProgramFinal = KKboxMembersProgram.merge(KKboxTrainMemTransac, how='left', on=['msno'])

In [64]:
# Remove fully identical rows, keeping the first occurrence
KKboxProgramFinal = KKboxProgramFinal.drop_duplicates(keep='first')

In [65]:
# Size and preview of the final table for the short/mid/long-term program view
print(KKboxProgramFinal.shape)
KKboxProgramFinal.head()

(103378, 17)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
0,++/ZHqwUNa7U21Qz+zqteiXlZapxey86l6eEorrak/g=,0,13.0,39.0,1.0,3.0,2014-04-21,1,1,0,0,0,0.0,1.0,23.0,4.967,1.0
1,++4cUL0b9CfW8cj0A/wfSxQc4k4fcVtWcLqk2UOdpKs=,0,15.0,28.0,1.0,7.0,2012-10-09,1,1,0,0,0,0.0,1.0,29.0,4.492,1.0
2,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,1,0,0,0,0.0,1.0,27.0,4.942,1.0
3,++A8p4GrsTnMjI6hAZEtlRsaz6s6O9ddUoH0fmS4s7s=,0,5.0,43.0,0.0,9.0,2014-11-18,0,1,0,0,0,0.0,1.0,24.0,4.442,0.0
4,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,1,0,1,1,0.0,0.0,15.0,4.76,1.0


In [66]:
# Save as CSV (disabled)
# KKboxProgramFinal.to_csv('KKboxProgramFinal_balance.csv',index=False,header=True)

---

---

### 根據交易資料取得不同方案做關聯性分析-依詳細方案(含價格)製作欄位

In [67]:
# Load the two detailed program-transaction extracts
ProgramTransac2nd_v1 = pd.read_csv('ProgramTransaction2nd_v1.csv')
ProgramTransac2nd_v2 = pd.read_csv('ProgramTransaction2nd_v2.csv')
# Stack both extracts, keep msno plus the fifteen program columns,
# and add up the rows that share the same msno
ProgramTransac2nd = (
    pd.concat([ProgramTransac2nd_v1, ProgramTransac2nd_v2], axis=0)
    .loc[:, [
        'msno',
        'Program_A_0days',
        'Program_B_7days',
        'Program_C_30days_month',
        'Program_D_30days_acount',
        'Program_E_90days',
        'Program_F_90days_discount',
        'Program_G_100days_card',
        'Program_H_120days_season',
        'Program_I_halfyear',
        'Program_J_180days_discount',
        'Program_K_200days_card',
        'Program_L_360days_year',
        'Program_M_395days',
        'Program_N_410days_discount',
        'Program_O_450days',
    ]]
    .groupby(by=['msno'], as_index=False)
    .sum()
)

In [68]:
# Rename the program columns: each new name is the old one without 'Program_'
program_name = {
    'Program_A_0days': 'A_0days',
    'Program_B_7days': 'B_7days',
    'Program_C_30days_month': 'C_30days_month',
    'Program_D_30days_acount': 'D_30days_acount',
    'Program_E_90days': 'E_90days',
    'Program_F_90days_discount': 'F_90days_discount',
    'Program_G_100days_card': 'G_100days_card',
    'Program_H_120days_season': 'H_120days_season',
    'Program_I_halfyear': 'I_halfyear',
    'Program_J_180days_discount': 'J_180days_discount',
    'Program_K_200days_card': 'K_200days_card',
    'Program_L_360days_year': 'L_360days_year',
    'Program_M_395days': 'M_395days',
    'Program_N_410days_discount': 'N_410days_discount',
    'Program_O_450days': 'O_450days',
}
ProgramTransac2nd = ProgramTransac2nd.rename(columns=program_name)

In [69]:
# Binarise the per-program totals: 1 if the member ever used the program, else 0.
# A single vectorised comparison replaces the per-element Python lambda.
ProgramTransac2nd_Col = list(ProgramTransac2nd.columns[1:])
ProgramTransac2nd[ProgramTransac2nd_Col] = (ProgramTransac2nd[ProgramTransac2nd_Col] > 0).astype(int)

In [70]:
# Check the size of the detailed per-member program table and preview it
print(ProgramTransac2nd.shape)
ProgramTransac2nd.head()

(2426143, 16)


Unnamed: 0,msno,A_0days,B_7days,C_30days_month,D_30days_acount,E_90days,F_90days_discount,G_100days_card,H_120days_season,I_halfyear,J_180days_discount,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:
# Attach the detailed program flags to every member row (left join on msno)
KKboxMembersProgram2nd = KKboxTrainMembers.merge(ProgramTransac2nd, how='left', on=['msno'])

##### 針對流失與非流失用戶進行隨機抽樣，以解決數據不平衡問題

In [72]:
# Balance churn / non-churn rows. The result overwrites KKboxMembersProgram2nd and the
# sample is random, so re-running this cell yields a different (re-sampled) frame.
KKboxMembersProgram2nd = imbalance_random_sampling(KKboxMembersProgram2nd)

In [73]:
# Size and preview of the balanced member / detailed-program table
print(KKboxMembersProgram2nd.shape)
KKboxMembersProgram2nd.head()

(103378, 22)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,C_30days_month,...,F_90days_discount,G_100days_card,H_120days_season,I_halfyear,J_180days_discount,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days
121451,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0.0,7.0,2012-02-09,1,0,1,...,0,0,0,0,0,0,0,0,0,0
239969,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,0,1,...,0,0,0,0,0,0,0,0,0,0
28091,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,0,1,...,0,0,0,1,0,0,0,0,0,0
190564,++FojTWzKjhz7uxsZXEN3KBpxSmSvPr2j9KEuLzVAm4=,0,13.0,23.0,1.0,3.0,2013-02-10,1,0,1,...,0,0,0,0,0,0,0,0,0,0
143978,++GeSFVX+57LLP1FT1EQ5uUWgP03beR/Bzh52e6Rd3E=,0,14.0,25.0,1.0,9.0,2014-12-21,0,0,1,...,0,0,0,0,0,0,0,0,0,0


##### 套入其他交易特徵

In [74]:
# Disabled reload: the cells below reuse the KKboxTrainMemTransac frame loaded earlier.
# KKboxTrainMemTransac = pd.read_csv('KKboxTrainMemTransac.csv')
# KKboxTrainMemTransac = KKboxTrainMemTransac[['msno', 'last_last_churn', 'last_auto_renew', 'is_discount', 'amount_per_day', 'longtime_user', 'pay_method_41_ratio']]

In [75]:
# Add the transaction features to the balanced detailed-program table (left join on msno)
KKboxProgram2ndFinal = KKboxMembersProgram2nd.merge(KKboxTrainMemTransac, how='left', on=['msno'])

In [76]:
# Remove fully identical rows, keeping the first occurrence
KKboxProgram2ndFinal = KKboxProgram2ndFinal.drop_duplicates(keep='first')

In [77]:
# Size and preview of the final table for the detailed program view
print(KKboxProgram2ndFinal.shape)
KKboxProgram2ndFinal.head()

(103378, 27)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,C_30days_month,...,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
0,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0.0,7.0,2012-02-09,1,0,1,...,0,0,0,0,0,0.0,1.0,28.0,4.685,1.0
1,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,0,1,...,0,0,0,0,0,0.0,1.0,27.0,4.942,1.0
2,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,0,1,...,0,0,0,0,0,0.0,0.0,15.0,4.76,1.0
3,++FojTWzKjhz7uxsZXEN3KBpxSmSvPr2j9KEuLzVAm4=,0,13.0,23.0,1.0,3.0,2013-02-10,1,0,1,...,0,0,0,0,0,0.0,1.0,27.0,4.949,1.0
4,++GeSFVX+57LLP1FT1EQ5uUWgP03beR/Bzh52e6Rd3E=,0,14.0,25.0,1.0,9.0,2014-12-21,0,0,1,...,0,0,0,0,0,0.0,1.0,27.0,4.943,1.0


In [78]:
# Row counts per class after balancing: churned (1) first, then retained (0)
print(int((KKboxProgram2ndFinal['is_churn'] == 1).sum()))
print(int((KKboxProgram2ndFinal['is_churn'] == 0).sum()))

51378
52000


In [102]:
# Mark missing gender with -1 so those rows are not removed by a later dropna
KKboxProgram2ndFinal = KKboxProgram2ndFinal.fillna({'gender': -1})

In [103]:
# Drop every row that still contains a missing value in any column
KKboxProgram2ndFinal = KKboxProgram2ndFinal.dropna()

In [104]:
# Save as CSV (disabled). NOTE: a later cell reads 'KKboxProgram2ndFinal_balance.csv'
# from disk, so that file must already exist while this line stays commented out.
# KKboxProgram2ndFinal.to_csv('KKboxProgram2ndFinal_balance.csv',index=False,header=True)

---

## 從這邊開始看範例

---

## 切資料

### 以年齡26為分界分類

##### 讀資料

In [105]:
# Load the saved balanced dataset from disk; this replaces any in-memory
# KKboxProgram2ndFinal built by the cells above.
KKboxProgram2ndFinal = pd.read_csv('KKboxProgram2ndFinal_balance.csv')

In [106]:
# Preview the loaded dataset
KKboxProgram2ndFinal.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,C_30days_month,...,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
0,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0.0,7.0,2012-02-09,1,0,1,...,0,0,0,0,0,0.0,1.0,28.0,4.685,1.0
1,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,0,1,...,0,0,0,0,0,0.0,1.0,27.0,4.942,1.0
2,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,0,1,...,0,0,0,0,0,0.0,0.0,15.0,4.76,1.0
3,++FojTWzKjhz7uxsZXEN3KBpxSmSvPr2j9KEuLzVAm4=,0,13.0,23.0,1.0,3.0,2013-02-10,1,0,1,...,0,0,0,0,0,0.0,1.0,27.0,4.949,1.0
4,++GeSFVX+57LLP1FT1EQ5uUWgP03beR/Bzh52e6Rd3E=,0,14.0,25.0,1.0,9.0,2014-12-21,0,0,1,...,0,0,0,0,0,0.0,1.0,27.0,4.943,1.0


In [107]:
# Count missing values per column
KKboxProgram2ndFinal.isnull().sum()

msno                      0
is_churn                  0
city                      0
bd                        0
gender                    0
registered_via            0
registration_init_time    0
A_0days                   0
B_7days                   0
C_30days_month            0
D_30days_acount           0
E_90days                  0
F_90days_discount         0
G_100days_card            0
H_120days_season          0
I_halfyear                0
J_180days_discount        0
K_200days_card            0
L_360days_year            0
M_395days                 0
N_410days_discount        0
O_450days                 0
last_last_churn           0
last_auto_renew           0
is_discount               0
amount_per_day            0
longtime_user             0
dtype: int64

In [108]:
# List the distinct values taken by last_last_churn
KKboxProgram2ndFinal['last_last_churn'].unique()

array([ 0.,  1., -1.])

##### 分割

In [109]:
# Split the members at bd == 26 (to form other groups, filter on other columns instead)
df_Under26 = KKboxProgram2ndFinal.loc[KKboxProgram2ndFinal['bd'].lt(26)]
df_Above26 = KKboxProgram2ndFinal.loc[KKboxProgram2ndFinal['bd'].ge(26)]
# Tag each group with an identifying label as the first column
# (change the quoted label to any id you can recognise)
df_Above26.insert(loc=0, column='id', value='Group_Above26')
df_Under26.insert(loc=0, column='id', value='Group_Under26')
# Columns used for the per-group totals (no need to change these)
GroupProgram2ndCol = [
    'id',
    'is_churn',
    'A_0days',
    'B_7days',
    'C_30days_month',
    'D_30days_acount',
    'E_90days',
    'F_90days_discount',
    'G_100days_card',
    'H_120days_season',
    'I_halfyear',
    'J_180days_discount',
    'K_200days_card',
    'L_360days_year',
    'M_395days',
    'N_410days_discount',
    'O_450days',
]
# Sum the churn flag and every program flag within each group
Above26_2nd = df_Above26.loc[:, GroupProgram2ndCol].groupby(by=['id'], as_index=False).sum()
Under26_2nd = df_Under26.loc[:, GroupProgram2ndCol].groupby(by=['id'], as_index=False).sum()

##### 確認分群數量是否均衡?(差異太多直接重新分群)

In [110]:
# Number of members in each group (Above26 first, then Under26)
print(df_Above26.shape[0])
print(df_Under26.shape[0])

61522
41715


##### 確認各群流失/未流失比例?(如果要做關聯性與各群流失關係的話，最好在不同群的流失比例有顯著差異)

In [111]:
# Print the churned / retained share of each group:
# first group is Above26, second group is Under26
for group_header, group_frame in (
    ('第一群(Above26)流失/未流失比例:', df_Above26),
    ('第二群(Under26)流失/未流失比例:', df_Under26),
):
    group_size = len(group_frame)
    churned_rows = len(group_frame[group_frame['is_churn'] == 1])
    retained_rows = len(group_frame[group_frame['is_churn'] == 0])
    print(group_header)
    print('流失所佔比例:{:.2f}%'.format(100*(churned_rows / group_size)))
    print('未流失所佔比例:{:.2f}%'.format(100*(retained_rows / group_size)))

第一群(Above26)流失/未流失比例:
流失所佔比例:43.51%
未流失所佔比例:56.49%
第二群(Under26)流失/未流失比例:
流失所佔比例:58.83%
未流失所佔比例:41.17%


##### 分成不同群結果：

In [112]:
# Stack the two per-group totals into one table indexed by the group id
CutbyAge26_2nd = (
    pd.concat([Above26_2nd, Under26_2nd], axis=0, ignore_index=True)
    .set_index('id')
)
CutbyAge26_2nd.head()

Unnamed: 0_level_0,is_churn,A_0days,B_7days,C_30days_month,D_30days_acount,E_90days,F_90days_discount,G_100days_card,H_120days_season,I_halfyear,J_180days_discount,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Group_Above26,26766,26632,3612,44779,6890,568,492,456,218,3991,2959,167,264,234,5555,123
Group_Under26,24541,9887,5917,30739,3025,1395,586,1260,705,4433,423,394,1236,106,3266,21


##### 換算各方案佔不同群之比例(比例越大越好)

In [113]:
# Convert each group's totals into its share of the combined total and store the
# shares as new rows (one new row per group; two groups here).
# Pattern: CutbyAge26_2nd.loc['new row'] = CutbyAge26_2nd.loc['group row'] / (sum over groups)
group_total = CutbyAge26_2nd.loc['Group_Above26'] + CutbyAge26_2nd.loc['Group_Under26']
CutbyAge26_2nd.loc['Group_Above26_Ratio'] = CutbyAge26_2nd.loc['Group_Above26'] / group_total
CutbyAge26_2nd.loc['Group_Under26_Ratio'] = CutbyAge26_2nd.loc['Group_Under26'] / group_total
# Keep only the ratio rows
ratio_rows = ['Group_Above26_Ratio', 'Group_Under26_Ratio']
CutbyAge26_2nd = CutbyAge26_2nd.loc[ratio_rows]
CutbyAge26_2nd.head()

Unnamed: 0_level_0,is_churn,A_0days,B_7days,C_30days_month,D_30days_acount,E_90days,F_90days_discount,G_100days_card,H_120days_season,I_halfyear,J_180days_discount,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Group_Above26_Ratio,0.522,0.729,0.379,0.593,0.695,0.289,0.456,0.266,0.236,0.474,0.875,0.298,0.176,0.688,0.63,0.854
Group_Under26_Ratio,0.478,0.271,0.621,0.407,0.305,0.711,0.544,0.734,0.764,0.526,0.125,0.702,0.824,0.312,0.37,0.146


##### 製造rattle()用的csv

In [114]:
# Build the (id, Project) table for rattle: one row for every group/program pair
# whose ratio is at least 0.6. The first column (is_churn) is skipped.
# Rows are collected in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is quadratic.
rattle_records = [
    {'id': group_id, 'Project': program}
    for group_id in CutbyAge26_2nd.index
    for program in CutbyAge26_2nd.columns[1:]
    if CutbyAge26_2nd.loc[group_id, program] >= 0.6
]
df_rattle = pd.DataFrame(rattle_records, columns=['id', 'Project'])

In [115]:
# Show the group/program pairs selected for rattle
df_rattle

Unnamed: 0,id,Project
0,Group_Above26_Ratio,A_0days
1,Group_Above26_Ratio,D_30days_acount
2,Group_Above26_Ratio,J_180days_discount
3,Group_Above26_Ratio,M_395days
4,Group_Above26_Ratio,N_410days_discount
5,Group_Above26_Ratio,O_450days
6,Group_Under26_Ratio,B_7days
7,Group_Under26_Ratio,E_90days
8,Group_Under26_Ratio,G_100days_card
9,Group_Under26_Ratio,H_120days_season


In [116]:
# Save as a CSV that can be analysed in rattle (rename the file as needed)
df_rattle.to_csv('CutByAge26_2nd_forRattle.csv',index=False,header=True)

---

### 有故事的關聯分析分群存成csv，並分別拿來跑各種模型(決策樹、xgboost等等)看看

In [117]:
# Report the size of the Above26 group, then drop rows with any missing value.
# NOTE: the shape is printed before dropna; df_Under26 gets no matching dropna
# in the cells shown here.
print(df_Above26.shape)
df_Above26 = df_Above26.dropna()
df_Above26.head()

(61522, 28)


Unnamed: 0,id,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,...,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
0,Group_Above26,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0.0,7.0,2012-02-09,1,0,...,0,0,0,0,0,0.0,1.0,28.0,4.685,1.0
5,Group_Above26,++OO0dlx66sqqbXfJu9K4tX94zYLNwN/kq0cMOCVDyQ=,1,17.0,30.0,1.0,4.0,2016-10-25,0,0,...,0,0,0,0,0,0.0,0.0,4.0,6.0,0.0
6,Group_Above26,++Tp41swlnwrt2eLTL1cSAn97YgfRVh3uG9VHEmAySA=,1,13.0,36.0,1.0,3.0,2015-06-02,0,1,...,0,0,0,0,0,0.0,0.0,21.0,4.73,0.0
7,Group_Above26,++WdoXmpKEhkOr0TC5zhjcuuumbgo3FOm6pV6TjGckU=,0,22.0,28.0,0.0,9.0,2012-12-28,1,0,...,0,0,0,0,0,0.0,1.0,27.0,4.942,1.0
8,Group_Above26,++Xqq1LcF5dltTfVk8kDh7+DZp1dyzMBuU4tNtCT1p8=,0,5.0,30.0,1.0,4.0,2015-11-06,0,0,...,0,0,0,0,0,0.0,0.0,16.0,5.688,0.0


In [118]:
# Size and preview of the Under26 group
print(df_Under26.shape)
df_Under26.head()

(41715, 28)


Unnamed: 0,id,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,...,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
1,Group_Under26,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,0,...,0,0,0,0,0,0.0,1.0,27.0,4.942,1.0
2,Group_Under26,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,0,...,0,0,0,0,0,0.0,0.0,15.0,4.76,1.0
3,Group_Under26,++FojTWzKjhz7uxsZXEN3KBpxSmSvPr2j9KEuLzVAm4=,0,13.0,23.0,1.0,3.0,2013-02-10,1,0,...,0,0,0,0,0,0.0,1.0,27.0,4.949,1.0
4,Group_Under26,++GeSFVX+57LLP1FT1EQ5uUWgP03beR/Bzh52e6Rd3E=,0,14.0,25.0,1.0,9.0,2014-12-21,0,0,...,0,0,0,0,0,0.0,1.0,27.0,4.943,1.0
9,Group_Under26,++aTQuzrlR6QCrMNRtQWIGc/oM7oDNYk7uAzgMoAcms=,0,14.0,25.0,1.0,7.0,2016-02-02,0,0,...,0,0,0,0,0,0.0,1.0,16.0,3.306,0.0


In [119]:
# Save each group as its own CSV for the modelling step
df_Above26.to_csv('Cut_Above26_forModel.csv',index=False,header=True)
df_Under26.to_csv('Cut_Under26_forModel.csv',index=False,header=True)

---

##### 跑完rattle()模型的結果(左邊是26歲以上，右邊是26歲以下)

<table>
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Above26_Tree.png?raw=true">
</td> 
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Under26_Tree.png?raw=true">
</td> 
</table>
<caption><center> DecisionTree決策樹</center></caption>

<table>
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Above26_RandomForest_ROC.png?raw=true">
</td> 
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Under26_RandomForest_ROC.png?raw=true">
</td> 
</table>
<caption><center>隨機森林ROC</center></caption>

<table>
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Above26_RandomForest_FI.png?raw=true">
</td> 
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Under26_RandomForest_FI.png?raw=true">
</td> 
</table>
<caption><center>隨機森林特徵重要性</center></caption>

<table>
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Above26_cm.png?raw=true">
</td> 
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Under26_cm.png?raw=true">
</td> 
</table>
<caption><center>隨機森林混淆矩陣</center></caption>

<table>
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Above26_Xgboost_Error.png?raw=true">
</td> 
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Under26_Xgboost_Error.png?raw=true">
</td> 
</table>
<caption><center>Xgboost Errors</center></caption>

<table>
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Above26_Xgboost_FI.png?raw=true">
</td> 
<td> 
<img src="https://github.com/imiKao/KKBoxProject_for_TcfstCourse/blob/master/Images/CutByAge26/%E8%B7%91Model/Under26_Xgboost_FI.png?raw=true">
</td> 
</table>
<caption><center>Xgboost 特徵重要性</center></caption>

In [120]:
# Preview the Above26 group again for the feature comparison below
df_Above26.head()

Unnamed: 0,id,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,...,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
0,Group_Above26,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0.0,7.0,2012-02-09,1,0,...,0,0,0,0,0,0.0,1.0,28.0,4.685,1.0
5,Group_Above26,++OO0dlx66sqqbXfJu9K4tX94zYLNwN/kq0cMOCVDyQ=,1,17.0,30.0,1.0,4.0,2016-10-25,0,0,...,0,0,0,0,0,0.0,0.0,4.0,6.0,0.0
6,Group_Above26,++Tp41swlnwrt2eLTL1cSAn97YgfRVh3uG9VHEmAySA=,1,13.0,36.0,1.0,3.0,2015-06-02,0,1,...,0,0,0,0,0,0.0,0.0,21.0,4.73,0.0
7,Group_Above26,++WdoXmpKEhkOr0TC5zhjcuuumbgo3FOm6pV6TjGckU=,0,22.0,28.0,0.0,9.0,2012-12-28,1,0,...,0,0,0,0,0,0.0,1.0,27.0,4.942,1.0
8,Group_Above26,++Xqq1LcF5dltTfVk8kDh7+DZp1dyzMBuU4tNtCT1p8=,0,5.0,30.0,1.0,4.0,2015-11-06,0,0,...,0,0,0,0,0,0.0,0.0,16.0,5.688,0.0


In [121]:
# Preview the Under26 group again for the feature comparison below
df_Under26.head()

Unnamed: 0,id,msno,is_churn,city,bd,gender,registered_via,registration_init_time,A_0days,B_7days,...,K_200days_card,L_360days_year,M_395days,N_410days_discount,O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user
1,Group_Under26,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1.0,9.0,2014-12-10,1,0,...,0,0,0,0,0,0.0,1.0,27.0,4.942,1.0
2,Group_Under26,++FM6zdGj2whewSf10HHTRTeg2/jnxq3NQJKeYmAKwY=,1,3.0,22.0,1.0,9.0,2015-01-21,1,0,...,0,0,0,0,0,0.0,0.0,15.0,4.76,1.0
3,Group_Under26,++FojTWzKjhz7uxsZXEN3KBpxSmSvPr2j9KEuLzVAm4=,0,13.0,23.0,1.0,3.0,2013-02-10,1,0,...,0,0,0,0,0,0.0,1.0,27.0,4.949,1.0
4,Group_Under26,++GeSFVX+57LLP1FT1EQ5uUWgP03beR/Bzh52e6Rd3E=,0,14.0,25.0,1.0,9.0,2014-12-21,0,0,...,0,0,0,0,0,0.0,1.0,27.0,4.943,1.0
9,Group_Under26,++aTQuzrlR6QCrMNRtQWIGc/oM7oDNYk7uAzgMoAcms=,0,14.0,25.0,1.0,7.0,2016-02-02,0,0,...,0,0,0,0,0,0.0,1.0,16.0,3.306,0.0


In [148]:
# Mean of is_discount within the Under26 group
df_Under26['is_discount'].mean()

12.966942346877621

In [147]:
# Share of Under26 members whose last_last_churn equals 0
len(df_Under26[df_Under26['last_last_churn'] == 0]) / len(df_Under26)

0.9076591154261057

##### 分割資料後各自特徵比較
* last_last_churn(前次至今一個月內是否續訂)
> * 26歲以上:前次續訂達94.7%(X)
> * 26歲以下:前次續訂達90.8%(X)
* last_auto_renew(前次是否自動續訂)
> * 26歲以上:前次自動續訂達68.8%
> * 26歲以下:前次自動續訂僅36.2%
* is_discount(擁有折扣次數):取平均
> * 26歲以上:平均使用折扣17.1次(X)
> * 26歲以下:平均使用折扣12.9次(X)
* amount_per_day(平均每日消費)
> * 26歲以上:平均每日方案消費4.73元
> * 26歲以下:平均每日方案消費4.65元
> * 26歲以上與以下的族群使用方案平均每日消費相當
* longtime_user(一年以上用戶)
> * 26歲以上:長期用戶佔49.2%(X)
> * 26歲以下:長期用戶佔30.9%(X)
* registered_via(註冊裝置)
> * 26歲以上:使用裝置9的比例57.1%，使用裝置3的比例19.3%
> * 26歲以下:使用裝置9的比例38.0%，使用裝置3的比例34.9%
> * 推估註冊裝置9對於26歲以上的人使用比較方便；使用裝置3對於26歲以下的人比較方便

* 方案關聯分析：
> * 26歲以上:0天,180天信用卡方案,395天方案,365天限時加送方案
> * 26歲以下:90天方案,200天實體儲值卡,年費方案
* 流失:
> * 26歲以上:流失率43.5%
> * 26歲以下:流失率58.8%

* 關聯分析針對年齡

##### 關聯分析各種方案說明

|方案|方案說明|
|-|-|
|Program_A_0days|	0天，目前尚未確認是什麼方案，但筆數眾多，因此沒有刪掉|
|Program_B_7days|	短期體驗方案|
|Program_C_30days_month	|月租方案|
|Program_D_30days_acount|月租優惠方案|
|Program_E_90days|	90天方案|
|Program_F_90days_discount	|90天優惠方案|
|Program_G_100days_card	|100天實體儲值卡|
|Program_H_120days_season	|120天季卡|
|Program_I_halfyear	|半年方案|
|Program_J_180days_discount	|180天信用卡方案|
|Program_K_200days_card	|200天實體儲值卡|
|Program_L_360days_year	|年費方案|
|Program_M_395days	|395天方案|
|Program_N_410days_discount	|365天限時加送方案|
|Program_O_450days|450天方案|

---

* 根據美國諮詢公司高德納的數據，20%的留存用戶將貢獻公司未來收入的80%。同時，將產品成功銷售給留存用戶的幾率是60-70%，而成功銷售給新用戶的幾率只有5-20%。
* 網絡公司最不喜歡的就是用戶流失。因為獲取新用戶的成本很高，而參與度高的用戶是企業製造收入的活血，所以絞盡腦汁，最好要留住熟客。
* 通過對以往流失用戶的行為數據進行分析，我們可以總結出一些流失用戶共有的行為，譬如他們流失之前的那段時間不像以往那樣活躍，流失之前向我們提出了一些問題但沒有得到我們的反饋等等。
* 如果我們通過「免費」和「便宜」這樣的字眼來吸引新用戶，我們獲取的新用戶可能根本不是我們的目標用戶。這些收集免費贈品的用戶是最有可能流失的群體。我們的目標用戶應該是重視我們產品長期價值的用戶，而非那些貪小便宜的用戶。