In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
def dtype_compressor(df):
    """Return a copy of ``df`` whose columns use more compact dtypes.

    * ``object`` columns are converted to ``category``. This saves memory
      when a column has few distinct values relative to its length; for
      mostly-unique columns ``category`` may not be smaller than ``object``.
    * Numeric columns go through ``pd.to_numeric(..., downcast='signed')``,
      which shrinks them to the smallest signed integer dtype that can hold
      the values and leaves them unchanged when that is not possible.
    * Every other column (datetime, bool, ...) is kept untouched, and the
      original column order is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and the same columns as ``df``.
    """
    # Start from a copy so columns of other dtypes are not silently lost.
    compressed = df.copy()

    for col in df.select_dtypes(include=['object']).columns:
        compressed[col] = df[col].astype('category')

    for col in df.select_dtypes(include=['number']).columns:
        compressed[col] = pd.to_numeric(df[col], downcast='signed')

    return compressed

In [3]:
def fast_datetime(s, date_format='%Y%m%d'):
    """Parse a Series of date values, converting each distinct value only once.

    Each unique value of ``s`` is parsed with ``pd.to_datetime`` and the
    results are mapped back onto the Series, so repeated values do not pay
    the parsing cost again.

    Parameters
    ----------
    s : pandas.Series
        Raw date values.
    date_format : str, default '%Y%m%d'
        ``strftime``-style format passed to ``pd.to_datetime``.

    Returns
    -------
    pandas.Series
        Parsed dates, aligned with the index of ``s``.
    """
    parsed_by_value = {
        raw: pd.to_datetime(raw, format=date_format) for raw in s.unique()
    }
    return s.map(parsed_by_value)

In [4]:
# Show floats with three decimal places in every DataFrame / Series display.
pd.options.display.float_format = lambda value: '%.3f' % value

---

In [5]:
# Load the combined train + members table.
KKboxTrainMembers = pd.read_csv('KKboxTrainMembers.csv')

In [6]:
# Keep only members whose `bd` (used as age in the splits below) is above 10.
# Rows with a missing `bd` fail the comparison and are dropped as well.
KKboxTrainMembers = KKboxTrainMembers.loc[KKboxTrainMembers['bd'].gt(10)]

---

## 資料探勘—關聯性分析

### 根據交易資料取得不同方案做關聯性分析-依合約短中長期製作欄位

In [7]:
# Load the two program-transaction exports.
ProgramTransac_v1 = pd.read_csv('ProgramTransaction_v1.csv')
ProgramTransac_v2 = pd.read_csv('ProgramTransaction_v2.csv')
# Stack both exports, then collapse to one row per msno by summing every
# program column.
ProgramTransac = (
    pd.concat([ProgramTransac_v1, ProgramTransac_v2], axis=0)
    .loc[:, ['msno', 'ProgramA_Short', 'ProgramB_Month', 'ProgramC_Mid',
             'ProgramD_Season', 'ProgramE_Long']]
    .groupby('msno', as_index=False)
    .sum()
)

In [8]:
# Turn the summed counts into flags: 1 if the member ever used the program
# (count > 0), otherwise 0.
ProgramTransac_Col = list(ProgramTransac.columns[1:])
# A single vectorized comparison replaces a per-element Python lambda applied
# to every column; the result is the same 0/1 int64 values.
ProgramTransac[ProgramTransac_Col] = (ProgramTransac[ProgramTransac_Col] > 0).astype('int64')

In [9]:
print(ProgramTransac.shape)
ProgramTransac.head()

(2426143, 6)


Unnamed: 0,msno,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,1,0,0,0,0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,0,0,0,1
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,1,0,0,0
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1,1,0,0,0
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,1,1,0,0,0


In [10]:
# 存成csv
# ProgramTransac.to_csv('ProgramTransac.csv',index=False,header=True)

In [11]:
# Left-join the per-member program flags onto the member table on msno, so
# every row of KKboxTrainMembers is kept (unmatched members get NaN flags).
KKboxMembersProgram = KKboxTrainMembers.merge(ProgramTransac, how='left', on='msno')

##### 針對流失非流失進行隨機抽樣使得數據不平衡現象達到解決

In [12]:
def imbalance_random_sampling(df, n_notchurn=86000, random_state=None):
    """Balance the churn classes by randomly down-sampling non-churn rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``is_churn`` (1 = churned, 0 = not churned),
        ``msno`` and ``registration_init_time``.
    n_notchurn : int, default 86000
        Number of non-churn rows to keep. The default was chosen to be close
        to the number of churn rows in this data set. When fewer non-churn
        rows are available, all of them are kept instead of raising.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. Pass a fixed value to get the
        same sample on every run; ``None`` keeps the sample random.

    Returns
    -------
    pandas.DataFrame
        All churn rows plus the sampled non-churn rows, sorted by ``msno``
        and then ``registration_init_time``. Original index labels are kept.
    """
    df_churn = df[df['is_churn'] == 1]
    df_notchurn = df[df['is_churn'] == 0]
    # Never ask for more rows than exist; sample() would raise otherwise.
    n_keep = min(n_notchurn, len(df_notchurn))
    df_notchurn_sampled = df_notchurn.sample(n=n_keep, random_state=random_state)
    df_after = pd.concat([df_churn, df_notchurn_sampled], axis=0)
    return df_after.sort_values(by=['msno', 'registration_init_time'])

In [13]:
# Rebalance churn / non-churn rows.
# NOTE(review): this overwrites KKboxMembersProgram with its own sample, so
# re-running the cell samples from the already-sampled frame; re-run the merge
# cell first to start again from the full data.
KKboxMembersProgram = imbalance_random_sampling(KKboxMembersProgram)

In [14]:
print(KKboxMembersProgram.shape)
KKboxMembersProgram.head()

(171711, 12)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long
263222,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,15.0,26.0,1,9.0,2010-11-18,1,1,0,0,0
710049,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,15.0,26.0,1,9.0,2010-11-18,1,1,0,0,0
575862,++95tJZADNg8U8HKbYdxbbXIRsO6pw1zBK4tHI7BtZo=,0,14.0,35.0,0,3.0,2012-06-03,1,1,0,0,0
628694,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1,9.0,2014-12-10,1,1,0,0,0
482251,++A8p4GrsTnMjI6hAZEtlRsaz6s6O9ddUoH0fmS4s7s=,0,5.0,43.0,0,9.0,2014-11-18,0,1,0,0,0


##### 套入其他交易特徵

In [19]:
# Load the table of additional per-member transaction features.
KKboxTrainMemTransac = pd.read_csv('KKboxTrainMemTransac.csv')

In [23]:
# Keep only the member key and the transaction features that are joined onto
# the program table.
KKboxTrainMemTransac = KKboxTrainMemTransac.loc[
    :,
    ['msno', 'last_last_churn', 'last_auto_renew', 'is_discount',
     'amount_per_day', 'longtime_user', 'pay_method_41_ratio'],
]

In [31]:
# Left-join the transaction features onto the sampled member/program table.
# NOTE(review): the index gaps visible after the later drop_duplicates suggest
# msno is not unique on at least one side, so this join multiplies rows — confirm.
KKboxProgramFinal = KKboxMembersProgram.merge(KKboxTrainMemTransac, how='left', on='msno')

In [32]:
# Remove rows that are identical across all columns, keeping the first copy.
KKboxProgramFinal = KKboxProgramFinal.drop_duplicates(keep='first')

In [33]:
print(KKboxProgramFinal.shape)
KKboxProgramFinal.head()

(162638, 18)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user,pay_method_41_ratio
0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,15.0,26.0,1,9.0,2010-11-18,1,1,0,0,0,0.0,1.0,21.0,4.943,1.0,0.0
4,++95tJZADNg8U8HKbYdxbbXIRsO6pw1zBK4tHI7BtZo=,0,14.0,35.0,0,3.0,2012-06-03,1,1,0,0,0,0.0,1.0,22.0,4.941,1.0,0.0
6,++9JUsaZioofS6Fb20Z0z2QOlWNzakO0PRF5GZ75yi4=,1,13.0,22.0,1,9.0,2014-12-10,1,1,0,0,0,0.0,1.0,27.0,4.942,1.0,0.0
8,++A8p4GrsTnMjI6hAZEtlRsaz6s6O9ddUoH0fmS4s7s=,0,5.0,43.0,0,9.0,2014-11-18,0,1,0,0,0,0.0,1.0,24.0,4.442,0.0,0.0
10,++DcyRE+ZfLtlKGigvv9dv5EQ4KAhHhiN9LM9X+F1vw=,1,4.0,37.0,1,9.0,2011-03-22,1,1,0,0,0,0.0,1.0,26.0,4.947,1.0,0.0


In [35]:
# 存成csv
# KKboxProgramFinal.to_csv('KKboxProgramFinal_balance.csv',index=False,header=True)

---

## 切資料

##### 以年齡26為分界分類

In [30]:
# Split the members at age 26: `bd` below 26 versus 26 and above.
KKboxMembersProgram_Under26 = KKboxMembersProgram.loc[KKboxMembersProgram['bd'].lt(26)]
KKboxMembersProgram_Above26 = KKboxMembersProgram.loc[KKboxMembersProgram['bd'].ge(26)]

In [31]:
# Tag every row with its age-group label in a leading 'id' column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if 'id' not in KKboxMembersProgram_Above26.columns:
    KKboxMembersProgram_Above26.insert(0, 'id', 'Above26')
if 'id' not in KKboxMembersProgram_Under26.columns:
    KKboxMembersProgram_Under26.insert(0, 'id', 'Under26')

In [32]:
# Total usage of each program within a group. The 'id' column is constant per
# frame, so grouping on it yields a single summed row for that group.
GroupProgramCol = ['id', 'ProgramA_Short', 'ProgramB_Month', 'ProgramC_Mid',
                   'ProgramD_Season', 'ProgramE_Long']
Above26 = KKboxMembersProgram_Above26.loc[:, GroupProgramCol].groupby('id', as_index=False).sum()
Under26 = KKboxMembersProgram_Under26.loc[:, GroupProgramCol].groupby('id', as_index=False).sum()

In [33]:
Above26.head()

Unnamed: 0,id,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long
0,Above26,53968,90866,437,3348,19968


In [34]:
Under26.head()

Unnamed: 0,id,ProgramA_Short,ProgramB_Month,ProgramC_Mid,ProgramD_Season,ProgramE_Long
0,Under26,28202,62785,2473,9149,14593


In [37]:
# 存成csv
# KKboxMembersProgram_Above26.to_csv('KKboxMembersProgram_Above26', index=False, header=True)
# KKboxMembersProgram_Under26.to_csv('KKboxMembersProgram_Under26', index=False, header=True)

---

---

### 根據交易資料取得不同方案做關聯性分析-依詳細方案(含價格)製作欄位

In [36]:
# Load the two detailed program-transaction exports.
ProgramTransac2nd_v1 = pd.read_csv('ProgramTransaction2nd_v1.csv')
ProgramTransac2nd_v2 = pd.read_csv('ProgramTransaction2nd_v2.csv')
# Stack both exports, then collapse to one row per msno by summing every
# program column.
ProgramTransac2nd = (
    pd.concat([ProgramTransac2nd_v1, ProgramTransac2nd_v2], axis=0)
    .loc[:, ['msno',
             'Program_A_0days', 'Program_B_7days', 'Program_C_30days_month',
             'Program_D_30days_acount', 'Program_E_90days',
             'Program_F_90days_discount', 'Program_G_100days_card',
             'Program_H_120days_season', 'Program_I_halfyear',
             'Program_J_180days_discount', 'Program_K_200days_card',
             'Program_L_360days_year', 'Program_M_395days',
             'Program_N_410days_discount', 'Program_O_450days']]
    .groupby('msno', as_index=False)
    .sum()
)

In [38]:
# Turn the summed counts into flags: 1 if the member ever used the program
# (count > 0), otherwise 0.
ProgramTransac2nd_Col = list(ProgramTransac2nd.columns[1:])
# A single vectorized comparison replaces a per-element Python lambda applied
# to every column; the result is the same 0/1 int64 values.
ProgramTransac2nd[ProgramTransac2nd_Col] = (ProgramTransac2nd[ProgramTransac2nd_Col] > 0).astype('int64')

In [41]:
print(ProgramTransac2nd.shape)
ProgramTransac2nd.head()

(2426143, 16)


Unnamed: 0,msno,Program_A_0days,Program_B_7days,Program_C_30days_month,Program_D_30days_acount,Program_E_90days,Program_F_90days_discount,Program_G_100days_card,Program_H_120days_season,Program_I_halfyear,Program_J_180days_discount,Program_K_200days_card,Program_L_360days_year,Program_M_395days,Program_N_410days_discount,Program_O_450days
0,+++FOrTS7ab3tIgIh8eWwX4FqRv8w/FoiOuyXsFvphY=,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# Left-join the detailed program flags onto the member table on msno, so every
# row of KKboxTrainMembers is kept (unmatched members get NaN flags).
KKboxMembersProgram2nd = KKboxTrainMembers.merge(ProgramTransac2nd, how='left', on='msno')

##### 針對流失非流失進行隨機抽樣使得數據不平衡現象達到解決

In [43]:
# Rebalance churn / non-churn rows.
# NOTE(review): this overwrites KKboxMembersProgram2nd with its own sample, so
# re-running the cell samples from the already-sampled frame; re-run the merge
# cell first to start again from the full data.
KKboxMembersProgram2nd = imbalance_random_sampling(KKboxMembersProgram2nd)

In [44]:
print(KKboxMembersProgram2nd.shape)
KKboxMembersProgram2nd.head()

(171711, 22)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,Program_A_0days,Program_B_7days,Program_C_30days_month,...,Program_F_90days_discount,Program_G_100days_card,Program_H_120days_season,Program_I_halfyear,Program_J_180days_discount,Program_K_200days_card,Program_L_360days_year,Program_M_395days,Program_N_410days_discount,Program_O_450days
710049,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,15.0,26.0,1,9.0,2010-11-18,1,0,1,...,0,0,0,0,0,0,0,0,0,0
562154,++0nOC7BmrUTtcSboRORfg6ZXTajnBDt1f/SEgH6ONo=,0,13.0,25.0,1,9.0,2010-02-03,0,0,1,...,0,0,0,0,1,0,0,0,0,0
350891,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0,7.0,2012-02-09,1,0,1,...,0,0,0,0,0,0,0,0,0,0
719664,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,0,13.0,39.0,1,9.0,2006-02-18,1,0,1,...,0,0,0,0,0,0,0,0,0,0
85139,++95tJZADNg8U8HKbYdxbbXIRsO6pw1zBK4tHI7BtZo=,0,14.0,35.0,0,3.0,2012-06-03,1,0,1,...,0,0,0,0,0,0,0,0,0,0


##### 套入其他交易特徵

In [47]:
# KKboxTrainMemTransac = pd.read_csv('KKboxTrainMemTransac.csv')
# KKboxTrainMemTransac = KKboxTrainMemTransac[['msno', 'last_last_churn', 'last_auto_renew', 'is_discount', 'amount_per_day', 'longtime_user', 'pay_method_41_ratio']]

In [48]:
# Left-join the transaction features onto the sampled member/program table.
# NOTE(review): the index gaps visible after the later drop_duplicates suggest
# msno is not unique on at least one side, so this join multiplies rows — confirm.
KKboxProgram2ndFinal = KKboxMembersProgram2nd.merge(KKboxTrainMemTransac, how='left', on='msno')

In [50]:
# Remove rows that are identical across all columns, keeping the first copy.
KKboxProgram2ndFinal = KKboxProgram2ndFinal.drop_duplicates(keep='first')

In [51]:
print(KKboxProgram2ndFinal.shape)
KKboxProgram2ndFinal.head()

(162616, 28)


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,Program_A_0days,Program_B_7days,Program_C_30days_month,...,Program_L_360days_year,Program_M_395days,Program_N_410days_discount,Program_O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user,pay_method_41_ratio
0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,15.0,26.0,1,9.0,2010-11-18,1,0,1,...,0,0,0,0,0.0,1.0,21.0,4.943,1.0,0.0
2,++0nOC7BmrUTtcSboRORfg6ZXTajnBDt1f/SEgH6ONo=,0,13.0,25.0,1,9.0,2010-02-03,0,0,1,...,0,0,0,0,0.0,0.0,8.0,4.263,1.0,0.0
3,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0,7.0,2012-02-09,1,0,1,...,0,0,0,0,0.0,1.0,28.0,4.685,1.0,1.0
5,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,0,13.0,39.0,1,9.0,2006-02-18,1,0,1,...,0,0,0,0,0.0,1.0,22.0,4.943,1.0,0.0
7,++95tJZADNg8U8HKbYdxbbXIRsO6pw1zBK4tHI7BtZo=,0,14.0,35.0,0,3.0,2012-06-03,1,0,1,...,0,0,0,0,0.0,1.0,22.0,4.941,1.0,0.0


In [53]:
# 存成csv
# KKboxProgram2ndFinal.to_csv('KKboxProgram2ndFinal_balance.csv',index=False,header=True)

---

## 切資料

### 以年齡26為分界分類

##### 讀資料

In [193]:
# Reload the (presumably balanced and de-duplicated) table from disk.
# NOTE(review): the to_csv call that would write this file is commented out
# earlier in the notebook, so on a fresh run the file must already exist.
# Loading it also replaces the in-memory KKboxProgram2ndFinal built above —
# confirm which of the two is intended.
KKboxProgram2ndFinal = pd.read_csv('KKboxProgram2ndFinal_balance.csv')

In [194]:
KKboxProgram2ndFinal.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,Program_A_0days,Program_B_7days,Program_C_30days_month,...,Program_L_360days_year,Program_M_395days,Program_N_410days_discount,Program_O_450days,last_last_churn,last_auto_renew,is_discount,amount_per_day,longtime_user,pay_method_41_ratio
0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,15.0,26.0,1,9.0,2010-11-18,1,0,1,...,0,0,0,0,0.0,1.0,21.0,4.943,1.0,0.0
1,++0nOC7BmrUTtcSboRORfg6ZXTajnBDt1f/SEgH6ONo=,0,13.0,25.0,1,9.0,2010-02-03,0,0,1,...,0,0,0,0,0.0,0.0,8.0,4.263,1.0,0.0
2,++3Z+W8OPnpbHYfrKwqRKN1bF83XEbxjdYUolhGdHZg=,0,5.0,35.0,0,7.0,2012-02-09,1,0,1,...,0,0,0,0,0.0,1.0,28.0,4.685,1.0,1.0
3,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,0,13.0,39.0,1,9.0,2006-02-18,1,0,1,...,0,0,0,0,0.0,1.0,22.0,4.943,1.0,0.0
4,++95tJZADNg8U8HKbYdxbbXIRsO6pw1zBK4tHI7BtZo=,0,14.0,35.0,0,3.0,2012-06-03,1,0,1,...,0,0,0,0,0.0,1.0,22.0,4.941,1.0,0.0


##### 分割

In [195]:
# Split the members at age 26 using `bd`.
df_Under26 = KKboxProgram2ndFinal.loc[KKboxProgram2ndFinal['bd'].lt(26)]
df_Above26 = KKboxProgram2ndFinal.loc[KKboxProgram2ndFinal['bd'].ge(26)]
# Label each group in a leading 'id' column (the label text is free to choose).
df_Above26.insert(0, 'id', 'Group_Above26')
df_Under26.insert(0, 'id', 'Group_Under26')
# Columns to aggregate: the group label, the churn flag and the 15 program
# flags (no need to change this list).
GroupProgram2ndCol = [
    'id', 'is_churn',
    'Program_A_0days', 'Program_B_7days', 'Program_C_30days_month',
    'Program_D_30days_acount', 'Program_E_90days',
    'Program_F_90days_discount', 'Program_G_100days_card',
    'Program_H_120days_season', 'Program_I_halfyear',
    'Program_J_180days_discount', 'Program_K_200days_card',
    'Program_L_360days_year', 'Program_M_395days',
    'Program_N_410days_discount', 'Program_O_450days',
]
# Sum every column within each group; each result is a single row.
Above26_2nd = df_Above26.loc[:, GroupProgram2ndCol].groupby('id', as_index=False).sum()
Under26_2nd = df_Under26.loc[:, GroupProgram2ndCol].groupby('id', as_index=False).sum()

##### 確認分群數量是否均衡?(差異太多直接重新分群)

In [196]:
print(len(df_Above26))
print(len(df_Under26))

96036
66580


##### 確認各群流失/未流失比例?(如果要做關聯性與各群流失關係的話，最好在不同群的流失比例有顯著差異)

In [197]:
# Print the churned / not-churned share of each age group: row counts divided
# by the group's size, shown as percentages with two decimals.
for group_title, group_rows in (
    ('第一群(Above26)流失/未流失比例:', df_Above26),
    ('第二群(Under26)流失/未流失比例:', df_Under26),
):
    group_size = len(group_rows)
    churned = len(group_rows[group_rows['is_churn'] == 1])
    retained = len(group_rows[group_rows['is_churn'] == 0])
    print(group_title)
    print('流失所佔比例:{:.2f}%'.format(100*(churned / group_size)))
    print('未流失所佔比例:{:.2f}%'.format(100*(retained / group_size)))

第一群(Above26)流失/未流失比例:
流失所佔比例:43.52%
未流失所佔比例:56.48%
第二群(Under26)流失/未流失比例:
流失所佔比例:59.49%
未流失所佔比例:40.51%


##### 分成不同群結果：

In [198]:
# Stack the two per-group total rows and index the result by group label.
CutbyAge26_2nd = (
    pd.concat([Above26_2nd, Under26_2nd], axis=0)
    .reset_index(drop=True)
    .set_index('id')
)
CutbyAge26_2nd.head()

Unnamed: 0_level_0,is_churn,Program_A_0days,Program_B_7days,Program_C_30days_month,Program_D_30days_acount,Program_E_90days,Program_F_90days_discount,Program_G_100days_card,Program_H_120days_season,Program_I_halfyear,Program_J_180days_discount,Program_K_200days_card,Program_L_360days_year,Program_M_395days,Program_N_410days_discount,Program_O_450days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Group_Above26,41799,40623,6239,69281,10176,957,755,726,711,6310,4343,263,553,396,8692,207
Group_Under26,39608,15384,10398,49541,4579,2260,887,1955,2023,6815,605,598,1740,159,4981,36


##### 換算比例

In [199]:
# Share of each column's total contributed by each group:
#   ratio[group, column] = count[group, column] / (sum of that column over all groups)
# The counts are rebuilt from the per-group totals instead of being read from
# (and then overwritten in) CutbyAge26_2nd, so this cell can be re-run without
# a KeyError. It also works for any number of groups: add further per-group
# frames to the list below.
group_counts = pd.concat([Above26_2nd, Under26_2nd], axis=0).set_index('id')
CutbyAge26_2nd = (
    group_counts
    .div(group_counts.sum(axis=0), axis=1)
    # Row labels become '<group>_Ratio', e.g. 'Group_Above26_Ratio'.
    .rename(index=lambda label: label + '_Ratio')
)
# Only the ratio rows remain in the result.
CutbyAge26_2nd.head()

Unnamed: 0_level_0,is_churn,Program_A_0days,Program_B_7days,Program_C_30days_month,Program_D_30days_acount,Program_E_90days,Program_F_90days_discount,Program_G_100days_card,Program_H_120days_season,Program_I_halfyear,Program_J_180days_discount,Program_K_200days_card,Program_L_360days_year,Program_M_395days,Program_N_410days_discount,Program_O_450days
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Group_Above26_Ratio,0.513,0.725,0.375,0.583,0.69,0.297,0.46,0.271,0.26,0.481,0.878,0.305,0.241,0.714,0.636,0.852
Group_Under26_Ratio,0.487,0.275,0.625,0.417,0.31,0.703,0.54,0.729,0.74,0.519,0.122,0.695,0.759,0.286,0.364,0.148


##### 製造rattle()用的csv

In [200]:
# Build the (group, program) pairs for Rattle: one row for every program whose
# share within a group is at least RATTLE_MIN_RATIO.
# Rows are collected in a list and turned into a frame once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
RATTLE_MIN_RATIO = 0.7
rattle_rows = []
for group in CutbyAge26_2nd.index:
    # columns[1:] skips the first column (is_churn), which is not a program.
    for program in CutbyAge26_2nd.columns[1:]:
        if CutbyAge26_2nd.loc[group, program] >= RATTLE_MIN_RATIO:
            rattle_rows.append({'id': group, 'Project': program})
df_rattle = pd.DataFrame(rattle_rows, columns=['id', 'Project'])

In [201]:
df_rattle

Unnamed: 0,id,Project
0,Group_Above26_Ratio,Program_A_0days
1,Group_Above26_Ratio,Program_J_180days_discount
2,Group_Above26_Ratio,Program_M_395days
3,Group_Above26_Ratio,Program_O_450days
4,Group_Under26_Ratio,Program_E_90days
5,Group_Under26_Ratio,Program_G_100days_card
6,Group_Under26_Ratio,Program_H_120days_season
7,Group_Under26_Ratio,Program_L_360days_year


In [192]:
# Save the pairs as a CSV that can be loaded into Rattle (rename the file as needed).
df_rattle.to_csv('CutByAge26_forRattle.csv',index=False,header=True)

* 小結：Rattle跑出的關聯規則


   | lhs      |                       rhs        |                  support| confidence| lift| count|
   |-|-|-|-|-|-|
  |{Program_J_180days_discount} | {Program_M_395days}      |        0.5     |     1  |  2   |  1|
 |{Program_M_395days}     |     {Program_J_180days_discount}|     0.5   |       1|    2 |    1|
 |{Program_E_90days}  |          {Program_H_120days_season} |      0.5     |     1  |  2 |    1|
 |{Program_H_120days_season}   |{Program_E_90days}    |           0.5   |       1  |  2  |  1|
 | {Program_E_90days}           | {Program_L_360days_year}    |     0.5       |   1   | 2  |   1|
 | {Program_L_360days_year}    | {Program_E_90days}           |    0.5       |   1 |   2    | 1|
 |{Program_H_120days_season}   | {Program_L_360days_year}    |     0.5     |     1   | 2   |  1|
 |{Program_L_360days_year}     | {Program_H_120days_season}   |    0.5    |      1   | 2  |   1|
 |{Program_E_90days,Program_H_120days_season}  |{Program_L_360days_year}    |     0.5       |   1  |  2  |   1|
 |{Program_E_90days,Program_L_360days_year}     | {Program_H_120days_season}   |    0.5       |   1 |   2 |    1|
 |{Program_H_120days_season,Program_L_360days_year}    |{Program_E_90days}            |   0.5       |   1   | 2  |   1|


---