In [1]:
import os
import numpy as np
import pandas as pd
from time import time
from tqdm.notebook import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Directory that holds the 50k-customer sample extracts.
sample_path = './data/sample_50k'

# Input file names; each is joined onto sample_path when it is loaded.
(
    chid_file,        # list of sampled customer ids
    chid_dict_file,   # pickled mapping loaded with np.load(...).item()
    cdtx_file,        # transaction history
    cust_f_file,      # customer features
) = (
    'sample_chid.txt',
    'sample_idx_map.npy',
    'sample_zip_if_cca_cdtx0001_hist.csv',
    'sample_zip_if_cca_cust_f.csv',
)
#y_file = 'sample_zip_if_cca_y.csv'

In [3]:
# Sampled customer ids. `np.str` was only an alias of the builtin `str` and has
# been removed from NumPy, so the builtin is used directly.
chid_array = np.loadtxt(os.path.join(sample_path, chid_file), dtype=str)

# Mapping applied to the `chid` columns further down.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load trusted files.
chid_dict = np.load(os.path.join(sample_path, chid_dict_file), allow_pickle=True).item()

df_cdtx = pd.read_csv(os.path.join(sample_path, cdtx_file))  # transaction history
df_cust_f = pd.read_csv(os.path.join(sample_path, cust_f_file))  # user feature
df_cust_f = df_cust_f.drop_duplicates(ignore_index=True)

# Prediction target. The print below and the later cells read df_y (its chid and
# data_dt columns), so it has to be loaded here for a fresh-kernel run to work.
df_y = pd.read_csv(os.path.join(sample_path, 'sample_zip_if_cca_y.csv'))

print(chid_array.shape, len(chid_dict), df_cdtx.shape, df_cust_f.shape, df_y.shape)

(50000,) 50000 (6654938, 10) (1176172, 32) (2889085, 4)


In [4]:
# Restrict every table to the sampled customers; .copy() detaches the result
# from the full frame so later column assignments do not touch a view.
df_cdtx, df_cust_f, df_y = (
    frame.loc[frame['chid'].isin(chid_array)].copy()
    for frame in (df_cdtx, df_cust_f, df_y)
)

In [5]:
# Replace the raw customer id with its value from chid_dict in all three tables.
for frame in (df_cdtx, df_cust_f, df_y):
    frame['chid'] = frame['chid'].map(chid_dict)

# Number of distinct customers remaining in each table.
print(*(len(frame['chid'].unique()) for frame in (df_cdtx, df_cust_f, df_y)))

# Customers with the fewest feature rows come first.
df_cust_f.groupby('chid').count().sort_values(by='data_ym').head()

50000 50000 50000


Unnamed: 0_level_0,data_ym,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
chid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15475,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49572,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
23964,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49830,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
41574,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11


In [6]:
# Replace the last three characters of the transaction date with '-01'
# (e.g. '2018-01-15' -> '2018-01-01') so transactions can be grouped by month.
# The vectorised .str accessor avoids one Python lambda call per transaction row.
df_cdtx['month'] = df_cdtx['csmdt'].str[:-3] + '-01'
df_cdtx.head(2)

Unnamed: 0,bnsfg,bnspt,chid,csmdt,iterm,mcc,objam,scity,tcode,hcefg,month
0,N,0,8054,2018-01-01,0,5411,151,TAOYUAN,5,,2018-01-01
1,N,0,8054,2018-01-01,0,5411,146,TAOYUAN,5,,2018-01-01


In [7]:
## Build the full (chid, month) grid for the months after the first 12

list_chid = sorted(df_y.chid.unique())
list_month = sorted(df_y.data_dt.unique())[12:]  # skip the first 12 distinct months

# One row per (customer, month): customers ascending, months ascending within
# each customer.
grid_chid = [chid for chid in list_chid for _ in list_month]
grid_month = list_month * len(list_chid)

df_full_y_sum = pd.DataFrame({'chid': grid_chid})
df_full_y_sum['data_dt'] = grid_month

df_full_y_sum.shape

(600000, 2)

In [8]:
## Join the customer features onto the (chid, month) grid
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Everything that is neither categorical nor a key/date column is numeric;
# the original column order of df_cust_f is preserved.
non_numeric = set(category_cols) | {'chid', 'data_ym', 'data_dt'}
numeric_cols = [col for col in df_cust_f.columns if col not in non_numeric]

feature_frame = df_cust_f[['chid', 'data_ym'] + category_cols + numeric_cols]
df_full_y_sum = df_full_y_sum.merge(
    feature_frame,
    how='left',
    left_on=['chid', 'data_dt'],
    right_on=['chid', 'data_ym'],
)

#df_full_y_sum.dropna(thresh=len(numeric_cols+category_cols), inplace=True)

## Fill missing values: numeric -> 0, categorical -> '-1'
values = {
    **dict.fromkeys(numeric_cols, 0),
    **dict.fromkeys(category_cols, '-1'),
}

df_full_y_sum = df_full_y_sum.fillna(value=values)
df_full_y_sum.shape

(600000, 32)

In [9]:
## Monthly total of objam per customer
# Only `objam` is aggregated. Summing the whole frame would also "sum" the
# string columns (bnsfg, csmdt, scity, ...), which is wasted work at best and
# version-dependent behaviour at worst.
temp_cdtx = df_cdtx.groupby(['chid', 'month'])[['objam']].sum()
df_cdtx_objam = temp_cdtx.reset_index().rename(columns={'month': 'data_dt'})

### Monthly total of y (disabled)
#temp_y = df_y.groupby(['chid', 'data_dt']).sum()
#df_y_sum = pd.DataFrame(list(map(list, temp_y.index)), columns=['chid', 'data_dt'])
#df_y_sum['y'] = temp_y['y'].values

In [10]:
## Join the monthly objam totals (the y join is disabled)

# (chid, month) pairs without any transaction have no match in df_cdtx_objam;
# the blanket fillna(0) turns their objam - and any other remaining NaN - into 0.
merge_keys = ['chid', 'data_dt']
merged_objam = df_full_y_sum.merge(df_cdtx_objam, how='left', on=merge_keys)
df_full_y_sum = merged_objam.fillna(0)

#df_full_y_sum = df_full_y_sum.merge(df_y_sum, 
#                                    how='left', 
#                                    left_on=['chid', 'data_dt'], 
#                                    right_on=['chid', 'data_dt']).fillna(0)
df_full_y_sum.shape

(600000, 33)

In [11]:
# Per categorical column: each distinct value -> its rank among the sorted
# distinct values of that column.
mapper = {}
for col in category_cols:
    levels = sorted(df_full_y_sum[col].unique())
    mapper[col] = dict(zip(levels, range(len(levels))))

# Encode only after every mapping has been built from the raw values.
for col in category_cols:
    df_full_y_sum[col] = df_full_y_sum[col].map(mapper[col])

print(df_full_y_sum.shape)
df_full_y_sum.head(2)

(600000, 33)


Unnamed: 0,chid,data_dt,data_ym,masts,educd,naty,trdtp,poscd,cuorg,monin,...,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down,objam
0,0,2019-01-01,2019-01-01,3,5,2,23,2,8,173472.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.8,0.0,3.0,0.0
1,0,2019-02-01,2019-02-01,3,5,2,23,2,8,173472.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,5.0,0.0


In [12]:
# 'data_ym' was only the right-hand merge key. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError.
df_full_y_sum = df_full_y_sum.drop(columns=['data_ym'], errors='ignore')

ignore_cols = ['data_dt']

# Treat the customer id as a categorical feature. Guarded so that re-running
# the cell does not prepend 'chid' a second time.
if 'chid' not in category_cols:
    category_cols = ['chid'] + category_cols

# Remaining columns are numeric, kept in frame order.
excluded_cols = set(category_cols) | set(ignore_cols)
numeric_cols = [col for col in df_full_y_sum.columns if col not in excluded_cols]

print(len(ignore_cols), ignore_cols)
print(len(category_cols), category_cols)
print(len(numeric_cols), numeric_cols)

1 ['data_dt']
7 ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']
24 ['monin', 'wrky', 'first_mob', 'cycam', 'slam', 'sum_area_c', 'sum_u2_ind', 'sum_u3_ind', 'sum_u4_ind', 'sum_l2_ind', 'sum_l3_ind', 'sum_l4_ind', 'constant_area_c', 'constant_u2_ind', 'constant_u3_ind', 'constant_u4_ind', 'constant_l2_ind', 'constant_l3_ind', 'constant_l4_ind', 'constant_change', 'growth_rate', 'monotone_up', 'monotone_down', 'objam']


In [13]:
def data_split(df, numeric_cols=None, category_cols=None, test_size=0.166):
    """Split each customer's rows into next-step-ahead train/test sets.

    The frame is reduced to ``category_cols + numeric_cols`` and the *last* of
    those columns is used as the target. Within every ``chid`` the rows are
    taken in their incoming order; the features of row ``t`` are paired with
    the target of row ``t + 1``. The final ``round(n * test_size)`` pairs of a
    customer go to the test set, the earlier ones to the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``chid`` column among the selected columns.
    numeric_cols, category_cols : list of str, optional
        Columns to keep; the last entry of ``numeric_cols`` becomes the target.
    test_size : float
        Fraction of each customer's rows used for testing. The resulting count
        is clamped to ``[0, n - 1]`` so the X and y parts always stay aligned
        (an unclamped count turned into negative slice bounds and produced
        X/y frames of different lengths).

    Returns
    -------
    x_train, x_test, y_train, y_test : pandas.DataFrame
        Rows ordered by ascending ``chid`` and carrying their original index
        labels. ``y_*`` are single-column frames.

    Notes
    -----
    The split is computed with vectorised masks, not by scanning the whole
    frame once per customer, so no progress bar is shown.
    """
    columns = list(category_cols or []) + list(numeric_cols or [])

    # A stable sort groups customers in ascending order while keeping each
    # customer's rows in the order they arrived in.
    df = df[columns].sort_values(by='chid', kind='mergesort')

    # Position of every row inside its customer block and the block length.
    position = df.groupby('chid', sort=False).cumcount().to_numpy()
    group_size = df['chid'].map(df['chid'].value_counts()).to_numpy(dtype=float)

    last = group_size - 1
    # np.rint rounds half to even, like the builtin round().
    test_num = np.clip(np.rint(group_size * test_size), 0, last)
    split = last - test_num  # first feature position that belongs to the test set

    in_x_train = position < split
    in_y_train = (position >= 1) & (position <= split)
    in_x_test = (position >= split) & (position < last)
    in_y_test = (position > split) & (position <= last)

    x_train = df.iloc[in_x_train]
    y_train = df.iloc[in_y_train, [-1]]

    x_test = df.iloc[in_x_test]
    y_test = df.iloc[in_y_test, [-1]]

    return x_train, x_test, y_train, y_test

In [14]:
x_train, x_test, y_train, y_test = data_split(
    df_full_y_sum,
    numeric_cols=numeric_cols,
    category_cols=category_cols,
    test_size=0.166,
)

# Rows per customer in each split (integer division by the number of customers).
num_chid = len(df_full_y_sum['chid'].unique())
train_per_chid = x_train.shape[0] // num_chid
test_per_chid = x_test.shape[0] // num_chid
print(f'train:{train_per_chid}, test:{test_per_chid}')
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))


train:9, test:2
(450000, 31) (450000, 1) (100000, 31) (100000, 1)


In [15]:
# Preview the first rows of the test features (index labels are those of df_full_y_sum).
x_test.head()

Unnamed: 0,chid,masts,educd,naty,trdtp,poscd,cuorg,monin,wrky,first_mob,...,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down,objam
9,0,3,5,2,25,2,8,173472.0,0.0,201.0,...,0.0,0.0,0.0,0.0,0.0,1.0,4.1,1.0,0.0,56160.0
10,0,3,5,2,25,2,8,173472.0,0.0,202.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,2.0,41008.0
21,1,1,2,2,25,5,8,248914.0,0.0,201.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16052.0
22,1,1,2,2,25,5,8,248914.0,0.0,202.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,2,1,4,2,7,6,8,272246.0,0.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,2.0,49930.0


In [16]:
# Preview the test targets: data_split pairs each x_test row with the objam of
# the customer's following row, so the index labels are shifted by one.
y_test.head()

Unnamed: 0,objam
10,41008.0
11,407784.0
22,0.0
23,0.0
34,446831.0


In [17]:
# Write the four splits as CSV files under <sample_path>/Normal.
output_dir = os.path.join(sample_path, 'Normal')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

splits = {
    'x_train': x_train,
    'x_test': x_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_frame in splits.items():
    # index=False: the row labels are not written to the files.
    split_frame.to_csv(os.path.join(output_dir, split_name + '.csv'), index=False, encoding='utf-8')

In [18]:
# Persist the category encodings and the column grouping next to the CSV splits.
# np.save appends '.npy' to the given path.
feature_columns = {
    'category_columns': category_cols,
    'numeric_columns': numeric_cols,
}
for stem, payload in (('feature_map', mapper), ('feature', feature_columns)):
    np.save(os.path.join(sample_path, 'Normal', stem), payload)