<a href="https://colab.research.google.com/github/udothemath/ncku_customer_embedding/blob/main/WenMing/0_sample_file_by_chid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is used for data preprocessing and sampling

In [None]:
# Run this cell only when working in Colab.
from google.colab import drive
# Mount Google Drive at /gdrive.
drive.mount('/gdrive')
# Symlink the Drive data folder so that './data' resolves to it.
!ln -s /gdrive/MyDrive/colab/NCKU_embedding/data/ ./data


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from time import time
import pandas as pd
import numpy as np
import os
import random
from tqdm.notebook import tqdm
import datetime

In [None]:
# Root folder; every other path below is joined onto it with os.path.join.
data_path = './data'

# Raw-data sub-folder and file-name pattern, one pair per data set.
# '{:04d}' in a pattern is filled with the zero-padded shard number.
cdtx_path, cdtx_file = (
    'raw/zip_if_cca_cdtx0001_hist',
    'pickle_if_cca_cdtx0001_hist_{:04d}.pkl',
)
y_path, y_file = (
    'raw/zip_if_cca_y',
    'pickle_if_cca_y_{:04d}.pkl',
)
cust_f_path, cust_f_file = (
    'raw/zip_if_cca_cust_f',
    'pickle_if_cca_cust_f_{:04d}.pkl',
)
stonc_tag_f_path, stonc_tag_f_file = (
    'raw/zip_if_cca_stonc_tag_features',
    'pickle_if_cca_stonc_tag_features_0000.pkl',
)

# Text file (one chid per line) holding the sampled customer ids.
child_file = 'sample_chid.txt'

## 進行資料篩選
* 隨機抽取了三個檔案 pickle_if_cca_cdtx0001_hist_0008～10.pkl
* 篩選日期為小於 2019/1/1 

In [None]:
# Newline counts of the three shards used for sampling (0010, 0009, 0008).
# NOTE(review): these are pickle files, so `wc -l` counts newline bytes and is
# at best a rough size indicator, not a record count.
!wc -l data/raw/zip_if_cca_cdtx0001_hist/pickle_if_cca_cdtx0001_hist_0010.pkl
!wc -l data/raw/zip_if_cca_cdtx0001_hist/pickle_if_cca_cdtx0001_hist_0009.pkl
!wc -l data/raw/zip_if_cca_cdtx0001_hist/pickle_if_cca_cdtx0001_hist_0008.pkl

In [None]:
# Shard 0008: start the set of chids that have a transaction before 2019-01-01.
# pd.read_pickle is the dedicated loader for pickled DataFrames, and the path is
# built from the shared constants instead of a hard-coded literal.
# NOTE: unpickling can run arbitrary code; only load trusted files.
data1 = pd.read_pickle(os.path.join(data_path, cdtx_path, cdtx_file.format(8)))
data1 = data1[data1.csmdt < datetime.date(2019, 1, 1)]
chids = set(data1.chid.unique())
del data1  # release the shard before the next one is loaded

In [None]:
# Shard 0009: add the chids that have a transaction before 2019-01-01.
# pd.read_pickle is the dedicated loader for pickled DataFrames, and the path is
# built from the shared constants instead of a hard-coded literal.
# NOTE: unpickling can run arbitrary code; only load trusted files.
data2 = pd.read_pickle(os.path.join(data_path, cdtx_path, cdtx_file.format(9)))
data2 = data2[data2.csmdt < datetime.date(2019, 1, 1)]
chids.update(data2.chid.unique())
del data2  # release the shard before the next one is loaded

In [None]:
# Shard 0010: add the chids that have a transaction before 2019-01-01.
# pd.read_pickle is the dedicated loader for pickled DataFrames, and the path is
# built from the shared constants instead of a hard-coded literal.
# NOTE: unpickling can run arbitrary code; only load trusted files.
data3 = pd.read_pickle(os.path.join(data_path, cdtx_path, cdtx_file.format(10)))
data3 = data3[data3.csmdt < datetime.date(2019, 1, 1)]
chids.update(data3.chid.unique())
del data3  # release the shard once its chids are collected

## 產生所有不重複的 chid (customer ID)
* 並產出 customer id list File : './data/sample_50k/sample_chid.txt'

In [None]:
# Turn the collected chids into a NumPy array.
chids = np.array([*chids])

In [None]:
# Draw 50,000 chids with a fixed seed and write them out, one per line.
random.seed(1012+4028+4036)

# The chids were collected through a set, whose iteration order is not guaranteed
# to be the same from one interpreter run to the next. Sorting gives the seeded
# sampler a stable population, so the same seed always yields the same sample.
sample_chids = random.sample(sorted(chids), k = 50000)

# Create the output folder if it is missing so that open() cannot fail on it.
sample_dir = os.path.join(data_path, 'sample_50k')
os.makedirs(sample_dir, exist_ok=True)

with open(os.path.join(sample_dir, child_file), 'w') as f:
    f.writelines(chid + '\n' for chid in sample_chids)

## Produce the mapping table from chid -> serial number

In [None]:
# Map each sampled chid to its position in sample_chids.
idx_map = {chid: position for position, chid in enumerate(sample_chids)}

In [None]:
# Persist the chid -> index dict under data_path/sample_50k.
# NOTE(review): np.save appends '.npy', and a dict payload is stored via pickle,
# so readers presumably need np.load(..., allow_pickle=True).item() — confirm
# against the consumer of this file.
np.save(os.path.join(data_path, 'sample_50k/sample_idx_map'), idx_map)

## Load White List

In [None]:
# Reload the sampled chid list from disk as an array of strings.
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used directly.
chid_arary = np.loadtxt(os.path.join(data_path, 'sample_50k', child_file), dtype=str)
# Number of distinct chids in the list.
len(set(chid_arary))

## Counting the total loading time and extracting the transaction log by chid list
* this is used for counting the total time needed to load the whole zip_if_cca_cdtx0001_hist data
* go through all pickle files and filter each one by the chid list

In [None]:
# Filter cdtx shards 0000-0029 down to the sampled chids, timing each stage.
# NOTE: unpickling can run arbitrary code; only load trusted files.
cdtx_list = []  # one NumPy array of matching rows per shard

t = time()
for i in range(30):
    print('In',  cdtx_file.format(i))

    t0 = time()
    df_cdtx = pd.read_pickle(os.path.join(data_path, cdtx_path, cdtx_file.format(i)))
    t1 = time()
    print('\t Load file cost', t1 - t0)

    mask = df_cdtx.chid.isin(chid_arary)
    t2 = time()
    print('\t Find mask cost', t2 - t1)

    # Filter first, then convert: only the matching rows are turned into a NumPy
    # array, instead of materialising the whole shard and masking it afterwards.
    cdtx_list.append(df_cdtx[mask].values)
    t3 = time()
    print('\t One file total cost', t3 - t0, '\n')

# Column labels of the last shard loaded.
columns = df_cdtx.columns
print('Whole files total cost', t3 - t)

In [None]:
# Stack the per-shard arrays into one frame, ordered by csmdt, chid and mcc,
# with a fresh 0..n-1 index.
sample_df_cdtx = (
    pd.DataFrame(np.concatenate(cdtx_list, axis=0), columns=df_cdtx.columns)
    .sort_values(by=['csmdt', 'chid', 'mcc'])
    .reset_index(drop=True)
)
sample_df_cdtx.shape

In [None]:
# Number of distinct values in the chid, csmdt and mcc columns.
tuple(len(sample_df_cdtx[col].unique()) for col in ('chid', 'csmdt', 'mcc'))

In [None]:
# Write the sampled transactions to CSV without the index column.
sample_df_cdtx.to_csv(
    os.path.join(data_path, 'sample_50k', 'sample_zip_if_cca_cdtx0001_hist.csv'),
    encoding='utf-8',
    index=False,
)

In [None]:
# Peek at the header and the first two rows of the CSV that was just written.
!head -n 3 ./data/sample_50k/sample_zip_if_cca_cdtx0001_hist.csv

In [None]:
# Drop the sampled transaction frame; it has already been written to CSV.
del sample_df_cdtx

In [None]:
# Drop the per-shard arrays that were concatenated into the sampled frame.
del cdtx_list

In [None]:
# Drop the last loaded cdtx shard and its row mask.
del df_cdtx, mask

## 對學習目標資料檔進行整理 zip_if_cca_y

In [None]:
# Filter y shards 0000-0012 down to the sampled chids, timing each stage.
# NOTE: unpickling can run arbitrary code; only load trusted files.
y_list = []  # one NumPy array of matching rows per shard

t = time()
for i in range(13):
    print('In',  y_file.format(i))

    t0 = time()
    df_y = pd.read_pickle(os.path.join(data_path, y_path, y_file.format(i)))
    t1 = time()
    print('\t Load file cost', t1 - t0)

    mask = df_y.chid.isin(chid_arary)
    t2 = time()
    print('\t Find mask cost', t2 - t1)

    # Filter first, then convert: only the matching rows are turned into a NumPy
    # array, instead of materialising the whole shard and masking it afterwards.
    y_list.append(df_y[mask].values)
    t3 = time()
    print('\t One file total cost', t3 - t0, '\n')

# Column labels of the last shard loaded.
columns = df_y.columns
print('Whole files total cost', t3 - t)

In [None]:
# Stack the per-shard arrays into one frame, ordered by data_dt, chid and
# stonc_tag, with a fresh 0..n-1 index.
sample_df_y = (
    pd.DataFrame(np.concatenate(y_list, axis=0), columns=df_y.columns)
    .sort_values(by=['data_dt', 'chid', 'stonc_tag'])
    .reset_index(drop=True)
)
sample_df_y.shape

In [None]:
# Number of distinct values in the chid, data_dt and stonc_tag columns.
tuple(len(sample_df_y[col].unique()) for col in ('chid', 'data_dt', 'stonc_tag'))

In [None]:
# Write the sampled target rows to CSV without the index column.
sample_df_y.to_csv(
    os.path.join(data_path, 'sample_50k', 'sample_zip_if_cca_y.csv'),
    encoding='utf-8',
    index=False,
)

### The objective data includes the columns below
* data_dt: the transaction's date
* chid: user's id
* stonc_tag: merchant category
* y: the amount of consumption



In [None]:
# Peek at the header and the first four rows of the CSV that was just written.
!head -n 5 ./data/sample_50k/sample_zip_if_cca_y.csv

## 根據抽樣過後的 chid list 取出相對的使用者 Profile

In [None]:
# Preview the first three rows of customer-profile shard 0000.
(
    pd.read_pickle(os.path.join(data_path, cust_f_path, cust_f_file.format(0)))
    .head(3)
)

In [None]:
# Filter cust_f shards 0000-0093 down to the sampled chids, timing each stage.
# NOTE: unpickling can run arbitrary code; only load trusted files.
cust_f_list = []  # one NumPy array of matching rows per shard

t = time()
for i in range(94):
    print('In',  cust_f_file.format(i))

    t0 = time()
    df_cust_f = pd.read_pickle(os.path.join(data_path, cust_f_path, cust_f_file.format(i)))
    t1 = time()
    print('\t Load file cost', t1 - t0)

    mask = df_cust_f.chid.isin(chid_arary)
    t2 = time()
    print('\t Find mask cost', t2 - t1)

    # Filter first, then convert: only the matching rows are turned into a NumPy
    # array, instead of materialising the whole shard and masking it afterwards.
    cust_f_list.append(df_cust_f[mask].values)
    t3 = time()
    print('\t One file total cost', t3 - t0, '\n')

# Column labels of the last shard loaded.
columns = df_cust_f.columns
print('Whole files total cost', t3 - t)

In [None]:
# Show a compact summary instead of echoing every per-shard array:
# (number of shards collected, total number of matching rows).
len(cust_f_list), sum(len(rows) for rows in cust_f_list)

In [None]:
# Stack the per-shard arrays into one frame, ordered by chid, data_ym and
# data_dt, with a fresh 0..n-1 index.
sample_df_cust_f = (
    pd.DataFrame(np.concatenate(cust_f_list, axis=0), columns=df_cust_f.columns)
    .sort_values(by=['chid', 'data_ym', 'data_dt'])
    .reset_index(drop=True)
)
sample_df_cust_f.shape

In [None]:
# Number of distinct values in the chid, data_ym and data_dt columns.
tuple(len(sample_df_cust_f[col].unique()) for col in ('chid', 'data_ym', 'data_dt'))

In [None]:
# Write the sampled customer profiles to CSV without the index column.
sample_df_cust_f.to_csv(
    os.path.join(data_path, 'sample_50k', 'sample_zip_if_cca_cust_f.csv'),
    encoding='utf-8',
    index=False,
)

In [None]:
# Peek at the header and the first two rows of the CSV that was just written.
!head -n 3 ./data/sample_50k/sample_zip_if_cca_cust_f.csv