In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from time import time
import pandas as pd
import numpy as np
import os
import random
from tqdm.notebook import tqdm
import datetime

In [2]:
# Root directory; later cells join it with the raw sub-directories below and
# with 'sample_50k' for the sampled outputs.
data_path = './data'
# Sub-directories (relative to data_path) of the raw, sharded pickle datasets.
cdtx_path = 'raw/zip_if_cca_cdtx0001_hist'
y_path = 'raw/zip_if_cca_y'
cust_f_path = 'raw/zip_if_cca_cust_f'
# Not referenced by any cell visible in this notebook section.
stonc_tag_f_path = 'raw/zip_if_cca_stonc_tag_features'

# Text file with one sampled chid per line (written and re-read under data_path/sample_50k).
child_file = 'sample_chid.txt'
# Shard file-name templates; '{:04d}' is filled with the zero-padded shard index via .format(i).
cdtx_file = 'pickle_if_cca_cdtx0001_hist_{:04d}.pkl'
y_file = 'pickle_if_cca_y_{:04d}.pkl'
cust_f_file = 'pickle_if_cca_cust_f_{:04d}.pkl'
# Fixed file name (no shard placeholder); not referenced by any cell visible here.
stonc_tag_f_file = 'pickle_if_cca_stonc_tag_features_0000.pkl'

In [None]:
# Load transaction-history shards 8, 9 and 10 and keep only the rows whose
# csmdt is before 2019-01-01. The shard paths are built from the config-cell
# constants (data_path / cdtx_path / cdtx_file) rather than repeated literals.
CSMDT_CUTOFF = datetime.date(2019, 1, 1)


def _load_cdtx_before_cutoff(shard_idx):
    """Return cdtx shard `shard_idx` restricted to rows with csmdt < CSMDT_CUTOFF."""
    shard_path = os.path.join(data_path, cdtx_path, cdtx_file.format(shard_idx))
    # The .pkl shard is read with allow_pickle=True, i.e. it is unpickled:
    # only point this at trusted files.
    shard = np.load(shard_path, allow_pickle=True)
    # Filter right away so the unfiltered shard can be released before the
    # next one is loaded.
    return shard[shard.csmdt < CSMDT_CUTOFF]


data1, data2, data3 = (_load_cdtx_before_cutoff(idx) for idx in (8, 9, 10))

In [None]:
# Union of the distinct chid values found in the three filtered shards,
# materialised as a NumPy array.
unique_chids = set()
for shard in (data1, data2, data3):
    unique_chids.update(shard.chid.unique())
chids = np.array(list(unique_chids))

In [None]:
# Draw the chid sample with a fixed seed and write it out, one chid per line.
SAMPLE_SIZE = 50000
SAMPLE_SEED = 1012+4028+4036

# The output directory is not created anywhere else in the notebook.
sample_dir = os.path.join(data_path, 'sample_50k')
os.makedirs(sample_dir, exist_ok=True)

random.seed(SAMPLE_SEED)
# `chids` was built from a set of strings, whose iteration order can change
# between interpreter sessions (string hash randomisation). Sorting gives the
# seeded sampler a fixed population order, so the same seed yields the same
# sample on every run.
sample_chids = random.sample(sorted(chids), k=SAMPLE_SIZE)
with open(os.path.join(sample_dir, child_file), 'w') as f:
    f.writelines(chid + '\n' for chid in sample_chids)

In [None]:
# Display how many chids were drawn by random.sample in the previous cell.
len(sample_chids)

In [None]:
# Map every sampled chid to its position in sample_chids.
idx_map = {chid: position for position, chid in enumerate(sample_chids)}

In [None]:
# Persist the chid -> position mapping under data_path/sample_50k.
# NOTE(review): np.save is handed a plain dict here; per the NumPy docs this
# should end up as a pickled object array in 'sample_idx_map.npy', so readers
# presumably need np.load(..., allow_pickle=True).item() to get the dict back -
# confirm against whatever consumes this file.
np.save(os.path.join(data_path, 'sample_50k/sample_idx_map'), idx_map)

In [3]:
# Re-read the sampled chids from the text file (one per line) as strings.
# `np.str` was only an alias of the builtin `str` and has been removed from
# current NumPy releases, so the builtin is passed directly.
# (The variable name is kept as-is because later cells refer to it.)
chid_arary = np.loadtxt(os.path.join(data_path, 'sample_50k', child_file), dtype=str)
# Display the number of distinct chids read back.
len(set(chid_arary))

50000

## zip_if_cca_cdtx0001_hist

In [4]:
# Scan every transaction-history shard and keep only the rows whose chid is in
# the sample. Each shard contributes one 2-D ndarray to cdtx_list; the next
# cell concatenates them. Per-step wall-clock timings are printed per shard.
N_CDTX_FILES = 30  # shards are numbered 0 .. N_CDTX_FILES - 1

cdtx_list = []

t = time()
for i in range(0, N_CDTX_FILES):
    print('In',  cdtx_file.format(i))

    t0 = time()

    # read_pickle unpickles the file: only use it on trusted data.
    df_cdtx = pd.read_pickle(os.path.join(data_path, cdtx_path, cdtx_file.format(i)))
    t1 = time()

    print('\t Load file cost', t1 - t0)

    mask = df_cdtx.chid.isin(chid_arary)
    t2 = time()

    print('\t Find mask cost', t2 - t1)

    # Filter first, then convert: only the kept rows are copied into an
    # ndarray instead of materialising the whole shard and masking afterwards.
    cdtx_list.append(df_cdtx[mask].values)
    t3 = time()

    print('\t One file total cost', t3 - t0, '\n')

# Column labels of the last shard read.
columns = df_cdtx.columns
print('Whole files total cost', t3 - t)

In pickle_if_cca_cdtx0001_hist_0000.pkl
	 Load file cost 16.876818656921387
	 Find mask cost 1.4249851703643799
	 One file total cost 18.448395013809204 

In pickle_if_cca_cdtx0001_hist_0001.pkl
	 Load file cost 19.16689658164978
	 Find mask cost 1.4275867938995361
	 One file total cost 20.74968957901001 

In pickle_if_cca_cdtx0001_hist_0002.pkl
	 Load file cost 24.60371494293213
	 Find mask cost 1.459846019744873
	 One file total cost 26.34774684906006 

In pickle_if_cca_cdtx0001_hist_0003.pkl
	 Load file cost 23.268123388290405
	 Find mask cost 1.4816529750823975
	 One file total cost 25.014113664627075 

In pickle_if_cca_cdtx0001_hist_0004.pkl
	 Load file cost 24.992562294006348
	 Find mask cost 1.6065473556518555
	 One file total cost 26.890823125839233 

In pickle_if_cca_cdtx0001_hist_0005.pkl
	 Load file cost 19.083059072494507
	 Find mask cost 1.4902989864349365
	 One file total cost 20.82158374786377 

In pickle_if_cca_cdtx0001_hist_0006.pkl
	 Load file cost 20.166643142700195


In [5]:
# Stack the per-shard row arrays, re-attach the column labels of the last shard
# read, then order the rows by (csmdt, chid, mcc) with a fresh 0..n-1 index.
cdtx_rows = np.concatenate(cdtx_list, axis=0)
sample_df_cdtx = (
    pd.DataFrame(cdtx_rows, columns=df_cdtx.columns)
    .sort_values(by=['csmdt', 'chid', 'mcc'])
    .reset_index(drop=True)
)
sample_df_cdtx.shape

(6654938, 10)

In [6]:
# Number of distinct values (missing values counted too) in chid, csmdt and mcc.
tuple(sample_df_cdtx[col].nunique(dropna=False) for col in ('chid', 'csmdt', 'mcc'))

(50000, 761, 502)

In [7]:
# Write the sampled transaction history to CSV without the row index.
cdtx_csv_path = os.path.join(data_path, 'sample_50k', 'sample_zip_if_cca_cdtx0001_hist.csv')
sample_df_cdtx.to_csv(cdtx_csv_path, index=False, encoding='utf-8')

## zip_if_cca_y

In [8]:
# Scan every shard of the y dataset and keep only the rows whose chid is in the
# sample. Each shard contributes one 2-D ndarray to y_list; the next cell
# concatenates them. Per-step wall-clock timings are printed per shard.
N_Y_FILES = 13  # shards are numbered 0 .. N_Y_FILES - 1

y_list = []

t = time()
for i in range(0, N_Y_FILES):
    print('In',  y_file.format(i))

    t0 = time()

    # read_pickle unpickles the file: only use it on trusted data.
    df_y = pd.read_pickle(os.path.join(data_path, y_path, y_file.format(i)))
    t1 = time()

    print('\t Load file cost', t1 - t0)

    mask = df_y.chid.isin(chid_arary)
    t2 = time()

    print('\t Find mask cost', t2 - t1)

    # Filter first, then convert: only the kept rows are copied into an
    # ndarray instead of materialising the whole shard and masking afterwards.
    y_list.append(df_y[mask].values)
    t3 = time()

    print('\t One file total cost', t3 - t0, '\n')

# Column labels of the last shard read.
columns = df_y.columns
print('Whole files total cost', t3 - t)

In pickle_if_cca_y_0000.pkl
	 Load file cost 7.390331268310547
	 Find mask cost 1.1920809745788574
	 One file total cost 8.668232202529907 

In pickle_if_cca_y_0001.pkl
	 Load file cost 7.920997619628906
	 Find mask cost 1.133734941482544
	 One file total cost 9.136716604232788 

In pickle_if_cca_y_0002.pkl
	 Load file cost 8.505628108978271
	 Find mask cost 1.169633150100708
	 One file total cost 9.788049936294556 

In pickle_if_cca_y_0003.pkl
	 Load file cost 7.627463102340698
	 Find mask cost 1.1210627555847168
	 One file total cost 8.82791519165039 

In pickle_if_cca_y_0004.pkl
	 Load file cost 8.39721393585205
	 Find mask cost 1.182722568511963
	 One file total cost 9.662742853164673 

In pickle_if_cca_y_0005.pkl
	 Load file cost 7.927182674407959
	 Find mask cost 1.1155385971069336
	 One file total cost 9.123964548110962 

In pickle_if_cca_y_0006.pkl
	 Load file cost 8.412933826446533
	 Find mask cost 1.1640393733978271
	 One file total cost 9.659816026687622 

In pickle_if_cca_y

In [9]:
# Stack the per-shard row arrays, re-attach the column labels of the last shard
# read, then order the rows by (data_dt, chid, stonc_tag) with a fresh index.
y_rows = np.concatenate(y_list, axis=0)
sample_df_y = (
    pd.DataFrame(y_rows, columns=df_y.columns)
    .sort_values(by=['data_dt', 'chid', 'stonc_tag'])
    .reset_index(drop=True)
)
sample_df_y.shape

(2889085, 4)

In [10]:
# Number of distinct values (missing values counted too) in chid, data_dt and stonc_tag.
tuple(sample_df_y[col].nunique(dropna=False) for col in ('chid', 'data_dt', 'stonc_tag'))

(50000, 24, 49)

In [11]:
# Write the sampled y table to CSV without the row index.
y_csv_path = os.path.join(data_path, 'sample_50k', 'sample_zip_if_cca_y.csv')
sample_df_y.to_csv(y_csv_path, index=False, encoding='utf-8')

## zip_if_cca_cust_f

In [12]:
# Scan every shard of the cust_f dataset and keep only the rows whose chid is
# in the sample. Each shard contributes one 2-D ndarray to cust_f_list; the
# next cell concatenates them. Per-step wall-clock timings are printed per shard.
N_CUST_F_FILES = 94  # shards are numbered 0 .. N_CUST_F_FILES - 1

cust_f_list = []

t = time()
for i in range(0, N_CUST_F_FILES):
    print('In',  cust_f_file.format(i))

    t0 = time()

    # read_pickle unpickles the file: only use it on trusted data.
    df_cust_f = pd.read_pickle(os.path.join(data_path, cust_f_path, cust_f_file.format(i)))
    t1 = time()

    print('\t Load file cost', t1 - t0)

    mask = df_cust_f.chid.isin(chid_arary)
    t2 = time()

    print('\t Find mask cost', t2 - t1)

    # Filter first, then convert: only the kept rows are copied into an
    # ndarray instead of materialising the whole shard and masking afterwards.
    cust_f_list.append(df_cust_f[mask].values)
    t3 = time()

    print('\t One file total cost', t3 - t0, '\n')

# Column labels of the last shard read.
columns = df_cust_f.columns
print('Whole files total cost', t3 - t)

In pickle_if_cca_cust_f_0000.pkl
	 Load file cost 9.685742378234863
	 Find mask cost 0.14999866485595703
	 One file total cost 9.883501529693604 

In pickle_if_cca_cust_f_0001.pkl
	 Load file cost 12.659935235977173
	 Find mask cost 0.17086577415466309
	 One file total cost 12.881603479385376 

In pickle_if_cca_cust_f_0002.pkl
	 Load file cost 10.794726848602295
	 Find mask cost 0.15149259567260742
	 One file total cost 10.996924877166748 

In pickle_if_cca_cust_f_0003.pkl
	 Load file cost 11.432553052902222
	 Find mask cost 0.15112972259521484
	 One file total cost 11.628779172897339 

In pickle_if_cca_cust_f_0004.pkl
	 Load file cost 11.601858615875244
	 Find mask cost 0.14953112602233887
	 One file total cost 11.813467979431152 

In pickle_if_cca_cust_f_0005.pkl
	 Load file cost 11.168764352798462
	 Find mask cost 0.15137887001037598
	 One file total cost 11.366822957992554 

In pickle_if_cca_cust_f_0006.pkl
	 Load file cost 11.691450834274292
	 Find mask cost 0.15526461601257324
	 

	 Load file cost 10.364654779434204
	 Find mask cost 0.17528295516967773
	 One file total cost 10.627500772476196 

In pickle_if_cca_cust_f_0057.pkl
	 Load file cost 10.480870962142944
	 Find mask cost 0.17404389381408691
	 One file total cost 10.713730573654175 

In pickle_if_cca_cust_f_0058.pkl
	 Load file cost 10.354742050170898
	 Find mask cost 0.17624497413635254
	 One file total cost 10.583185195922852 

In pickle_if_cca_cust_f_0059.pkl
	 Load file cost 10.183088541030884
	 Find mask cost 0.1731405258178711
	 One file total cost 10.405745267868042 

In pickle_if_cca_cust_f_0060.pkl
	 Load file cost 10.375734090805054
	 Find mask cost 0.1721651554107666
	 One file total cost 10.601123094558716 

In pickle_if_cca_cust_f_0061.pkl
	 Load file cost 10.045108318328857
	 Find mask cost 0.1827538013458252
	 One file total cost 10.276908874511719 

In pickle_if_cca_cust_f_0062.pkl
	 Load file cost 10.671982049942017
	 Find mask cost 0.1773974895477295
	 One file total cost 10.905895948410

In [13]:
# Stack the per-shard row arrays, re-attach the column labels of the last shard
# read, then order the rows by (chid, data_ym, data_dt) with a fresh index.
cust_f_rows = np.concatenate(cust_f_list, axis=0)
sample_df_cust_f = (
    pd.DataFrame(cust_f_rows, columns=df_cust_f.columns)
    .sort_values(by=['chid', 'data_ym', 'data_dt'])
    .reset_index(drop=True)
)
sample_df_cust_f.shape

(1188620, 32)

In [14]:
# Number of distinct values (missing values counted too) in chid, data_ym and data_dt.
tuple(sample_df_cust_f[col].nunique(dropna=False) for col in ('chid', 'data_ym', 'data_dt'))

(50000, 24, 24)

In [15]:
# Write the sampled customer-feature table to CSV without the row index.
cust_f_csv_path = os.path.join(data_path, 'sample_50k', 'sample_zip_if_cca_cust_f.csv')
sample_df_cust_f.to_csv(cust_f_csv_path, index=False, encoding='utf-8')