In [1]:
import pandas as pd
import numpy as np
import glob
import pickle
import math

import warnings
warnings.filterwarnings("ignore")
from dateutil.relativedelta import relativedelta

In [2]:
def get_embedding_by_date(year, month):
    """Load one month's vertex embeddings together with their customer IDs.

    Parameters
    ----------
    year, month : int
        Month to load.  They are substituted into the two file paths below.

    Returns
    -------
    pandas.DataFrame
        One row per embedding row, columns ``embed_0 .. embed_{d-1}`` plus a
        ``customerno`` column looked up from the pickled row-index -> key
        mapping.

    Raises
    ------
    FileNotFoundError
        If either input file is missing.
    KeyError / IndexError
        If the mapping has no entry for one of the row positions.
    """
    # Fetch graph embedding by customer ID
    # NOTE: the month is NOT zero-padded in this path, but it is in the
    # mapping path below; both follow the on-disk naming used by the project.
    embed_path = '../data/vertex_embeddings/embed_heter_superv_recur_focal_logisticMF_embed_{}-{}.npy'.format(year, month)
    E_t = np.load(embed_path)
    print(E_t.shape)
    embedding = pd.DataFrame(data=E_t,
                             index=range(E_t.shape[0]),
                             columns=['embed_{}'.format(i) for i in range(E_t.shape[1])])
    with open("../data/index_mapping/int2key_{}{}.pickle".format(str(year), str(month).zfill(2)), 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code; only use this on
        # mapping files produced by a trusted pipeline.
        int2key_t = pickle.load(f)
    # Row position i of the embedding matrix belongs to customer int2key_t[i].
    # A direct lookup per position avoids materialising a 300-wide Series for
    # every row just to read its index label.
    embedding['customerno'] = [int2key_t[i] for i in range(E_t.shape[0])]

    return embedding

In [3]:
class DateYM:
    """Mutable (year, month) value with month arithmetic and comparisons.

    The month is stored zero-based (0 = January ... 11 = December) so that
    carrying into the year is a plain integer division; `export_tuple`
    converts back to the usual 1-based month.
    """

    def __init__(self, year, month):
        """Create the value from a calendar year and a 1-based month."""
        self.year = year
        self.month = month - 1    # 0 ~ 11, from Jan to Dec

    @staticmethod
    def _ordinal(ym):
        """Number of months since year 0 for any object with year/month."""
        return ym.year * 12 + ym.month

    def export_tuple(self):
        """Return ``(year, month)`` with the month 1-based."""
        return (self.year, self.month + 1)

    def add_year(self, y):
        """Move forward by `y` years (in place)."""
        self.year += y

    def subtract_year(self, y):
        """Move backward by `y` years (in place)."""
        self.year -= y

    def substract_year(self, y):
        """Misspelled original name of `subtract_year`; kept for callers."""
        self.subtract_year(y)

    def add_month(self, m):
        """Move forward by `m` months (in place), carrying into the year.

        `divmod` floors towards negative infinity, so a negative `m` also
        leaves the stored month inside 0..11.
        """
        carry, self.month = divmod(self.month + m, 12)
        self.year += carry

    def subtract_month(self, m):
        """Move backward by `m` months (in place), borrowing from the year."""
        self.add_month(-m)

    def is_larger_than(self, ym):
        """True when this month is strictly after `ym`."""
        return self._ordinal(self) > self._ordinal(ym)

    def is_smaller_than(self, ym):
        """True when this month is strictly before `ym`."""
        # The first parameter used to be misspelled `slef`, which made every
        # call fail with NameError on `self`.
        return self._ordinal(self) < self._ordinal(ym)

    def is_equal(self, ym):
        """True when this month and `ym` are the same month."""
        return self._ordinal(self) == self._ordinal(ym)
    
def list_date_tuples(from_date, to_date):
    """Enumerate every month from `from_date` through `to_date`, inclusive.

    Both arguments are DateYM-like objects.  The result is a list of
    ``(year, month)`` tuples with 1-based months; it is empty when
    `from_date` is already after `to_date`.  Neither argument is modified,
    because the walk happens on a private copy of `from_date`.
    """
    cursor = DateYM(*from_date.export_tuple())
    months = []
    while True:
        if cursor.is_larger_than(to_date):
            break
        months.append(cursor.export_tuple())
        cursor.add_month(1)
    return months

In [4]:
def get_feature(year, month, pre_fix="", 
                use_feature_ctbc=True, use_feature_embed=True, use_feature_rankscore=True):
    """Build one month's feature frame: CSV features plus graph embeddings.

    Parameters
    ----------
    year, month : int
        Month to load.
    pre_fix, use_feature_ctbc, use_feature_embed, use_feature_rankscore
        Accepted for backward compatibility; the body does not read them.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by ``customerno`` with the ``embed_*``
        columns left-joined on.  Customers without an embedding row get 0
        in every ``embed_*`` column.
    """
    print("Construct dataframe in {}.{}".format(year, month))

    # Get CTBC features
    if year == 2017:
        # Any month of 2017 reads the January 2018 file.
        file_name = 'aml_all_{}{:02d}_y.csv'.format(str(2018)[2:], 1)
    else:
        file_name = 'aml_all_{}{:02d}_y.csv'.format(str(year)[2:], month)
    pdframe = pd.read_csv('../data/' + file_name)
    pdframe = pdframe.set_index('customerno')

    # Attach graph embedding onto the features, weight_type: [weight_none, weight_log, count_larger8000]
    embed = get_embedding_by_date(year, month)
    pdframe = pdframe.join(embed.set_index('customerno'), on='customerno')
    embed_col_names = [col for col in embed.columns if col != 'customerno']
    # The left join leaves NaN for customers that have no embedding row.
    pdframe.loc[:, embed_col_names] = pdframe.loc[:, embed_col_names].fillna(0)

    return pdframe

In [5]:
# First and last month to load (both inclusive).
from_date_ym = DateYM(2018, 1)
to_date_ym = DateYM(2019, 6)
# Every (year, month) tuple between the two endpoints, in chronological order.
list_date_seq = list_date_tuples(from_date_ym, to_date_ym)

# AML_data maps (year, month) -> that month's feature frame with the two
# customer code columns one-hot encoded.
AML_data = {}
embed_col_names = []    # not referenced again in the visible cells
pdframe_pre = None      # not referenced again in the visible cells
for date_ym in list_date_seq:
    # `pdframe` keeps the last month's (pre-dummies) frame after the loop;
    # the column-selection cell further down reads its columns.
    pdframe = get_feature(*date_ym)
    AML_data[date_ym] = pd.get_dummies(pdframe, columns=['Customer_Type_Code', 'Customer_Category_Code'])

Construct dataframe in 2018.1
(47612, 300)
Construct dataframe in 2018.2
(47612, 300)
Construct dataframe in 2018.3
(47612, 300)
Construct dataframe in 2018.4
(47612, 300)
Construct dataframe in 2018.5
(47612, 300)
Construct dataframe in 2018.6
(47612, 300)
Construct dataframe in 2018.7
(47612, 300)
Construct dataframe in 2018.8
(47612, 300)
Construct dataframe in 2018.9
(47612, 300)
Construct dataframe in 2018.10
(47612, 300)
Construct dataframe in 2018.11
(47612, 300)
Construct dataframe in 2018.12
(47612, 300)
Construct dataframe in 2019.1
(47612, 300)
Construct dataframe in 2019.2
(47612, 300)
Construct dataframe in 2019.3
(47612, 300)
Construct dataframe in 2019.4
(47612, 300)
Construct dataframe in 2019.5
(47612, 300)
Construct dataframe in 2019.6
(47612, 300)


In [6]:
# Embedding columns: names starting with 'embed', excluding any '*_diff' variants.
embed_feat = [col for col in pdframe.columns
              if col.startswith('embed') and not col.endswith('_diff')]
print('The length of embed_feat = {}'.format(len(embed_feat)))

# Tabular feature columns: names starting with 'feature' or 'SARU'.
feature_feat = [col for col in pdframe.columns
                if col.startswith(('feature', 'SARU'))]
print('The length of feature_feat = {}'.format(len(feature_feat)))

The length of embed_feat = 300
The length of feature_feat = 330


In [7]:
# Sanity check: the largest 'feature04' value in each monthly frame.
for date in AML_data:
    monthly_max = AML_data[date]['feature04'].max()
    print(date, monthly_max)

(2018, 1) 5830443790
(2018, 2) 6413467128
(2018, 3) 6586047443
(2018, 4) 7008951066
(2018, 5) 7685097383
(2018, 6) 8096072367
(2018, 7) 8237477056
(2018, 8) 8798939138
(2018, 9) 9051328933
(2018, 10) 9254419155
(2018, 11) 8261446647
(2018, 12) 7297770836
(2019, 1) 6001430539
(2019, 2) 5602766946
(2019, 3) 5314131292
(2019, 4) 4760485624
(2019, 5) 4322361921
(2019, 6) 3964901090


In [8]:
# Value range of every tabular feature across all loaded months, used for
# min-max scaling in a later cell.
#
# Both bounds start at 0 and are only ever widened, so the recorded range
# always contains 0: 'max' is max(0, largest value seen) and 'min' is
# min(0, smallest value seen).
#
# The frame is created as float directly instead of filling an all-NaN
# object frame with 0, which depends on fillna() silently downcasting
# object columns (deprecated in recent pandas).
ScalarFrame = pd.DataFrame(0.0, index=feature_feat, columns=['max', 'min'])

for date in AML_data.keys():
    print (date)
    monthly_feats = AML_data[date][feature_feat]
    # Only max/min are needed, so compute just those on the feature columns.
    # fmax/fmin ignore NaN, so a column that is entirely NaN in one month
    # leaves the running bound unchanged.
    ScalarFrame['max'] = np.fmax(ScalarFrame['max'], monthly_feats.max())
    ScalarFrame['min'] = np.fmin(ScalarFrame['min'], monthly_feats.min())

ScalarFrame

(2018, 1)
(2018, 2)
(2018, 3)
(2018, 4)
(2018, 5)
(2018, 6)
(2018, 7)
(2018, 8)
(2018, 9)
(2018, 10)
(2018, 11)
(2018, 12)
(2019, 1)
(2019, 2)
(2019, 3)
(2019, 4)
(2019, 5)
(2019, 6)


Unnamed: 0,max,min
feature01,7.820427e+02,0.0
feature02,3.600000e+02,0.0
feature03,3.600000e+02,0.0
feature04,9.254419e+09,0.0
feature05,1.500000e+01,0.0
feature06,1.500000e+01,0.0
feature07,6.830000e+02,0.0
feature08,4.283503e+05,0.0
feature09,9.970070e+09,0.0
feature10,2.020000e+02,0.0


In [9]:
# Number of months per sample: the target month plus the 3 months before it.
WINDOW_LEN = 4

# Scaling terms are the same for every month, so compute them once.
# ScalarFrame.T has one column per feature and the two rows 'max'/'min';
# min()/max() over those rows give the lower/upper bound per feature.
feat_floor = ScalarFrame.T[feature_feat].min()
feat_range = ScalarFrame.T[feature_feat].max() - feat_floor + 1e-20    # epsilon avoids 0/0 for constant features


def _scaled_month_matrix(month_frame):
    """Return one month's model inputs as a 2-D array.

    Drops the 'Y_SAR' column and min-max scales the `feature_feat` columns;
    every other column (embeddings included) is passed through unchanged.
    """
    # drop() returns a new frame, so the frames stored in AML_data are not modified.
    inputs = month_frame.drop('Y_SAR', axis=1)
    # NOTE(review): only 'Y_SAR' is removed, so the 'announce' and 'FILTER'
    # columns stay inside the input matrix - confirm this is intended.
    inputs[feature_feat] = (inputs[feature_feat] - feat_floor) / feat_range
    return inputs.values


label    = {}
announce = {}
FILTER   = {}
GRUdict  = {}
# Scaled matrices of the months inside the current window.  Consecutive
# windows share three months, so each month is scaled only once.
scaled_by_date = {}
for n in range(WINDOW_LEN - 1, len(list_date_seq)):
    target_date = list_date_seq[n]
    print (target_date)

    # Newest month first: [n, n-1, n-2, n-3].
    # NOTE(review): axis 1 therefore runs backwards in time - confirm the
    # downstream model expects that order.
    window_dates = [list_date_seq[n - lag] for lag in range(WINDOW_LEN)]
    for date_key in window_dates:
        if date_key not in scaled_by_date:
            scaled_by_date[date_key] = _scaled_month_matrix(AML_data[date_key])

    # Result shape: (rows, WINDOW_LEN, columns).
    # NOTE(review): rows are paired by position, which assumes every month
    # lists the same customers in the same order - confirm.
    GRUdict[target_date] = np.stack([scaled_by_date[date_key] for date_key in window_dates], axis=1)
    # The oldest month of this window is not part of the next one.
    del scaled_by_date[window_dates[-1]]

    label[target_date]    = AML_data[target_date]['Y_SAR'].values
    announce[target_date] = AML_data[target_date]['announce'].values
    FILTER[target_date]   = AML_data[target_date]['FILTER'].values

(2018, 4)
(2018, 5)
(2018, 6)
(2018, 7)
(2018, 8)
(2018, 9)
(2018, 10)
(2018, 11)
(2018, 12)
(2019, 1)
(2019, 2)
(2019, 3)
(2019, 4)
(2019, 5)
(2019, 6)


In [17]:
# Write one .npz archive per windowed month.  The four arrays are passed
# positionally (NumPy names them arr_0 .. arr_3 inside the archive):
# windows, labels, announce flags, FILTER flags.
for year, month in list_date_seq[3:]:
    month_key = (year, month)
    out_path = '../../user_data/CloudMile/data/data_{}_{}.npz'.format(year, month)
    np.savez(out_path,
             GRUdict[month_key],
             label[month_key],
             announce[month_key],
             FILTER[month_key])

In [255]:
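# Train / test month split, currently disabled.
# Presumably the split behind `training_date` / `testing_date` in the
# concatenation step - TODO confirm, or delete this cell.
# testing_date   = [(2018, 12),(2019, 1),(2019, 2),
#                   (2019, 3),(2019, 4), (2019, 5),(2019, 6)]
# training_date  = [(2018, 5),
#                   (2018, 6),(2018, 7), (2018, 8),
#                   (2018, 9),(2018, 10),(2018, 11)]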

In [None]:
# Consecutive months from May 2018 through June 2019 as (year, month) tuples.
date_list = ([(2018, month) for month in range(5, 13)] +
             [(2019, month) for month in range(1, 7)])

In [256]:
# `training_date` / `testing_date` were only defined in a commented-out cell,
# so this cell failed with NameError on a fresh kernel.  Define the split
# here from `date_list`: the first 7 months (2018-05 .. 2018-11) train and
# the remaining 7 (2018-12 .. 2019-06) test, which matches the commented-out
# lists exactly.
N_TRAINING_MONTHS = 7
training_date = date_list[:N_TRAINING_MONTHS]
testing_date  = date_list[N_TRAINING_MONTHS:]


def _concat_months(per_month, dates):
    """Concatenate the arrays stored under `dates` in `per_month` along axis 0."""
    return np.concatenate([per_month[date] for date in dates])


training_data     = _concat_months(GRUdict, training_date)
testing_data      = _concat_months(GRUdict, testing_date)

training_label    = _concat_months(label, training_date)
testing_label     = _concat_months(label, testing_date)

training_announce = _concat_months(announce, training_date)
testing_announce  = _concat_months(announce, testing_date)

training_FILTER   = _concat_months(FILTER, training_date)
testing_FILTER    = _concat_months(FILTER, testing_date)

In [259]:
# GRUArray = np.vstack(list(GRUdict.values()))

# Persist the two splits.  Each archive holds, in positional order:
# windows, labels, announce flags, FILTER flags.
training_arrays = (training_data, training_label, training_announce, training_FILTER)
testing_arrays  = (testing_data, testing_label, testing_announce, testing_FILTER)

np.savez('../../user_data/CloudMile/data/Training_data_with_embed_ep600.npz', *training_arrays)
np.savez('../../user_data/CloudMile/data/Testing_data_with_embed_ep600.npz', *testing_arrays)