In [14]:
import networkx as nx
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import numpy as np
import math
import pickle
from tqdm import tqdm_notebook
from multiprocessing import Pool
import math
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import warnings
warnings.filterwarnings("ignore")

In [53]:
#
# Parameter Settings
# --------------------------------------------------------------------------------------------------------------------------
# Number of columns of every monthly projector matrix (second entry of the
# shapes stored in `projectors_shape`).
latent_dim = 300
# Number of passes of the training loop.
epochs = 300
# Number of months in the sequence. NOTE: the training cell overwrites this
# with len(list_date_seq), so the value set here is only a placeholder.
seq_length = 8
# Step size of the gradient updates applied to the projectors.
lr = 1e-6
# Weight of the penalty that ties the squared norm of a projector to Z.
param_lambda = 0.2
# Target value for the sum of squared entries of a projector.
Z = 1e5
# Connected components with fewer nodes than this get no spectral embedding
# (passed to `eigen_maps`).
min_num_nodes = 6
# Edge-weight normalization; one of [log, min-max]. Also part of the output file names.
weight_norm_type = 'log'
# Number of months of transactions before the target month that are included in
# each monthly graph (passed as `trans_offset`); also part of the output paths.
offset = 6

## Data Loading and Graph Constructing

In [3]:
# Customer master data, with the account opening date parsed into `open_date`.
CustInfo = pd.read_csv('../../AMLAML/customerinformation.csv').assign(
    open_date=lambda frame: pd.to_datetime(frame['Open_Date']))

# SAR cases, with the creation date parsed into `created_date`.
SARCase = pd.read_csv('../../AMLAML/sarcase.csv').assign(
    created_date=lambda frame: pd.to_datetime(frame['Created_Date']))

# Wire transactions, with the transaction date parsed into `trans_date`.
WireTrans = pd.read_csv('../data/new_wire.csv').assign(
    trans_date=lambda frame: pd.to_datetime(frame['trandt']))
WireTrans.head()

Unnamed: 0,Acct_Txn_Amt,acctrxncurrencycode,Agent_Id,appl,As_Of_Date,Bank_Country_Code,bnf_id,bnfbankcountrycode,bnf,Bnf_Bank_Name,...,Account_Nbr_10,Account_Nbr_11,customerno,customer_to,customer_from,customer_to2,customer_from2,trans_date,org_cust,bnf_cust
0,345000.0,USD,,SAV,06JUN2019,HK,534718887838,HK,,HSBCHKHHHKH,...,DgUFAQUCBnQKBCM=,DgUFAQUCBnQKBCM=,BgcDAgcBIw==,AgYFBwQKDnwOCQQj,DgUFAQUDBHYLAwQj,extra_vertex,extra_vertex,2017-01-25,BgcDAgcBIw==,AgYFBwQKDnwOCQQj
1,5647.0,USD,,SAV,06JUN2019,HK,904138007505,,,,...,DgUFAQYKBnQOBCM=,DgUFAQYKBnQOBCM=,DwUBBwACIw==,DwUBBwADIw==,fnACBnd9cAUACAcj,DwUBBwADIw==,extra_vertex,2017-01-04,fnACBnd9cAUACAcj,DwUBBwACIw==
2,4450000.0,USD,,SAV,06JUN2019,HK,656157759287,CN,,BKCHCNBJ400,...,DgUFAQYHBnUBASM=,DgUFAQYHBnUBASM=,AgUACAUAIw==,AQAHAQAFAXEAAw8j,AgUACAUHIw==,extra_vertex,AgUACAUHIw==,2017-01-13,AgUACAUAIw==,AQAHAQAFAXEAAw8j
3,19090.05,EUR,,SAV,06JUN2019,HK,360166300,DE,,COBADEFF670,...,DgUFAQYHBncMBiM=,DgUFAQYHBncMBiM=,AgUCBQIAIw==,BAMBAQMEBXQJIw==,AgUCBQIFIw==,extra_vertex,AgUCBQIFIw==,2017-01-19,AgUCBQIAIw==,BAMBAQMEBXQJIw==
4,238369.08,USD,,SAV,06JUN2019,HK,904133012003,,,,...,DgUFAQYBBnULASM=,DgUFAQYBBnULASM=,BAUAAgUCIw==,BAUAAgUCIw==,DwMAAQ0BBXMLCCM=,BAUAAgUCIw==,extra_vertex,2017-01-16,DwMAAQ0BBXMLCCM=,BAUAAgUCIw==


## ```WIRE_AMTIN``` & ```WIRE_AMTOT``` Normalization
```Acct_Txn_Amt``` is recorded in terms of the currency of that transaction.
```WIRE_AMTOT``` and ```WIRE_AMTIN``` are recorded in terms of the same currency via currency conversion.
Because account balances are not available, account-wise normalization cannot be applied.
The normalization methods considered are:
+ log normalization
+ min-max normalization (selectable through `weight_norm_type`, but only the log branch is implemented in the cell below)

In [18]:
# Log normalization: log(1 + amount).
# The columns are transformed as whole vectors; a row-by-row `DataFrame.apply`
# with a Python lambda is needlessly slow on a frame with over a million rows.
if weight_norm_type == 'log':
    WireTrans["norm_wire_amt_out"] = np.log1p(WireTrans["WIRE_AMTOT"])
    WireTrans["norm_wire_amt_in"] = np.log1p(WireTrans["WIRE_AMTIN"])
    # One weight per transaction: the incoming amount when it is positive,
    # otherwise the outgoing amount.
    WireTrans["norm_wire_amt"] = WireTrans["norm_wire_amt_in"].where(
        WireTrans["norm_wire_amt_in"] > 0, WireTrans["norm_wire_amt_out"])
else:
    # Without this branch the notebook only fails later, with a KeyError on
    # 'norm_wire_amt' when the monthly graphs are built.
    raise NotImplementedError(
        "weight_norm_type={!r} is not implemented; only 'log' is available".format(weight_norm_type))

# Kept at cell level: Jupyter only renders the last *top-level* expression, so
# the same line nested inside the `if` would display nothing.
WireTrans.loc[:, ["WIRE_AMTIN", "WIRE_AMTOT", "norm_wire_amt", "norm_wire_amt_in", "norm_wire_amt_out"]]

Unnamed: 0,WIRE_AMTIN,WIRE_AMTOT,norm_wire_amt,norm_wire_amt_in,norm_wire_amt_out
0,0.000,2676372.000,14.800,0.000,14.800
1,43801.520,0.000,10.687,10.687,0.000
2,0.000,34509750.000,17.357,0.000,17.357
3,0.000,157502.460,11.967,0.000,11.967
4,1848456.870,0.000,14.430,14.430,0.000
...,...,...,...,...,...
1407527,0.000,282189.600,12.550,0.000,12.550
1407528,0.000,51477.580,10.849,0.000,10.849
1407529,86411.270,0.000,11.367,11.367,0.000
1407530,0.000,940427.400,13.754,0.000,13.754


### Fetch data by date

In [19]:
class DateYM:
    """Mutable (year, month) pair supporting month arithmetic and comparison.

    The month is stored zero-based (0 = January ... 11 = December) so that
    year carries can be computed with integer division; `export_tuple`
    converts back to the usual one-based month.
    """

    def __init__(self, year, month):
        """Create the pair from a calendar year and a one-based month (1-12)."""
        self.year = year
        self.month = month - 1    # 0 ~ 11, from Jan to Dec

    def _month_index(self):
        """Number of months since year 0; used to compare two instances."""
        return self.year * 12 + self.month

    def export_tuple(self):
        """Return `(year, month)` with a one-based month."""
        return (self.year, self.month + 1)

    def add_year(self, y):
        """Move forward by `y` years (in place)."""
        self.year += y

    def subtract_year(self, y):
        """Move backward by `y` years (in place)."""
        self.year -= y

    # The method was originally published under this misspelled name; keep it
    # callable so existing code does not break.
    substract_year = subtract_year

    def add_month(self, m):
        """Move forward by `m` months (in place), carrying into the year."""
        carry, self.month = divmod(self.month + m, 12)
        self.year += carry

    def subtract_month(self, m):
        """Move backward by `m` months (in place), borrowing from the year."""
        # divmod floors towards minus infinity, so a negative month count
        # borrows correctly from the year.
        self.add_month(-m)

    def is_larger_than(self, ym):
        """True when this month is strictly later than `ym`."""
        return self._month_index() > ym.year * 12 + ym.month

    def is_smaller_than(self, ym):
        """True when this month is strictly earlier than `ym`."""
        # The first parameter used to be misspelled `slef`, so any call
        # raised NameError on `self`.
        return self._month_index() < ym.year * 12 + ym.month

    def is_equal(self, ym):
        """True when both instances denote the same year and month."""
        return self._month_index() == ym.year * 12 + ym.month

    
def list_date_tuples(from_date, to_date):
    """List every month from `from_date` to `to_date`, both inclusive.

    Both arguments are `DateYM` instances; neither is modified. The result is
    a list of `(year, month)` tuples with one-based months, in chronological
    order, and is empty when `from_date` is later than `to_date`.
    """
    # Work on a copy so the caller's start date is left untouched.
    cursor = DateYM(*from_date.export_tuple())
    months = []
    while True:
        if cursor.is_larger_than(to_date):
            break
        months.append(cursor.export_tuple())
        cursor.add_month(1)
    return months


def fetch_data_by_month(date_ym, trans_offset=6):
    """Select the transactions and labelled customers for one target month.

    Parameters
    ----------
    date_ym : tuple
        `(year, month)` of the target month, with a one-based month.
    trans_offset : int
        Number of months before the target month whose transactions are
        included as well.

    Returns
    -------
    view_wiretrans : DataFrame
        Rows of the global `WireTrans` with `trans_date` in the half-open
        window [first day of target month - trans_offset months,
        first day of the following month).
    view_customer : DataFrame
        Copy of the rows of the global `CustInfo` opened before the end of the
        target month, with a 0/1 `label` column: 1 when the customer has a
        `SARCase` row with `Status_SAR == 4` created within the target month.

    Notes
    -----
    The lower bounds of both windows are inclusive. They used to be strict
    (`>`), which dropped every row stamped exactly at midnight on the first
    day of the window -- for date-only columns, the whole first day.
    Artifacts produced with the old window (eigenmaps, node dictionaries)
    should be regenerated so that they match the graphs built from this view.
    """
    year, month = date_ym

    # Get view: WireTrans
    from_date = pd.Timestamp(year=year, month=month, day=1)
    to_date = from_date + pd.DateOffset(months=1)
    offset_date = from_date - pd.DateOffset(months=trans_offset)
    in_window = (WireTrans.trans_date >= offset_date) & (WireTrans.trans_date < to_date)
    view_wiretrans = WireTrans[in_window]

    # Get view: CustInfo. Copy, because a column is added below and writing
    # into a slice of CustInfo is a chained assignment.
    view_customer = CustInfo[CustInfo.open_date < to_date].copy()

    # Attach label onto CustInfo
    # NOTE(review): Status_SAR == 4 is presumably the "SAR filed" status -- confirm.
    sar_in_month = ((SARCase.Status_SAR == 4) &
                    (SARCase.created_date >= from_date) &
                    (SARCase.created_date < to_date))
    target_list = SARCase.loc[sar_in_month, 'customerno'].unique()
    view_customer['label'] = view_customer['customerno'].isin(target_list).astype(int)

    return view_wiretrans, view_customer

In [21]:
#
# Find all nodes across the sequential graphs
# --------------------------------------------------------------------------------------------------------------------------
# Ids are assigned in order of first appearance in the transaction table.
# Iterating a set of strings gives a different order in every kernel (string
# hashing is randomized), which made the pickled dictionaries irreproducible.
list_nodes_key = list(pd.concat([WireTrans.org_cust, WireTrans.bnf_cust]).unique())
dict_nodes_key2int = {key: index for index, key in enumerate(list_nodes_key)}
dict_nodes_int2key = dict(enumerate(list_nodes_key))
with open('dict_nodes_key2int.pickle', 'wb') as f:
    pickle.dump(dict_nodes_key2int, f)
with open('dict_nodes_int2key.pickle', 'wb') as f:
    pickle.dump(dict_nodes_int2key, f)

## Compute Eigenmaps of a Weighted Graph Laplacian

In [29]:
def eigen_maps(G, min_num_nodes=5, max_eig_retries=10):
    """Spectral embedding of `G`, computed separately per connected component.

    The graph is split into connected components and a spectral analysis of
    the normalized Laplacian is run on each sufficiently large component.

    Parameters
    ----------
    G : networkx graph
        Its nodes must be the integers 0 .. N-1, because node labels are used
        directly as row indices of the returned array.
    min_num_nodes : int
        Components with fewer nodes only get their indicator column.
    max_eig_retries : int
        Maximum number of attempts of the ARPACK eigen-solver per component.

    Returns
    -------
    ndarray with N rows. The first `number of components` columns are the
    one-hot component indicators. Each processed component then contributes,
    per retained eigenvector (eigenvalue strictly between 0 and 0.6), either
    the eigenvector itself or -- when it has both positive and negative
    entries -- a sign column followed by an absolute-value column. Rows of
    nodes outside the component are zero.

    Raises
    ------
    RuntimeError
        When the eigen-solver fails `max_eig_retries` times on a component.
        The previous code retried forever behind a bare `except`, which hangs
        on a deterministic failure and also swallows KeyboardInterrupt.

    NOTE(review): the Laplacian is built with networkx's default edge-weight
    key. The caller in this notebook stores amounts under 'norm_wire_amt', so
    confirm whether the transaction weights are actually being used.
    """
    # Imported here so the `scipy.sparse.linalg` submodule is guaranteed to be
    # loaded; a bare `import scipy` does not promise that.
    from scipy.sparse.linalg import ArpackError, eigs

    # Get all the connected components
    print('  |-- Get all the connected components ...')
    subVs = list(nx.connected_components(G))

    # Initialize the vertex embedding with one-hot indexing of components
    print('  |-- Initialize the vertex embedding ...')
    num_nodes = G.number_of_nodes()
    indicator = np.zeros([num_nodes, len(subVs)])
    for index, subV in enumerate(subVs):
        indicator[list(subV), index] = 1
    # Column blocks are collected and joined once at the end; growing the
    # array with np.append copied the whole embedding for every component.
    blocks = [indicator]

    # Do spectral analysis respectively
    print('  |-- Graph Processing ...')
    for subV in subVs:
        if len(subV) < min_num_nodes:
            continue
        subG = G.subgraph(subV)
        subV2V = list(subG.nodes())  # row order of the component's Laplacian
        num_eig = min(6, len(subV2V) - 2)
        if num_eig < 1:
            # The solver needs 1 <= k <= n - 2; nothing to compute here.
            continue
        # Built only for components that are processed.
        subL = nx.normalized_laplacian_matrix(subG)

        # Compute eigenmaps. ARPACK's default start vector is random, so a
        # failed run can succeed when repeated -- but only retry on solver
        # errors, and only a bounded number of times.
        last_error = None
        for _ in range(max_eig_retries):
            try:
                eigval_subL, eigvec_subL = eigs(subL, k=num_eig, which="SR")
                break
            except ArpackError as err:
                last_error = err
                print("Re-run eigenmaps ...")
        else:
            raise RuntimeError(
                "eigen-decomposition of a component with {} nodes failed after {} attempts".format(
                    len(subV2V), max_eig_retries)) from last_error

        eigval_subL = np.real(eigval_subL)
        sorted_index = np.argsort(eigval_subL)
        eigval_subL = eigval_subL[sorted_index]
        eigvec_subL = np.real(eigvec_subL)[:, sorted_index]

        # Discard not important (i.e., zero and large) eigenvectors
        keep = (eigval_subL > 0) & (eigval_subL < 0.6)
        tmp_vec = eigvec_subL[:, keep]
        if tmp_vec.shape[1] == 0:
            continue

        # Split every mixed-sign eigenvector into a sign column and a
        # magnitude column. Walking from the last column to the first keeps
        # the indices of the not-yet-visited columns valid while inserting.
        for i in range(tmp_vec.shape[1] - 1, -1, -1):
            if np.sum(tmp_vec[:, i] > 0) > 0 and np.sum(tmp_vec[:, i] < 0) > 0:
                tmp_vec = np.insert(tmp_vec, i, np.sign(tmp_vec[:, i]), axis=1)
                tmp_vec[:, i + 1] = np.absolute(tmp_vec[:, i + 1])

        # Scatter the component's rows into a block covering all vertices.
        tmp_embedding = np.zeros([num_nodes, tmp_vec.shape[1]])
        tmp_embedding[subV2V, :] = tmp_vec
        blocks.append(tmp_embedding)

    return np.concatenate(blocks, axis=1)

In [30]:
import os

from_date_ym = DateYM(2017, 1)
to_date_ym = DateYM(2019, 8)

list_date_seq = list_date_tuples(from_date_ym, to_date_ym)
projectors_shape = {}

# Neither np.savez_compressed nor open(..., 'wb') creates missing directories,
# so a fresh checkout used to fail on the first write.
for out_dir in ("eigen/offset_{}/latest".format(offset), "offset_{}/latest".format(offset)):
    os.makedirs(out_dir, exist_ok=True)

for date_ym in list_date_seq:
    print("Processing the data in {}".format(date_ym))

    # Fetch data by month
    view_wiretrans, view_customer = fetch_data_by_month(date_ym, trans_offset=offset)

    # Construct graph
    # NOTE(review): the amount is stored under the edge attribute
    # 'norm_wire_amt'; confirm that `eigen_maps` really uses it as the edge
    # weight, since networkx's Laplacian helpers read 'weight' by default.
    G_t = nx.from_pandas_edgelist(view_wiretrans, 'org_cust', 'bnf_cust', edge_attr='norm_wire_amt')
    # Month-local integer ids, in graph node order.
    dict_sub_nodes_key2int = {key: index for index, key in enumerate(G_t.nodes())}
    dict_sub_nodes_int2key = dict(enumerate(G_t.nodes()))

    G_t = nx.relabel_nodes(G_t, dict_sub_nodes_key2int)
    G_embedding = eigen_maps(G_t, min_num_nodes=min_num_nodes)

    # Write eigenmap into a file (savez_compressed appends '.npz')
    eigen_path = "eigen/offset_{}/latest/weight_{}_eigenmap_{}-{}".format(
        offset, weight_norm_type, date_ym[0], date_ym[1])
    np.savez_compressed(eigen_path, G_embedding)

    # Write dict int2key / key2int into files
    key2int_path = "offset_{}/latest/weight_{}_nodes_key2int_{}-{}.pickle".format(
        offset, weight_norm_type, date_ym[0], date_ym[1])
    with open(key2int_path, 'wb') as f:
        pickle.dump(dict_sub_nodes_key2int, f)
    int2key_path = "offset_{}/latest/weight_{}_nodes_int2key_{}-{}.pickle".format(
        offset, weight_norm_type, date_ym[0], date_ym[1])
    with open(int2key_path, 'wb') as f:
        pickle.dump(dict_sub_nodes_int2key, f)

    # The projector of this month maps the eigenmap columns to latent_dim.
    projectors_shape[date_ym] = (G_embedding.shape[1], latent_dim)
    print("SubGraph {}: has {} vertices, {} edges; Embedding shape: {}".
          format(date_ym, G_t.number_of_nodes(), G_t.number_of_edges(), G_embedding.shape))

Processing the data in (2017, 1)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2017, 1): has 27473 vertices, 26782 edges; Embedding shape: (27473, 3777)
Processing the data in (2017, 2)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2017, 2): has 38665 vertices, 39814 edges; Embedding shape: (38665, 4380)
Processing the data in (2017, 3)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2017, 3): has 50581 vertices, 54237 edges; Embedding shape: (50581, 4886)
Processing the data in (2017, 4)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2017, 4): has 58216 vertices, 63724 edges; Embedding shape: (58216, 5156)
Processing the data in (2017, 5)
  |-- Get all the connected components ...
  |-- In

In [50]:
# Persist the per-month projector shapes next to the eigenmaps.
projectors_shape_path = 'eigen/offset_{}/latest/weight_{}_projectors_shape'.format(offset, weight_norm_type)
with open(projectors_shape_path, 'wb') as shape_file:
    pickle.dump(projectors_shape, shape_file)

4 steps to prepare files:
+ Read sequential graphs;
+ Convert to symmetric graph Laplacian;
+ Spectral analysis;
+ Write the eigenmaps into files.


## Embedding Projection on a Dynamic graph

In [31]:
def row_normalize(M):
    """Scale every row of `M` so that it sums to 1.

    Rows whose sum is 0 are left as all zeros. `M` is a scipy sparse matrix
    (a dense 2-D array works as well); the result is the product of a
    diagonal scaling matrix with `M`.
    """
    from scipy import sparse

    # Force a float vector: for an integer matrix (e.g. a 0/1 adjacency
    # matrix) the reciprocals used to be written back into an integer array
    # and truncated to 0 for every row sum above 1.
    row_sums = np.asarray(M.sum(axis=1), dtype=float).ravel()
    has_mass = row_sums != 0
    inv_sums = np.zeros_like(row_sums)
    inv_sums[has_mass] = 1.0 / row_sums[has_mass]
    # The scaler is (rows x rows), so non-square M is supported as well.
    num_rows = row_sums.shape[0]
    Q = sparse.spdiags(inv_sums, 0, num_rows, num_rows, format='csr')
    return Q.dot(M)

In [32]:
def matrix_index_map(from_matrix, from_int2key, to_key2int):
    """Re-index the rows of `from_matrix` from one key numbering to another.

    Row `i` of `from_matrix` belongs to key `from_int2key[i]` and is copied to
    row `to_key2int[key]` of the result. Rows whose key is missing from
    `to_key2int` are dropped; result rows that receive no source row stay
    zero. The result has `len(to_key2int)` rows and the same number of
    columns as `from_matrix`.
    """
    num_src_rows = from_matrix.shape[0]
    # Resolve every destination row first (-1 marks "key not present").
    dst_rows = [to_key2int.get(from_int2key[src_row], -1) for src_row in range(num_src_rows)]

    remapped = np.zeros((len(to_key2int), from_matrix.shape[1]))
    for src_row, dst_row in enumerate(dst_rows):
        if dst_row != -1:
            remapped[dst_row, :] = from_matrix[src_row, :]
    return remapped

In [47]:
def update_params(t, num_inner_steps=3):
    """Run gradient steps on the projector of month `t` and of its neighbours.

    For month `t` the gradient combines three terms:
      * smoothness over the graph: (E - A E)^T (E F - A E F), with A the
        row-normalized adjacency matrix and E the eigenmap of the month;
      * temporal consistency: E^T (E F - neighbour embedding), once for every
        neighbouring month that exists, with the neighbour embedding
        re-indexed to month `t`'s node numbering;
      * norm penalty: param_lambda * (sum(F**2) - Z) * F.
    The projectors of months t+1 and t-1 (when present) are then updated with
    the analogous gradient, using month `t`'s adjacency matrix.

    Parameters
    ----------
    t : int
        Index into the global `list_date_seq`.
    num_inner_steps : int
        Number of consecutive gradient steps.

    Returns
    -------
    (date_ym, F_t, F_t_plus, F_t_minus); the neighbour entries are None at
    the ends of the sequence.

    Notes
    -----
    * Arrays are updated in place and written back to the global
      `projectors`. When this runs in a worker process, that only changes the
      worker's copy; the caller must use the return value.
    * The graph is built with `trans_offset=offset` (it used to be the
      literal 6), so it matches the eigenmap and node-dictionary files, whose
      paths are keyed by `offset`.
    * The sequence length is taken from `list_date_seq` rather than the
      mutable global `seq_length`.
    * A one-month sequence no longer crashes; the temporal term is dropped.
    * NOTE(review): the adjacency matrix uses networkx's default edge-weight
      key; confirm that the intended transaction weights are picked up.
    * NOTE(review): the penalty gradient omits the constant factor that
      differentiating the squared penalty in `validate` would give; left as is.
    """
    date_ym = list_date_seq[t]
    num_months = len(list_date_seq)

    def load_month(ym):
        """Eigenmap, projector and node dictionaries stored for month `ym`."""
        tag = (offset, weight_norm_type, ym[0], ym[1])
        eigenmap = np.load('eigen/offset_{}/latest/weight_{}_eigenmap_{}-{}.npz'.format(*tag))['arr_0']
        projector = projectors[ym]
        with open("offset_{}/latest/weight_{}_nodes_key2int_{}-{}.pickle".format(*tag), 'rb') as f:
            key2int = pickle.load(f)
        with open("offset_{}/latest/weight_{}_nodes_int2key_{}-{}.pickle".format(*tag), 'rb') as f:
            int2key = pickle.load(f)
        return eigenmap, projector, key2int, int2key

    view_wiretrans, _ = fetch_data_by_month(date_ym, trans_offset=offset)
    G_t = nx.from_pandas_edgelist(view_wiretrans, 'org_cust', 'bnf_cust', edge_attr=True)

    E_t, F_t, key2int_t, int2key_t = load_month(date_ym)
    G_t = nx.relabel_nodes(G_t, key2int_t)
    A_t = row_normalize(nx.to_scipy_sparse_matrix(G_t))

    if t + 1 < num_months:
        date_ym_plus = list_date_seq[t + 1]
        E_t_plus, F_t_plus, key2int_t_plus, int2key_t_plus = load_month(date_ym_plus)
    else:
        date_ym_plus = None
        E_t_plus = F_t_plus = key2int_t_plus = int2key_t_plus = None

    if t >= 1:
        date_ym_minus = list_date_seq[t - 1]
        E_t_minus, F_t_minus, key2int_t_minus, int2key_t_minus = load_month(date_ym_minus)
    else:
        date_ym_minus = None
        E_t_minus = F_t_minus = key2int_t_minus = int2key_t_minus = None

    def neighbour_embedding_in_t(E_nb, F_nb, int2key_nb):
        """Embedding E_nb.F_nb of a neighbour month, in month t's row order."""
        return matrix_index_map(from_matrix=np.dot(E_nb, F_nb),
                                from_int2key=int2key_nb,
                                to_key2int=key2int_t)

    def neighbour_step(E_nb, F_nb, key2int_nb, int2key_nb):
        """One in-place gradient step on a neighbour month's projector."""
        # Propagate the neighbour's eigenmap over month t's graph, then bring
        # the result back to the neighbour's row order.
        tran = A_t.dot(matrix_index_map(from_matrix=E_nb,
                                        from_int2key=int2key_nb,
                                        to_key2int=key2int_t))
        tran = matrix_index_map(from_matrix=tran,
                                from_int2key=int2key_t,
                                to_key2int=key2int_nb)
        emb = np.dot(E_nb, F_nb)
        tmp = np.sum(F_nb ** 2) - Z
        tmp2 = matrix_index_map(from_matrix=np.dot(E_t, F_t),
                                from_int2key=int2key_t,
                                to_key2int=key2int_nb)
        grad = np.dot((E_nb - tran).T, (emb - np.dot(tran, F_nb))) + \
               np.dot(E_nb.T, (emb - tmp2)) + \
               param_lambda * tmp * F_nb
        F_nb -= lr * grad

    for _ in range(num_inner_steps):
        tran = A_t.dot(E_t)
        emb = np.dot(E_t, F_t)
        tmp = np.sum(F_t ** 2) - Z

        # The terms are summed in the order smoothness, temporal, penalty.
        grad_F_t = np.dot((E_t - tran).T, (emb - np.dot(tran, F_t)))
        if F_t_plus is None and F_t_minus is None:
            pass  # single-month sequence: no temporal term
        elif F_t_plus is None:
            tmp2 = neighbour_embedding_in_t(E_t_minus, F_t_minus, int2key_t_minus)
            grad_F_t += np.dot(E_t.T, (emb - tmp2))
        elif F_t_minus is None:
            tmp2 = neighbour_embedding_in_t(E_t_plus, F_t_plus, int2key_t_plus)
            grad_F_t += np.dot(E_t.T, (emb - tmp2))
        else:
            tmp2 = neighbour_embedding_in_t(E_t_minus, F_t_minus, int2key_t_minus)
            tmp3 = neighbour_embedding_in_t(E_t_plus, F_t_plus, int2key_t_plus)
            grad_F_t += np.dot(E_t.T, (2 * emb - tmp2 - tmp3))
        grad_F_t += param_lambda * tmp * F_t
        F_t -= lr * grad_F_t

        # The neighbours are updated against the freshly updated F_t.
        if F_t_plus is not None:
            neighbour_step(E_t_plus, F_t_plus, key2int_t_plus, int2key_t_plus)
        if F_t_minus is not None:
            neighbour_step(E_t_minus, F_t_minus, key2int_t_minus, int2key_t_minus)

    projectors[date_ym] = F_t
    if F_t_plus is not None:
        projectors[date_ym_plus] = F_t_plus
    if F_t_minus is not None:
        projectors[date_ym_minus] = F_t_minus

    return (date_ym, F_t, F_t_plus, F_t_minus)

In [48]:
def validate(t):
    """Loss of month `t` under the current global `projectors`.

    The loss is the sum of
      * 0.5 * ||E F - A E F||^2, with A the row-normalized adjacency matrix of
        the month's graph and E its eigenmap;
      * param_lambda * (sum(F**2) - Z)^2;
      * 0.5 * ||E F - neighbour embedding||^2 for each of the months t+1 and
        t-1 that exists, with the neighbour embedding re-indexed to month
        `t`'s node numbering.

    Parameters
    ----------
    t : int
        Index into the global `list_date_seq`.

    Returns
    -------
    float-like scalar loss.

    Notes
    -----
    The graph is built with `trans_offset=offset` (it used to be the literal
    6) so that it matches the files keyed by `offset`, and the sequence length
    is taken from `list_date_seq` instead of the mutable global `seq_length`.
    Neighbour eigenmaps are released as soon as their embedding is computed,
    which lowers peak memory compared with holding all three at once.
    """
    date_ym = list_date_seq[t]
    num_months = len(list_date_seq)

    def month_file(template, ym):
        """Fill a per-month path template."""
        return template.format(offset, weight_norm_type, ym[0], ym[1])

    def load_embedding(ym):
        """Embedding E.F of month `ym` under the current projector."""
        eigenmap = np.load(month_file('eigen/offset_{}/latest/weight_{}_eigenmap_{}-{}.npz', ym))['arr_0']
        return np.dot(eigenmap, projectors[ym])

    view_wiretrans, _ = fetch_data_by_month(date_ym, trans_offset=offset)
    G_t = nx.from_pandas_edgelist(view_wiretrans, 'org_cust', 'bnf_cust', edge_attr=True)

    F_t = projectors[date_ym]
    emb_t = load_embedding(date_ym)
    with open(month_file("offset_{}/latest/weight_{}_nodes_key2int_{}-{}.pickle", date_ym), 'rb') as f:
        key2int_t = pickle.load(f)
    G_t = nx.relabel_nodes(G_t, key2int_t)
    A_t = row_normalize(nx.to_scipy_sparse_matrix(G_t))

    loss = 0.5 * np.sum((emb_t - A_t * emb_t) ** 2) + param_lambda * (np.sum(F_t ** 2) - Z) ** 2

    # Temporal terms: next month first, then previous month.
    for neighbour in (t + 1, t - 1):
        if not 0 <= neighbour < num_months:
            continue
        date_ym_nb = list_date_seq[neighbour]
        emb_nb = load_embedding(date_ym_nb)
        with open(month_file("offset_{}/latest/weight_{}_nodes_int2key_{}-{}.pickle", date_ym_nb), 'rb') as f:
            int2key_nb = pickle.load(f)
        tmp = matrix_index_map(from_matrix=emb_nb,
                               from_int2key=int2key_nb,
                               to_key2int=key2int_t)
        loss += 0.5 * np.sum((emb_t - tmp) ** 2)
    return loss

In [42]:
# Random initial projector per month, uniform in [0, 1).
# The generator is seeded so that "Restart & Run All" starts training from
# the same point every time; the initialisation used to be unseeded.
PROJECTOR_SEED = 0
projector_rng = np.random.RandomState(PROJECTOR_SEED)
projectors = {date_ym: projector_rng.rand(*projectors_shape[date_ym])
              for date_ym in list_date_seq}

{(2017,
  1): array([[0.52607174, 0.29247956, 0.37564495, ..., 0.43726186, 0.18452044,
         0.46458377],
        [0.34831817, 0.09368581, 0.43611474, ..., 0.46979976, 0.12464521,
         0.02623283],
        [0.30503242, 0.18663762, 0.25010656, ..., 0.12874941, 0.1983374 ,
         0.13407922],
        ...,
        [0.30100769, 0.15606785, 0.31933084, ..., 0.17849729, 0.42697177,
         0.27155101],
        [0.21091231, 0.12160213, 0.26173109, ..., 0.16875141, 0.27081603,
         0.04660214],
        [0.03948262, 0.08115797, 0.04328127, ..., 0.42494441, 0.05012726,
         0.04845521]]),
 (2017,
  2): array([[0.34758631, 0.4369188 , 0.20000385, ..., 0.41145867, 0.3248552 ,
         0.31079054],
        [0.02699876, 0.45426689, 0.05449086, ..., 0.39537239, 0.34267827,
         0.45606905],
        [0.04474652, 0.18265513, 0.14003476, ..., 0.04763881, 0.46138796,
         0.20908264],
        ...,
        [0.1176037 , 0.41334648, 0.46738504, ..., 0.33174151, 0.45288907,
        

In [60]:
import os

seq_length = len(list_date_seq)
# NOTE(review): every worker loads several large dense arrays; lower this if
# the workers run out of memory.
num_cores = 16

# np.save does not create missing directories.
os.makedirs('model/offset_{}/latest'.format(offset), exist_ok=True)

print("Training ......")
for epoch in range(epochs):
    #
    # Projectors training
    # -------------------------------------------------------------------------------------------------------------------
    # A new Pool is created for every pass on purpose: the workers read the
    # global `projectors`, and presumably (fork start method) only processes
    # started after the update below see the new values.
    # One progress bar per pass; wrapping the iterator in a second
    # tqdm_notebook used to render an extra, empty bar.
    with Pool(processes=num_cores) as p:
        res_list = list(tqdm_notebook(p.imap_unordered(update_params, range(seq_length)),
                                      total=seq_length))

    # Only the projector of each worker's own month is kept; the neighbour
    # projectors returned alongside it are not used.
    for res in res_list:
        key = res[0]
        value = res[1]
        projectors[key] = value

    #
    # Validation
    # -------------------------------------------------------------------------------------------------------------------
    with Pool(processes=num_cores) as p:
        res_list = list(tqdm_notebook(p.imap_unordered(validate, range(seq_length)),
                                      total=seq_length))

    # Save the checkpoint
    for date_ym in list_date_seq:
        np.save('model/offset_{}/latest/weight_{}_projctr_{}-{}'.format(
            offset, weight_norm_type, date_ym[0], date_ym[1]), projectors[date_ym])
    print("Epoch {} validation = {}".format(epoch, np.mean(res_list)))

Training ......


HBox(children=(IntProgress(value=0, max=32), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

## Feature Generation for ```CustInfo```

In [303]:
# Add column for Label: 1 when the customer has a SAR case with Status_SAR == 4.
# Column-wise operations replace the row-by-row `apply` calls used before.
target_list = SARCase[SARCase.Status_SAR == 4]['customerno'].unique()
CustInfo['label'] = CustInfo['customerno'].isin(target_list).astype(int)
pos_data_count = len(CustInfo[CustInfo.label==1])
neg_data_count = len(CustInfo[CustInfo.label==0])
total_data_count = len(CustInfo)
print('pos: {} / neg: {} / total: {} / ratio: {:.3f}%'.format(pos_data_count, neg_data_count, total_data_count, pos_data_count/total_data_count * 100))

# Processing Mailaddr_Country_Code: keep the listed countries, pool the rest
# (including missing values) into 'other'.
print('Processing **Mailaddr_Country_Code** ......')
target_list = ['CN', 'HK', 'TW']
CustInfo['Mailaddr_Country_Code2'] = CustInfo['Mailaddr_Country_Code'].where(
    CustInfo['Mailaddr_Country_Code'].isin(target_list), 'other')

# Processing Permaddr_Country_Code: same treatment with its own country list.
print('Processing **Permaddr_Country_Code** ......')
target_list = ['HK', 'KY', 'SC', 'CN', 'VG', 'TW']
CustInfo['Permaddr_Country_Code2'] = CustInfo['Permaddr_Country_Code'].where(
    CustInfo['Permaddr_Country_Code'].isin(target_list), 'other')

# Flag customers whose permanent address is in HK while the mailing address is in CN.
CustInfo['CN_love_HK'] = ((CustInfo['Permaddr_Country_Code'] == 'HK') &
                          (CustInfo['Mailaddr_Country_Code'] == 'CN')).astype(int)

# Discretize the value: years 1..7 become string categories, anything else NaN.
# The string keys make the cell re-runnable: after a first run the column
# already holds strings, and mapping only the integer keys would turn every
# value into NaN.
print('Discretizing **Business_Establish_Year** ......')
est_year_mapping = {year: str(year) for year in range(1, 8)}
est_year_mapping.update({str(year): str(year) for year in range(1, 8)})
CustInfo['Business_Establish_Year'] = CustInfo['Business_Establish_Year'].map(est_year_mapping)

# Select columns for training
print('Selecting columns for training ......')
used_columns = ['Business_Establish_Year', # 1, 2, 3, 4, 5 ; Year 2, 3 are more significant
                'Customer_Segment_Code',   # CB, RB, TS
                'Customer_Type_Code',      # N, P
                'Negative_News_Flag',      # N, Y, nan
                'Record_Status_Code',      # C, O
                'HighRisk',                # H, L, M
                'IsOtherListHit',          # Y, N, nan
                'Mailaddr_Country_Code2',
                'Permaddr_Country_Code2',  # $Nationality_Primary_Code is almost equivalent to $Mailaddr_Country_Code
                'CN_love_HK',              # 1, 0
                'IsSARFiled',              # NOTE(review): possibly leaks the SAR label -- confirm it is known before the case
                'label'
                ]
# `CustInfo` itself is left intact: it used to be overwritten with the column
# subset, which dropped `customerno` and `open_date` and broke every later
# call of `fetch_data_by_month`.
CustInfo_features = CustInfo[used_columns]
CustInfo2 = pd.get_dummies(CustInfo_features)

print('Converitng matrix for training ......')
CustInfo_colnames = list(CustInfo2.columns)
CustInfo_colnames.remove('label')
# `.values` works on every pandas version; `.as_matrix()` was removed in pandas 1.0.
X = CustInfo2[CustInfo_colnames].values
Y = CustInfo2['label'].values
# NOTE(review): `train_test_split` (sklearn.model_selection) is not imported in
# the cells shown here; add it to the import cell. No random_state is set
# either, so the split changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('X.shape = {}         Y.shape = {}'.format(X.shape, Y.shape))
print('X_train.shape = {}   y_train.shape = {}'.format(X_train.shape, y_train.shape))
print('X_test.shape = {}    y_test.shape = {}'.format(X_test.shape, y_test.shape))

In [302]:
# Gradient-boosted baseline on the static customer features.
# NOTE(review): XGBClassifier, cross_validate, joblib and evaluate are not
# imported or defined in the cells shown here -- confirm where they come from.
scoring = ['precision', 'recall']
clf_xgb = XGBClassifier(
    objective='rank:ndcg',
    n_estimators=500,
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=1,
)

# 5-fold cross-validation on the training split, then a final fit on all of it.
scores_clf_xgb = cross_validate(
    clf_xgb, X_train, y_train,
    cv=5,
    scoring=scoring,
    return_train_score=False,
)
clf_xgb.fit(X_train, y_train)
joblib.dump(clf_xgb, 'save/xgb-ndcg_customerno.pkl')

# Held-out evaluation.
y_predict = clf_xgb.predict(X_test)
evaluate(y_test, y_predict)

print(scores_clf_xgb)

(83812, 5738)

In [None]:
def gen_temporal_data_by_month(date_ym):
    """Build the feature matrix and labels of one month.

    The features are the one-hot encoded `used_columns` of the customers
    returned by `fetch_data_by_month`, followed by the customer's graph
    embedding (eigenmap row times projector). Customers that do not appear in
    the month's graph get an all-zero embedding.

    Parameters
    ----------
    date_ym : tuple
        `(year, month)` with a one-based month.

    Returns
    -------
    X : 2-D ndarray, one row per customer.
    Y : 1-D ndarray with the 0/1 labels.

    Notes
    -----
    Fixes relative to the previous draft of this cell:
      * the keyword of `fetch_data_by_month` is `trans_offset`, not `offset`;
      * customer ids are read from `view_customer`, because the one-hot frame
        is built from `used_columns`, which does not contain `customerno`;
      * `.as_matrix()` (removed in pandas 1.0) is replaced by `.values`;
      * the index list is materialised, since a `map` object cannot index an
        array, and customers missing from the graph no longer raise KeyError;
      * the artifact paths follow the layout written by the eigenmap and
        training cells of this notebook.
    NOTE(review): `get_dummies` is applied per month, so a category that is
    absent in one month yields fewer columns for that month -- confirm the
    column sets agree before stacking months.
    """
    year, month = date_ym

    view_wiretrans, view_customer = fetch_data_by_month(date_ym, trans_offset=0)
    view_customer_for_x = pd.get_dummies(view_customer[used_columns])

    # Fetch data from ```customer_info```
    CustInfo_colnames = list(view_customer_for_x.columns)
    CustInfo_colnames.remove('label')
    X = view_customer_for_x[CustInfo_colnames].values
    Y = view_customer_for_x['label'].values

    # Fetch graph embedding by customer ID
    tag = (offset, weight_norm_type, year, month)
    E_t = np.load('eigen/offset_{}/latest/weight_{}_eigenmap_{}-{}.npz'.format(*tag))['arr_0']
    F_t = np.load('model/offset_{}/latest/weight_{}_projctr_{}-{}.npy'.format(*tag))
    with open("offset_{}/latest/weight_{}_nodes_key2int_{}-{}.pickle".format(*tag), 'rb') as f:
        key2int_t = pickle.load(f)

    customer_id_list = view_customer['customerno'].tolist()
    # -1 marks customers that are not a node of this month's graph.
    customer_index = np.array([key2int_t.get(cid, -1) for cid in customer_id_list], dtype=int)
    in_graph = customer_index >= 0
    embeddings_for_X = np.zeros((len(customer_id_list), F_t.shape[1]))
    embeddings_for_X[in_graph] = np.dot(E_t[customer_index[in_graph], :], F_t)

    # Concatenate two matrices
    X = np.concatenate((X, embeddings_for_X), axis=1)

    return X, Y

In [None]:
import os

# np.savez_compressed does not create missing directories.
os.makedirs('data', exist_ok=True)

for date_ym in list_date_seq:
    X, Y = gen_temporal_data_by_month(date_ym)
    write_path = 'data/data_{}-{}'.format(date_ym[0], date_ym[1])
    # Y is one-dimensional, so np.concatenate((X, Y), axis=1) raises;
    # column_stack appends it as the last column.
    # savez_compressed appends '.npz' to the path.
    np.savez_compressed(write_path, np.column_stack((X, Y)))

In [None]:
seq_date_length = len(list_date_seq)

# The monthly files are written by np.savez_compressed, which appends '.npz';
# the former '.npy' paths did not exist.
# All months are read first and stacked once, instead of re-copying the
# growing array for every month.
monthly_data = [np.load('data/data_{}-{}.npz'.format(year, month))['arr_0']
                for year, month in list_date_seq]
total_data = np.concatenate(monthly_data, axis=0)

# The last column is the label, everything before it the features.
X = total_data[:, :-1]
Y = total_data[:, -1]
# NOTE(review): `train_test_split` (sklearn.model_selection) is not imported in
# the cells shown here; add it to the import cell.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('X.shape = {}         Y.shape = {}'.format(X.shape, Y.shape))
print('X_train.shape = {}   y_train.shape = {}'.format(X_train.shape, y_train.shape))
print('X_test.shape = {}    y_test.shape = {}'.format(X_test.shape, y_test.shape))

In [None]:
12878137837.260553
 5735842282.450901
 3056328313.988741
 1800709873.8203099
 1131526091.901606
  743538142.0896592
  504837082.71222484
  351397500.49108243
  249398018.66356814
   14005855.305017628
    8445861.31137953
     779599.615898475
     417115.88577040855
     397480.49516823917
     375443.98209077516
     364310.42852415366
     362366.62897386594
     360637.82810836774
     359085.03423782036

In [None]:
MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x3fff127efb00>'. Reason: 'PicklingError("Can't pickle <class 'MemoryError'>: it's not the same object as builtins.MemoryError",)'