In [1]:
import networkx as nx
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import numpy as np
import math
import pickle
from tqdm import tqdm_notebook
from multiprocessing import Pool
import math
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import warnings
warnings.filterwarnings("ignore")

In [21]:
#
# Parameter Settings
# --------------------------------------------------------------------------------------------------------------------------
latent_dim = 300
epochs = 300
seq_length = 8
lr = 1e-6
param_lambda = 0.2
Z = 1e5
min_num_nodes = 6
embed_dim_select = 1       # [1, 2, 3]
weight_type = 'embed_dim{}'.format(embed_dim_select)

## Fetch data by date

In [27]:
class DateYM:
    def __init__(self, year, month):
        self.year = year
        self.month = month - 1    # 0 ~ 11, from Jan to Dec
        
    def export_tuple(self):
        return (self.year, self.month+1)
    
    def add_year(self, y):
        self.year += y
        
    def substract_year(self, y):
        self.year -= y
        
    def add_month(self, m):
        self.month += m
        self.year += math.floor(self.month / 12)
        self.month = self.month % 12
        
    def subtract_month(self, m):
        self.month -= m
        tmp_year = math.floor(self.month / 12)
        self.year += tmp_year
        self.month += -tmp_year * 12
        
    def is_larger_than(self, ym):
        return self.year*12 + self.month > ym.year*12 + ym.month
    
    def is_smaller_than(slef, ym):
        return self.year*12 + self.month < ym.year*12 + ym.month
    
    def is_equal(self, ym):
        return self.year*12 + self.month == ym.year*12 + ym.month

    
def list_date_tuples(from_date, to_date):
    ret = []
    tmp_date = DateYM(*from_date.export_tuple())
    while not tmp_date.is_larger_than(to_date):
        ret.append(tmp_date.export_tuple())
        tmp_date.add_month(1)
    return ret


def fetch_data_by_month(date_ym, trans_offset=6):
    year, month = date_ym
    
    # Get view: WireTrans
    from_date = pd.to_datetime("{}/{}/{}".format(month, 1, year))
    to_date = from_date + pd.DateOffset(months=1)
    offset_date = from_date - pd.DateOffset(months=trans_offset)
    view_wiretrans = WireTrans[(WireTrans.trans_date > offset_date) & 
                               (WireTrans.trans_date < to_date)]
    
    # Get view: CustInfo
    view_customer = CustInfo[CustInfo.open_date < to_date]
    
    # Attach label onto CustInfo
    target_list = SARCase[(SARCase.Status_SAR == 4) & 
                          (SARCase.created_date > from_date) & 
                          (SARCase.created_date < to_date)]['customerno'].unique()
    # print ('# of SAR customers: {}'.format(len(target_list)))
    view_customer['label'] = view_customer.apply(lambda x: 1 if x['customerno'] in target_list else 0, axis=1)
    
    return view_wiretrans, view_customer

In [16]:
def get_edge_embed(date_ym, groupby_method='sum'):
    year, month = date_ym
    df = pd.read_csv('Edge_Attribute/VAE_Edge_attribute_{}-{}.csv'.format(year, month))
    df.columns = ['index', 'from_vertex', 'to_vertex', 'dim_1', 'dim_2', 'dim_3']
    df['vertex_index'] = df.apply(lambda x: str(set([x['from_vertex'], x['to_vertex']])), axis=1)
    if groupby_method == 'sum':
        df_new = df.groupby('vertex_index').sum().reset_index().reset_index()
    else:
        df_new = df.groupby('vertex_index').mean().reset_index().reset_index()
    df_new['from_vertex'] = df_new.apply(lambda x: x['vertex_index'][1:-1].split(',')[0][1:-1], axis=1)
    df_new['to_vertex'] = df_new.apply(lambda x: x['vertex_index'][1:-1].split(',')[1][2:-1], axis=1)
    return df_new

In [18]:
def eigen_maps(G, min_num_nodes=5, weight_attr='norm_wire_amt'):
    """ Split graph G into connected subgraph, and then
        compute SA w.r.t. each subgraph respectively.
    """
    # Get all the connected components and their graph Laplacians
    print('  |-- Get all the connected components ...')
    subVs = list(nx.connected_components(G))
    subGs = [G.subgraph(subV) for subV in subVs]
    subLs = [nx.normalized_laplacian_matrix(subG, weight=weight_attr) for subG in subGs]

    # Initialize the vertex embedding with one-hot indexing of components
    print('  |-- Initialize the vertex embedding ...')
    embedding = np.zeros([G.number_of_nodes(), len(subVs)])
    for index in range(len(subVs)):
        embedding[list(subVs[index]), index] = 1

    # Do spectral analysis respectively
    print('  |-- Graph Processing ...')
    for index in range(len(subGs)):
        subV2V = list(subGs[index].nodes())  # ordered index in terms of V of subV
        if len(subV2V) < min_num_nodes:
            continue
        subL = subLs[index]
        
        # Compute eigenmaps
        is_complete_eigen = False
        repreat_time = 0
        while (not is_complete_eigen) and (repreat_time < 10):
            try:
                repreat_time += 1
                eigval_subL, eigvec_subL = scipy.sparse.linalg.eigs(subL, k=min(6, len(subV2V)-2), which="SR")
                is_complete_eigen = True
            except:
                print("Re-run eigenmaps of subG[{}] with {} vertices {} edges ...".
                      format(index, subL.shape[0], len(subL.nonzero()[0])))
        if is_complete_eigen:
            eigval_subL = np.real(eigval_subL)
            sorted_index = np.argsort(eigval_subL)
            eigval_subL = eigval_subL[sorted_index]
            eigvec_subL = np.real(eigvec_subL)[:,sorted_index]
        else:
            continue

        # Discard not important (i.e., zero and large) eigvectors
        tmp_vec = eigvec_subL[:, (eigval_subL > 0) * (eigval_subL < 0.6)]
        if tmp_vec.shape[1] == 0:
            continue
        # Conpute the vertex embeeding w.r.t. to vertices in $new_subV
        for i in list(range(tmp_vec.shape[1]-1, -1, -1)):
            if np.sum(tmp_vec[:,i]>0)>0 and np.sum(tmp_vec[:,i]<0)>0:
                tmp_vec = np.insert(tmp_vec, i, np.sign(tmp_vec[:,i]), axis=1)
                tmp_vec[:, i+1] = np.absolute(tmp_vec[:, i+1])

        tmp_embedding = np.zeros([G.number_of_nodes(), tmp_vec.shape[1]])
        tmp_embedding[subV2V, :] = tmp_vec
        embedding = np.append(embedding, tmp_embedding, axis=1)
        
    return embedding

In [28]:
from_date_ym = DateYM(2018, 1)
to_date_ym = DateYM(2019, 6)

list_date_seq = list_date_tuples(from_date_ym, to_date_ym)
projectors_shape = {}
for i, date_ym in zip(range(len(list_date_seq)), list_date_seq):
    print("Processing the {} graph in {}".format(weight_type, date_ym))

    view_embeds = get_edge_embed(date_ym)

    # Construct graph
    G_t = nx.from_pandas_edgelist(view_embeds, 'from_vertex', 'to_vertex', edge_attr='dim_{}'.format(embed_dim_select))
    dict_sub_nodes_key2int = dict(zip(list(G_t.nodes()), range(G_t.number_of_nodes())))
    dict_sub_nodes_int2key = dict(zip(range(G_t.number_of_nodes()), list(G_t.nodes())))

    G_t = nx.relabel_nodes(G_t, dict_sub_nodes_key2int)
    G_embedding = eigen_maps(G_t, min_num_nodes=min_num_nodes)

    # Write eigenmap into a file
    eigen_path = "eigen/offset_0/aggregate/{}_eigenmap_{}-{}".format(
        weight_type, date_ym[0], date_ym[1])
    np.savez_compressed(eigen_path, G_embedding)

    # Write dict int2key / key2int into files
    key2int_path = "offset_0/aggregate/{}_nodes_key2int_{}-{}.pickle".format(
        weight_type, date_ym[0], date_ym[1])
    with open(key2int_path, 'wb') as f:
        pickle.dump(dict_sub_nodes_key2int, f)
    int2key_path = "offset_0/aggregate/{}_nodes_int2key_{}-{}.pickle".format(
        weight_type, date_ym[0], date_ym[1])
    with open(int2key_path, 'wb') as f:
        pickle.dump(dict_sub_nodes_int2key, f)

    projectors_shape[date_ym] = (G_embedding.shape[1], latent_dim)
    print("SubGraph {}: has {} vertices, {} edges; Embedding shape: {}".
          format(date_ym, G_t.number_of_nodes(), G_t.number_of_edges(), G_embedding.shape))

Processing the embed_dim1 graph in (2018, 1)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2018, 1): has 66690 vertices, 76156 edges; Embedding shape: (66690, 6439)
Processing the embed_dim1 graph in (2018, 2)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2018, 2): has 66436 vertices, 75607 edges; Embedding shape: (66436, 6390)
Processing the embed_dim1 graph in (2018, 3)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2018, 3): has 66851 vertices, 76831 edges; Embedding shape: (66851, 6380)
Processing the embed_dim1 graph in (2018, 4)
  |-- Get all the connected components ...
  |-- Initialize the vertex embedding ...
  |-- Graph Processing ...
SubGraph (2018, 4): has 67995 vertices, 78921 edges; Embedding shape: (67995, 6317)
Processing the embed_dim1 graph in (