In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import dgl
from dgl.data.utils import load_graphs
from sklearn.cluster import KMeans
from sklearn.model_selection import  KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn import linear_model

Using backend: pytorch


In [8]:
def load_dataset(k=75):
    '''
    Collect every input used by the experiments into one dictionary.

    The returned dict holds:
      1) 'meta_paths'  - the meta-paths used on the urban-HIN,
      2) 'feats'       - randomly initialised, row-standardised node features,
      3) one concatenated table per downstream task ('income', 'education',
         'poverty', 'unemployed', 'white', 'black', 'hispanic', 'value'),
      4) per-year 'src_matrix<year>' / 'dst_matrix<year>' regional attribute
         matrices and 'heterograph_unified<year>' graphs for 2018-2021,
         plus 'heterograph_unified_3' for the multi-year graph.

    k is forwarded to the graph loaders and selects which "_k<k>" graph
    files are read.
    '''
    # presumably 77 zones x 4 years (2018-2021) -- TODO confirm
    n_rows = 77 * 4

    data = {}

    # 1) Meta-paths of the urban-HIN
    data['meta_paths'] = [['zone-zone'], ['src-time', 'time-src'], ['dst-time', 'time-dst']]

    # 2) Node features: uniform noise in [-1, 1), standardised per row.
    #    standardize_features adds a leading axis, which the view() drops again.
    noise = np.random.uniform(-1, 1, size=(n_rows, 250))
    data['feats'] = torch.tensor(standardize_features(noise)).float().view(n_rows, -1)

    # 3) One list of yearly tables per downstream task; the key order here is
    #    also the order in which the tasks are inserted into `data` below.
    yearly = {
        task: []
        for task in ('income', 'education', 'poverty', 'unemployed',
                     'white', 'black', 'hispanic', 'value')
    }

    for year in range(2018, 2022):
        data[f'heterograph_unified{year}'] = load_heteograph_unified(k, year)

        # 4) Regional attribute distribution for this year
        src_matrix = pd.read_csv(f'./data/ride-hailing/src_matrix_{year}.csv').set_index('src').to_numpy()
        dst_matrix = pd.read_csv(f'./data/ride-hailing/dst_matrix_{year}.csv').set_index('dst').to_numpy()
        data[f'src_matrix{year}'] = src_matrix
        data[f'dst_matrix{year}'] = dst_matrix

        # Downstream tables, each sorted by zone so rows line up across tasks
        frames = {
            'income': pd.read_pickle(f'./data/Downstream/income_CA{year}.pkl').sort_values(['zone']),
            'education': pd.read_pickle(f'./data/Downstream/education_CA{year}.pkl').sort_values(['zone'])[['zone', 'no_edu']],
            'poverty': pd.read_pickle(f'./data/Downstream/poverty_CA{year}.pkl').sort_values(['zone']),
            'unemployed': pd.read_pickle(f'./data/Downstream/unemployed_CA{year}.pkl').sort_values(['zone']),
        }
        race = pd.read_csv(f'./data/Downstream/race{year}.csv').sort_values(['zone'])
        value = pd.read_csv(f'./data/Downstream/mean_value{year}.csv').sort_values(['zone'])
        frames['white'] = race[['zone', 'white']]
        frames['black'] = race[['zone', 'black']]
        frames['hispanic'] = race[['zone', 'hispanic']]
        frames['value'] = value

        for task, frame in frames.items():
            yearly[task].append(frame)

    # Stack the yearly tables of each task and renumber rows from 0
    stacked = {
        task: pd.concat(parts, axis=0).reset_index(drop=True)
        for task, parts in yearly.items()
    }

    data['heterograph_unified_3'] = load_heteograph_unified_total(k)
    data.update(stacked)

    return data

def standardize_features(feature):
    '''
    Standardise each row of a 2-D array to zero mean and unit variance.

    Mean and (population) variance are taken along axis 1. Rows whose
    variance is zero are mapped to all zeros instead of inf/nan.

    Returns the standardised array with a new leading axis, i.e. an input
    of shape (n, d) yields an output of shape (1, n, d).
    '''
    mean = np.mean(feature, axis=1, keepdims=True)
    var = np.var(feature, axis=1, keepdims=True)
    # 0 ** -0.5 evaluates to inf and would emit a divide-by-zero
    # RuntimeWarning; that case is expected and handled right below, so the
    # warning is silenced only for this computation.
    with np.errstate(divide='ignore'):
        std_inv = np.power(var, -0.5)
    std_inv[np.isinf(std_inv)] = 0.
    standardized = np.multiply(feature - mean, std_inv)
    return standardized[np.newaxis]

def load_heteograph_unified(k, year):
    '''
    Load the urban-HIN stored for a single year.

    Reads "heterograph<year>_k<k>.bin" from ./data/ride-hailing/ and returns
    the result of dgl's load_graphs unchanged (per the DGL docs this is a
    (graph_list, labels) pair -- confirm against the installed version).
    '''
    graph_file = './data/ride-hailing/' + f"heterograph{year}_k{k}.bin"
    return load_graphs(graph_file)

def load_heteograph_unified_total(k):
    '''
    Load the l-urban-HIN (the graph file covering the years 2018-21).

    Reads "heterograph1821_years_k<k>.bin" from ./data/ride-hailing/ and
    returns the result of dgl's load_graphs unchanged.
    '''
    graph_file = './data/ride-hailing/' + "heterograph1821_years_k{}.bin".format(k)
    return load_graphs(graph_file)

def evaluation_metrics(y_pred, y_test):
    '''
    Score predictions against ground truth.

    Negative predictions are clipped to 0 before scoring. The clipping is
    done on a copy, so the caller's y_pred is left untouched (and plain
    Python sequences are accepted as well as arrays).

    Returns a tuple (mae, rmse, r2).
    '''
    # np.maximum builds a new array instead of writing into y_pred
    clipped = np.maximum(np.asarray(y_pred), 0)
    mae = mean_absolute_error(y_test, clipped)
    mse = mean_squared_error(y_test, clipped)
    r2 = r2_score(y_test, clipped)
    return mae, np.sqrt(mse), r2

def regression(X_train, Y_train, X_test):
    '''
    Fit a ridge regression (alpha=1) on the training split and return its
    predictions for X_test.
    '''
    model = linear_model.Ridge(alpha=1)
    # fit() returns the fitted estimator, so the calls can be chained
    return model.fit(X_train, Y_train).predict(X_test)

def kf_regression(X, Y):
    '''
    Run 5-fold cross-validated regression.

    For every fold the model is trained on the training indices and asked to
    predict the held-out indices. Returns two arrays: the predictions and the
    matching ground-truth values, each concatenated in fold order.
    '''
    folds = KFold(n_splits=5)
    per_fold = [
        (regression(X[train_idx], Y[train_idx], X[test_idx]), Y[test_idx])
        for train_idx, test_idx in folds.split(X)
    ]
    # Separate the (prediction, truth) pairs into two parallel sequences
    predictions, truths = zip(*per_fold)
    return np.concatenate(predictions), np.concatenate(truths)

def predict_regression(emb, target):
    '''
    Evaluate how well the embeddings predict one downstream target.

    Rows of emb are selected with target's index values, and the last column
    of target.values is used as the regression label. The pair is passed
    through 5-fold cross-validated regression and scored.

    Returns (mae, rmse, r2).
    '''
    row_ids = target.index.values
    labels = target.values[:, -1]
    predictions, truths = kf_regression(emb[row_ids], labels)
    return evaluation_metrics(predictions, truths)