In [56]:
import pandas as pd
from pandas import datetime
from matplotlib import pyplot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np
from pandas.tools.plotting import autocorrelation_plot
import xgboost as xgb
import six
from six.moves import cPickle as cpik
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [58]:
def mape_loss(y_true, y_pred):
    diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true),
                                            K.epsilon(),
                                            None))
    return 100. * K.mean(diff, axis=-1)

def mape(y_p, y):
    y[y == 0] = 0.01
    diff = np.abs((y_p - y) / y)
    #print(diff)
    return 100. * np.mean(diff)

# RMSE loss for calculate test result.
def rmse(y_p, y):
    diff = np.sqrt((y_p - y)**2)# np.sum()
    return np.mean(diff)  


def build_dl_dataset_by_ts(df, ts):
    '''
    Build up a (data_size * 11 * 7 * 7 * 2) dataset for both traning and testing
    
    Input
    df: a dateframe
    ts: time_steps, use how many previous time frame to predict the next one, e.g., if it is 11, then use the
        previous 11 date to predict current 1.
    
    Output
    Training set and testing set
    
    '''
    fids = df['FishnetID'].unique().tolist()
    #print(fids)
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    
    for fid in fids:
        df_current = df[df['FishnetID'] == fid].reset_index(drop = True) #original dataset has been ordered by time.
        df_current_len = df_current.shape[0] # overall length of current data 
        if (df_current_len - ts - 1)>0:
            for i in range(df_current_len - ts - 1):
                # current slice of input data
                X_train_cur = []
                Y_train_cur = []
                # append every X by time steps
                for x_v in df_current['neighbours_index_image'].iloc[i : (i+ts)].values:
                    X_train_cur.append(x_v)
                    #print(np.array(X_cur).shape)    
                X_train.append(np.array(X_train_cur))  
                # append very Y by time steps
                for y_v in df_current['PerformanceTarget'].iloc[i+1 : (i+ts+1)].values:
                    Y_train_cur.append(y_v)
                Y_train.append(np.array(Y_train_cur))
            
            for i in range(df_current_len - ts - 1, df_current_len - ts):
                # current slice of input data
                X_test_cur = []
                Y_test_cur = []
                # append every X by time steps
                for x_v in df_current['neighbours_index_image'].iloc[i : (i+ts)].values:
                    X_test_cur.append(x_v)
                    #print(np.array(X_cur).shape)
                X_test.append(np.array(X_test_cur))  
                # append very Y by time steps
                for y_v in df_current['PerformanceTarget'].iloc[i+1 : ].values:
                    Y_test_cur.append(y_v)
                Y_test.append(np.array(Y_test_cur))
        else:
            for i in range(df_current_len - ts):
                # current slice of input data
                X_cur = []
                Y_cur = []
                # append every X by time steps
                for x_v in df_current['neighbours_index_image'].iloc[i : (i+ts)].values:
                    X_cur.append(x_v)
                    #print(np.array(X_cur).shape)    
                X_train.append(np.array(X_cur))
                X_test.append(np.array(X_cur))  
                # append very Y by time steps
                for y_v in df_current['PerformanceTarget'].iloc[i+1 : (i+ts+1)].values:
                    Y_cur.append(y_v)
                Y_train.append(np.array(Y_cur))
                Y_test.append(np.array(Y_cur))      
    return (X_train, Y_train, X_test, Y_test)

## Hospitality

In [8]:
hos_df = pd.read_csv('3_hospitality_13months.csv').iloc[:, 1:8]

neighbours_index_image_arys = cpik.load(open("3_hospitality_13months.pkl", "rb" ))
hos_df['neighbours_index_image'] = neighbours_index_image_arys.tolist()
hos_df['neighbours_index_image'] = hos_df['neighbours_index_image'].apply(lambda x : np.array(x))
print(hos_df['neighbours_index_image'].shape, hos_df['neighbours_index_image'].iloc[0].shape)
ts = 3

(4459,) (7, 7, 2)


In [15]:
# Build up dateset
X_train, Y_train, X_test, Y_test= build_dl_dataset_by_ts(hos_df, ts)
X_train = np.array(X_train)
X_train= np.nan_to_num(X_train)

Y_train = np.array(Y_train)
Y_train = Y_train.reshape(Y_train.shape[0],Y_train.shape[1],1)
Y_train= np.nan_to_num(Y_train)


X_test = np.array(X_test)
X_test= np.nan_to_num(X_test)

Y_test = np.array(Y_test)
Y_test = Y_test.reshape(Y_test.shape[0],Y_test.shape[1],1)
Y_test= np.nan_to_num(Y_test)

In [16]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(3087, 3, 7, 7, 2) (3087, 3, 1) (343, 3, 7, 7, 2) (343, 3, 1)


In [46]:
# flatten features
X_train_flat = X_train.reshape(3087, 3*7*7*2)
Y_train_flat = Y_train[:,-1,:]

X_test_flat = X_test.reshape(343, 3*7*7*2)
Y_test_flat = Y_test[:,-1,:]

xgdmat=xgb.DMatrix(X_train_flat,Y_train_flat)
our_params={'eta':0.6,'seed':0,'subsample':0.6,'colsample_bytree':0.7,'objective':'reg:linear','max_depth':5,'min_child_weight':0.9}
final_gb=xgb.train(our_params,xgdmat)
tesdmat=xgb.DMatrix(X_test_flat)
y_pred_test =np.array(final_gb.predict(tesdmat))
print(mape(y_pred_test, Y_test))
print(rmse(y_pred_test, Y_test))

y_pred_train =np.array(final_gb.predict(xgdmat))
print(mape(y_pred_train, Y_test))
print(rmse(y_pred_train, Y_test))

16.835024493704356
0.11322696094999105
16.592409854553104
0.10986252030081933


In [59]:
print(mape(y_pred_test, Y_test))
print(rmse(y_pred_test, Y_test))

27.105875092448713
0.15801587674588027


## Retail

In [47]:
ret_df = pd.read_csv('4_retail_13months.csv').iloc[:, 1:8]
neighbours_index_image_arys = cpik.load(open("4_retail_13months.pkl", "rb" ))
ret_df['neighbours_index_image'] = neighbours_index_image_arys.tolist()
ret_df['neighbours_index_image'] = ret_df['neighbours_index_image'].apply(lambda x : np.array(x))
ret_df['neighbours_index_image'].shape, ret_df['neighbours_index_image'].iloc[0].shape

((4212,), (7, 7, 2))

In [48]:
# Build up dateset
X_train, Y_train, X_test, Y_test= build_dl_dataset_by_ts(ret_df, ts)
X_train = np.array(X_train)
X_train= np.nan_to_num(X_train)

Y_train = np.array(Y_train)
Y_train = Y_train.reshape(Y_train.shape[0],Y_train.shape[1],1)
Y_train= np.nan_to_num(Y_train)


X_test = np.array(X_test)
X_test= np.nan_to_num(X_test)

Y_test = np.array(Y_test)
Y_test = Y_test.reshape(Y_test.shape[0],Y_test.shape[1],1)
Y_test= np.nan_to_num(Y_test)

In [49]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(2916, 3, 7, 7, 2) (2916, 3, 1) (324, 3, 7, 7, 2) (324, 3, 1)


In [50]:
# flatten features
X_train_flat = X_train.reshape(2916, 3*7*7*2)
Y_train_flat = Y_train[:,-1,:]

X_test_flat = X_test.reshape(324, 3*7*7*2)
Y_test_flat = Y_test[:,-1,:]

xgdmat=xgb.DMatrix(X_train_flat,Y_train_flat)
our_params={'eta':0.6,'seed':0,'subsample':0.6,'colsample_bytree':0.7,'objective':'reg:linear','max_depth':5,'min_child_weight':0.9}
final_gb=xgb.train(our_params,xgdmat)
tesdmat=xgb.DMatrix(X_test_flat)
y_pred_test =np.array(final_gb.predict(tesdmat))
print(mape(y_pred_test, Y_test))
print(rmse(y_pred_test, Y_test))

y_pred_train =np.array(final_gb.predict(xgdmat))
print(mape(y_pred_train, Y_test))
print(rmse(y_pred_train, Y_test))

16.59324537420084
0.10822414313106436
15.84629996460275
0.099890106135536


## Building

In [51]:
bud_df = pd.read_csv('1_building_13months.csv').iloc[:, 1:8]
neighbours_index_image_arys = cpik.load(open("1_building_13months.pkl", "rb" ))
bud_df['neighbours_index_image'] = neighbours_index_image_arys.tolist()
bud_df['neighbours_index_image'] = bud_df['neighbours_index_image'].apply(lambda x : np.array(x))
bud_df['neighbours_index_image'].shape, bud_df['neighbours_index_image'].iloc[0].shape

((4498,), (7, 7, 2))

In [52]:
# Build up dateset
X_train, Y_train, X_test, Y_test= build_dl_dataset_by_ts(bud_df, ts)
X_train = np.array(X_train)
X_train= np.nan_to_num(X_train)

Y_train = np.array(Y_train)
Y_train = Y_train.reshape(Y_train.shape[0],Y_train.shape[1],1)
Y_train= np.nan_to_num(Y_train)


X_test = np.array(X_test)
X_test= np.nan_to_num(X_test)

Y_test = np.array(Y_test)
Y_test = Y_test.reshape(Y_test.shape[0],Y_test.shape[1],1)
Y_test= np.nan_to_num(Y_test)

In [53]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(3114, 3, 7, 7, 2) (3114, 3, 1) (346, 3, 7, 7, 2) (346, 3, 1)


In [54]:
# flatten features
X_train_flat = X_train.reshape(3114, 3*7*7*2)
Y_train_flat = Y_train[:,-1,:]

X_test_flat = X_test.reshape(346, 3*7*7*2)
Y_test_flat = Y_test[:,-1,:]

xgdmat=xgb.DMatrix(X_train_flat,Y_train_flat)
our_params={'eta':0.6,'seed':0,'subsample':0.6,'colsample_bytree':0.7,'objective':'reg:linear','max_depth':5,'min_child_weight':0.9}
final_gb=xgb.train(our_params,xgdmat)
tesdmat=xgb.DMatrix(X_test_flat)
y_pred_test =np.array(final_gb.predict(tesdmat))
print(mape(y_pred_test, Y_test))
print(rmse(y_pred_test, Y_test))

y_pred_train =np.array(final_gb.predict(xgdmat))
print(mape(y_pred_train, Y_test))
print(rmse(y_pred_train, Y_test))

27.105875092448713
0.15801587674588027
26.611197029672468
0.1513631337946317
