In [7]:
import math
import pandas as pd
import numpy as np
import datetime 
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler




# Load the training set and split the 'demand' target off from the features.
df_train = pd.read_csv('train.csv')
df_train_target = df_train.pop('demand')

# Parse the day-first timestamps once, then replace the raw column with
# numeric calendar features appended in the order hour, minute, day, month.
train_stamps = pd.to_datetime(df_train.pop('timestamp'), dayfirst=True)
df_train = df_train.assign(
    hour=train_stamps.dt.hour,
    minute=train_stamps.dt.minute,
    day=train_stamps.dt.day,
    month=train_stamps.dt.month,
)


# Exploration: print the Pearson correlation of each feature with the target.
# A feature with no variance yields nan (pearsonr is undefined for constant input).
# The column at position 0 is skipped, exactly as an index loop starting at 1 would.
# NOTE(review): that first column is never dropped later, so it is still fed to
# the model — confirm that leaving it out of this exploration is intentional.
for feature_name in df_train.columns[1:]:
    r_value, _p_value = pearsonr(df_train[feature_name], df_train_target)
    print(feature_name, r_value)
    
# Discard the features judged not useful after the correlation exploration.
df_train = df_train.drop(
    columns=['minute', 'tourist_attractions', 'secondary_connectivity'])

# Standardize the training features. The fitted scaler is kept in `scal` so the
# identical transformation can be applied to the test features later.
scal = StandardScaler()
df_train = scal.fit_transform(df_train)  # df_train is a NumPy array from here on

# Fit a Gaussian process regressor. No kernel is passed, so scikit-learn's
# default applies (documented as a constant kernel times an RBF kernel with
# fixed hyperparameters). random_state=0 keeps the run reproducible.
gpr = GaussianProcessRegressor(random_state=0)
gpr.fit(df_train, df_train_target)




# Load the test set and derive the calendar features from the day-first
# timestamps, replacing the raw timestamp column.
df_test = pd.read_csv('test.csv')
test_stamps = pd.to_datetime(df_test.pop('timestamp'), dayfirst=True)
df_test = df_test.assign(
    hour=test_stamps.dt.hour,
    day=test_stamps.dt.day,
    month=test_stamps.dt.month,
)

# Drop the features rejected during the training-data exploration. 'minute' is
# never created for the test set, so only these two columns need removing.
df_test = df_test.drop(
    columns=['tourist_attractions', 'secondary_connectivity'])

# Apply the scaler fitted on the training data, then predict with the trained
# model. With return_std=True the result is a (mean, std) pair.
df_test = scal.transform(df_test)
test_result = gpr.predict(df_test, return_std=True)



# Reload the raw test file so the predictions can be attached to the original
# columns (df_test was overwritten by the scaled feature array before prediction).
df_test = pd.read_csv('test.csv')

# Demand cannot be negative, so the absolute value of the predicted mean is reported.
# NOTE(review): abs() mirrors a negative prediction (-3 becomes 3) rather than
# clamping it to 0 — confirm this is the intended way to enforce non-negativity.
predicted_mean = test_result[0]
df_test['demand'] = pd.Series(np.abs(predicted_mean))

# NOTE(review): to_csv keeps its default index=True, so the row index is
# written as an extra unnamed first column — confirm consumers expect that.
df_test.to_csv('test_with_result.csv')


travelling_proportion 0.05276225712084417
tempC 0.06302925040556868
precipMM -0.028501914864441133
station_count 0.022658104563541587
count_commercial 0.07006258552857295
count_retail -0.05163095231306766
cover_commercial 0.027629141966347377
cover_retail -0.02597294975653672
distance_to_recreation_ground -0.04167588480614267
distance_to_park 0.015645291825619738
distance_to_retail 0.053876787208506194
distance_to_commercial -0.07695588046834914
cycleway_connectivity -0.0335832683538276
tertiary_connectivity -0.04848694734117437
secondary_connectivity nan
tourist_attractions nan
node_id -0.04344685301296988
hour 0.1703446562796425
minute nan
day 0.021953939404900535
month 0.017539331934698497


