In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [None]:
import math
import matplotlib.pyplot as plt
from scipy import stats
from functools import reduce

In [None]:
# Paths to the POI / bus-stop OD matrices extracted from GIS (one CSV per
# distance buffer, listed in ascending order) and to the trip table.
fp_dest100 = "OD_allPOI_100.csv"
fp_dest200 = "OD_allPOI_200.csv"
fp_dest300 = "OD_allPOI_300.csv"
fp_dest400 = "OD_allPOI_400.csv"
fp_dest500 = "OD_allPOI_500.csv"
fp_dest600 = "OD_allPOI_600.csv"
fp_trips = "trips.csv"

In [None]:
# Read every OD matrix (ascending buffer order) and the trip table.
df_DestBus100 = pd.read_csv(fp_dest100)
df_DestBus200 = pd.read_csv(fp_dest200)
df_DestBus300 = pd.read_csv(fp_dest300)
df_DestBus400 = pd.read_csv(fp_dest400)
df_DestBus500 = pd.read_csv(fp_dest500)
df_DestBus600 = pd.read_csv(fp_dest600)
df_trips = pd.read_csv(fp_trips)

In [None]:
# Store the bus-stop id as a string in every table so the later merges on
# 'dest_busid' compare keys of the same dtype.
for _frame in (df_DestBus100, df_DestBus200, df_DestBus300,
               df_DestBus400, df_DestBus500, df_DestBus600, df_trips):
    _frame['dest_busid'] = _frame['dest_busid'].astype(str)

In [None]:
# Weight applied to each POI category when scoring access.
# For an unweighted measurement set every weight to 1.
poi_weight = {
    'child_care': 0.07,
    'daily_needs': 0.27,
    'culture': 0.07,
    'sports': 0.28,
    'health_care': 0.17,
    'social': 0.14,
}


In [None]:
# Inspect the trip table: column names, dtypes and non-null counts.
df_trips.info()

In [None]:
# Decay coefficient b of f(x) = K * e^(-b * x), with K = 1 so that f(0) = K.
# b is chosen so that f(600) = 0.01, i.e. 600 is used as the maximum distance.
beta = round(-np.log(0.01) / 600, 4)

In [None]:
def decay_distance (beta, distance):
    '''
    Apply the exponential decay function exp(-beta * distance).

    Input: a beta value and a dataframe series containing distance values
           (a numpy array or a plain number is accepted as well)
    Output: the decayed value for every distance, with the same index and name
            as the input series; missing distances stay missing
    '''

    # Vectorised: evaluates the whole series at once instead of calling
    # math.exp element by element through a Python lambda.
    return np.exp(-beta * distance)

In [None]:
# Scratch copy of the 100 OD table; in the visible part of the notebook it is
# referenced only by the commented-out decay / weight experiments.
test_decay = df_DestBus100.copy()
#test_decay['decay'] = decay_distance(0.077, test_decay['raw_length'])

In [None]:
#test_decay['poi_weights'] = test_decay['poi_group'].map(poi_weight)
#test_decay.head(20)

In [None]:
#test_decay['poi_access_score'] = test_decay.decay * test_decay.poi_weights


In [None]:
def dest_summary_table (beta, df_DestBus, df_trips, out_name, weights=None):

    '''
    Generate the accessibility score table for one distance buffer by first
    calculating the access scores for the bus stops and their POIs and then
    merging them onto the trips data.

    Every row of df_DestBus is scored as
        exp(-beta * distance) * weight of its POI category
    and the scores are aggregated per bus stop.

    Inputs
    1. beta: decay coefficient handed to decay_distance.
    2. df_DestBus: dataframe of bus stops (dest_busid) and their distance
       (Total_Leng, or raw_length if already renamed) to POIs (poi), together
       with the POI category (poi_group). This dataframe is NOT modified.
    3. df_trips: trip table dataframe with a dest_busid column; all of its rows
       and columns are kept in the result.
    4. out_name: label appended to the generated column names to identify the
       distance buffer, e.g. '100' gives poi_100, avg_distance_100 and
       access_score_100.
    5. weights (optional): mapping from poi_group value to weight. When omitted
       the notebook-level poi_weight dictionary is used.

    Returns a new dataframe holding df_trips plus, per bus stop, the number of
    POIs, the average raw distance and the summed access score. Bus stops that
    do not occur in df_DestBus get NaN in those three columns.
    '''

    if weights is None:
        weights = poi_weight

    #rename to show that the length is the raw length from GIS in meters
    # (rename returns a new frame, so the caller's dataframe is left untouched)
    od = df_DestBus.rename(columns={'Total_Leng':'raw_length'})

    #map POI weights to the POI categories; categories missing from the
    #mapping become NaN and therefore add nothing to the summed score
    od['poi_weights'] = od['poi_group'].map(weights)

    #apply distance decay for an accessibility score
    od['length'] = decay_distance(beta, od['raw_length'])

    #calculating POI access_score
    od['poi_access'] = od['length'] * od['poi_weights']

    #per bus stop: number of POIs, average raw length and summed access score
    per_stop = (od.groupby('dest_busid')
                  .agg(poi=('poi', 'count'),
                       avg_distance=('raw_length', 'mean'),
                       access_score=('poi_access', 'sum'))
                  .reset_index())

    #left join keeps every row of the trip table, also stops without any POI
    dest_summary = pd.merge(df_trips, per_stop, on='dest_busid', how='left')
    dest_summary['dest_busid'] = dest_summary['dest_busid'].astype(str)

    suffix = '_' + str(out_name)
    return dest_summary.rename(columns={'poi': 'poi' + suffix,
                                        'avg_distance': 'avg_distance' + suffix,
                                        'access_score': 'access_score' + suffix})

In [None]:
# One summary table per distance buffer, built in ascending buffer order.
(dest100_summary, dest200_summary, dest300_summary,
 dest400_summary, dest500_summary, dest600_summary) = (
    dest_summary_table(beta, od_table, df_trips, label)
    for od_table, label in (
        (df_DestBus100, '100'),
        (df_DestBus200, '200'),
        (df_DestBus300, '300'),
        (df_DestBus400, '400'),
        (df_DestBus500, '500'),
        (df_DestBus600, '600'),
    )
)

In [None]:
def csv_save (name, file):
    '''
    Write a dataframe to a CSV file without the index column.

    name: path of the CSV file to create
    file: the dataframe to write
    '''
    file.to_csv(name, index=False)

In [None]:
#output file for regression analysis
#csv_save('dest100_summary_single.csv',dest100_summary)
#csv_save('dest200_summary_single.csv',dest200_summary)
#csv_save('dest300_summary_single.csv',dest300_summary)
#csv_save('dest400_summary_single.csv',dest400_summary)
#csv_save('dest500_summary_single.csv',dest500_summary)
#csv_save('dest600_summary_single.csv',dest600_summary)

In [None]:
#merge all the distance buffer access dataframes for better descriptive analysis
dfs = [dest100_summary,dest200_summary,dest300_summary,dest400_summary,dest500_summary,dest600_summary]
dest_summary = reduce(lambda left,right: pd.merge(left,right,on='dest_busid', how='left'), dfs)

In [None]:
dest_summary.columns

In [None]:
dest_summary.columns = ['dest_busid', 'TripsPerdest', 'TripsPerOrig', 'all_poi_100',
       'avg_distance_100', 'access_score_100', 'TripsPerdest_y',
       'TripsPerOrig_y', 'all_poi_200', 'avg_distance_200', 'access_score_200',
       'TripsPerdest_x', 'TripsPerOrig_x', 'all_poi_300', 'avg_distance_300',
       'access_score_300', 'TripsPerdest_y1', 'TripsPerOrig_y2', 'all_poi_400',
       'avg_distance_400', 'access_score_400', 'TripsPerdest_x2',
       'TripsPerOrig_x1', 'all_poi_500', 'avg_distance_500', 'access_score_500',
       'TripsPerdest_y3', 'TripsPerOrig_y4', 'all_poi_600', 'avg_distance_600',
       'access_score_600']

In [None]:
dest_summary = dest_summary[['dest_busid', 'TripsPerdest', 'all_poi_100','all_poi_200','all_poi_300','all_poi_400','all_poi_500','all_poi_600',
                            'access_score_100', 'access_score_200', 'access_score_300',  'access_score_400', 'access_score_500', 'access_score_600']].copy()

In [None]:
# Summary statistics of the trip counts, POI counts and access scores per buffer.
dest_summary.describe()

In [None]:
#test = dest_summary[['all_poi_100','all_poi_200','all_poi_300','all_poi_400','all_poi_500','all_poi_600']].copy()
#test.dropna(how='all')

In [None]:
#remove bus stops that don't have any access score or any POI in all 6 buffers
# (missing values count as zero; a stop is kept when at least one POI-count or
#  access-score column is non-zero)
test2 = dest_summary.fillna(0)
mask = test2.drop(columns=['dest_busid', 'TripsPerdest']).ne(0).any(axis=1)
dest_summary = dest_summary.loc[mask]

In [None]:
#extract bus stops with no poi
#no_poi = test2[test2['all_poi_600']==0]
#no_poi.to_csv('busStops_no_POIs_weight.csv', index=False)

In [None]:
# Check the row count, dtypes and non-null counts left after the filtering above.
dest_summary.info()

In [None]:
# Distribution of the trip counts per destination bus stop.
dest_summary['TripsPerdest'].describe()

In [None]:
# Variance of the trip counts (compare with the mean reported by describe() above).
dest_summary['TripsPerdest'].var()

In [None]:
#plot histogram of the exit trips
fig, ax = plt.subplots(figsize=(8, 5))
dest_summary['TripsPerdest'].plot.hist(bins=200, ax=ax)
ax.set_xlabel('Bus stops trip count values')
ax.set_ylabel('Frequency of count value')

In [None]:
# Bus stop id, trip count and the cumulative POI counts of every buffer.
df_allpoi = dest_summary.loc[:, [
    'dest_busid', 'TripsPerdest',
    'all_poi_100', 'all_poi_200', 'all_poi_300',
    'all_poi_400', 'all_poi_500', 'all_poi_600',
]].copy()

In [None]:
# Bus stop id, trip count and the cumulative access score of every buffer.
df_access_scores = dest_summary.loc[:, [
    'dest_busid', 'TripsPerdest',
    'access_score_100', 'access_score_200', 'access_score_300',
    'access_score_400', 'access_score_500', 'access_score_600',
]].copy()

In [None]:
# Summary statistics of the cumulative access scores (NaN values are still present here).
df_access_scores.describe()

In [None]:
# Variance of the numeric columns only: the frame also holds the string column
# 'dest_busid', which makes a plain var() raise on pandas >= 2.0.
df_access_scores.var(numeric_only=True)

In [None]:
# Attach the interchange table to the cumulative access scores and keep only the
# stops whose Has_Interchange value is not 1 (stops missing from the interchange
# file have NaN there after the left join and are therefore kept).
interchange = pd.read_csv('interchanges_trips.csv')
interchange['dest_busid'] = interchange['dest_busid'].astype(str)
inter_merge = df_access_scores.merge(interchange, on='dest_busid', how='left')
df_NoInter_Access = inter_merge.loc[
    inter_merge['Has_Interchange'].ne(1),
    ['dest_busid', 'TripsPerdest',
     'access_score_100', 'access_score_200', 'access_score_300',
     'access_score_400', 'access_score_500', 'access_score_600'],
].copy()
#inter_merge[inter_merge['Has_Interchange'] !=1].describe()

In [None]:
# Summary statistics of the cumulative access scores for the rows kept by the Has_Interchange != 1 filter.
df_NoInter_Access.describe()

In [None]:
# Variance of the numeric columns only: the frame also holds the string column
# 'dest_busid', which makes a plain var() raise on pandas >= 2.0.
df_NoInter_Access.var(numeric_only=True)

In [None]:
# Save the access scores in cumulative format, still containing NaN values, for
# use in NBR for case 1: y = a + Bx. The second file holds only the rows kept by
# the Has_Interchange != 1 filter.
df_access_scores.to_csv('access_scores_cum_weight.csv', index=False)
df_NoInter_Access.to_csv('access_scores_cum_NoInterchange_weight.csv', index=False)

In [None]:
# A missing cumulative score is treated as zero before the buffers are differenced.
score_cols = ['access_score_100', 'access_score_200', 'access_score_300',
              'access_score_400', 'access_score_500', 'access_score_600']
df_access_scores[score_cols] = df_access_scores[score_cols].fillna(0)

In [None]:
# Turn the cumulative scores into per-ring scores: the first buffer is taken as is,
# every further buffer is its cumulative score minus the next smaller buffer's score.
df_access_scores['accessBuffer_100'] = df_access_scores['access_score_100']
for _inner, _outer in zip((100, 200, 300, 400, 500), (200, 300, 400, 500, 600)):
    df_access_scores['accessBuffer_%d' % _outer] = (
        df_access_scores['access_score_%d' % _outer]
        - df_access_scores['access_score_%d' % _inner]
    )

In [None]:
# Clamp negative buffer differences to zero, column by column.
# The previous form, df[df[col] < 0] = 0, overwrote the WHOLE row with zeros,
# including 'dest_busid', 'TripsPerdest' and all other score columns.
buffer_cols = ['accessBuffer_200', 'accessBuffer_300', 'accessBuffer_400',
               'accessBuffer_500', 'accessBuffer_600']
df_access_scores[buffer_cols] = df_access_scores[buffer_cols].clip(lower=0)

In [None]:
#df_access_scores

In [None]:
# Summary statistics again, now including the accessBuffer_* columns.
df_access_scores.describe()

In [None]:
# Keep only the bus stop id, the trip count and the per-buffer (differenced) scores.
df_accessbuffer = df_access_scores.loc[:, [
    'dest_busid', 'TripsPerdest',
    'accessBuffer_100', 'accessBuffer_200', 'accessBuffer_300',
    'accessBuffer_400', 'accessBuffer_500', 'accessBuffer_600',
]].copy()


In [None]:
# Same Has_Interchange != 1 filter as for the cumulative scores, applied to the
# per-buffer scores.
inter_merge2 = pd.merge(df_accessbuffer, interchange, on='dest_busid', how='left')
df_NoInter_Access2 = inter_merge2[inter_merge2['Has_Interchange'] !=1].copy()
# Each column is selected once; 'dest_busid' and 'TripsPerdest' were listed
# twice before, which duplicated both columns in the frame and in the saved CSV.
df_NoInter_Access2 = df_NoInter_Access2[['dest_busid', 'TripsPerdest','accessBuffer_100', 'accessBuffer_200',
       'accessBuffer_300', 'accessBuffer_400', 'accessBuffer_500',
       'accessBuffer_600']].copy()


In [None]:
# Save the per-buffer (non-cumulative) access scores: all stops, and only the
# rows kept by the Has_Interchange != 1 filter.
df_accessbuffer.to_csv('access_scores_distinct2_weight.csv', index=False)
df_NoInter_Access2.to_csv('access_scores_distinct_NoInterchanges_weight.csv', index=False)

In [None]:
#get cumulative counts of bus stops with at least one poi
# count() tallies the non-null entries, so each bar is the number of stops that
# have a POI count recorded for that buffer.
cumulative_labels = {'TripsPerdest': 'Total_BusStops',
                     'all_poi_100': '100m', 'all_poi_200': '200m', 'all_poi_300': '300m',
                     'all_poi_400': '400m', 'all_poi_500': '500m', 'all_poi_600': '600m'}
poi_only = dest_summary[list(cumulative_labels)].rename(columns=cumulative_labels).count()
print (poi_only)
ax = poi_only.plot.bar(colormap='Paired')
plt.xlabel("Cummulative Distance buffers")
plt.ylabel("number of bus stops with at least 1 poi within buffer")

In [None]:
# List the columns currently held by the access score frame.
df_access_scores.columns

In [None]:
# Redefine df_allpoi without the bus stop id (trip count and POI counts only) and
# take a working copy for the per-buffer calculations.
df_allpoi = dest_summary.loc[:, [
    'TripsPerdest',
    'all_poi_100', 'all_poi_200', 'all_poi_300',
    'all_poi_400', 'all_poi_500', 'all_poi_600',
]].copy()
df_buffers = df_allpoi.copy()

In [None]:
# Order the stops by trip count; reset_index() keeps the previous row labels in
# a new 'index' column.
df_buffers = df_buffers.sort_values(by=['TripsPerdest']).reset_index()

In [None]:
#checking to see if low trips have pois for different buffers
# "low" means fewer than 100 trips; the last line counts how many of those stops
# have no POI count for the 400 buffer.
onetrip = df_buffers.loc[df_buffers['TripsPerdest'] < 100]
print(len(onetrip))
#onetrip.loc[onetrip.isnull().any(axis=1)]
len(onetrip.loc[onetrip['all_poi_400'].isna()])

In [None]:
#df_buffers['all_poi_100']

In [None]:
# A missing POI count is treated as zero before the buffers are differenced.
poi_count_cols = ['all_poi_100', 'all_poi_200', 'all_poi_300',
                  'all_poi_400', 'all_poi_500', 'all_poi_600']
df_buffers[poi_count_cols] = df_buffers[poi_count_cols].fillna(0)

In [None]:
# Turn the cumulative POI counts into per-ring counts: the first buffer is taken as
# is, every further buffer is its cumulative count minus the next smaller buffer's.
df_buffers['poiBuffer_100'] = df_buffers['all_poi_100']
for _inner, _outer in zip((100, 200, 300, 400, 500), (200, 300, 400, 500, 600)):
    df_buffers['poiBuffer_%d' % _outer] = (
        df_buffers['all_poi_%d' % _outer] - df_buffers['all_poi_%d' % _inner]
    )

In [None]:
#df_access_scores.astype(bool).sum(axis=0)
# Count the non-zero entries in every column (astype(bool) maps 0 to False).
df_buffers.astype(bool).sum(axis=0)

In [None]:
# List the columns currently held by the buffer frame.
df_buffers.columns

In [None]:
#to plot non cumulative counts of bus stops with at least 1 poi for different buffers
# astype(bool) turns every non-zero entry into True, so the column sums are the
# number of stops with a non-zero value in that buffer.
distinct_labels = {'TripsPerdest': 'Total_BusStops',
                   'poiBuffer_100': '100m', 'poiBuffer_200': '200m', 'poiBuffer_300': '300m',
                   'poiBuffer_400': '400m', 'poiBuffer_500': '500m', 'poiBuffer_600': '600m'}
poi_only2 = (df_buffers[list(distinct_labels)]
             .rename(columns=distinct_labels)
             .astype(bool)
             .sum(axis=0))
print (poi_only2)
ax = poi_only2.plot.bar()
plt.xlabel("Distinct Distance buffers")
plt.ylabel("number of bus stops with at least 1 poi within buffer")

In [None]:
# Count the non-zero entries in each column of the per-buffer access scores
# (astype(bool) maps 0 to False).
df_accessbuffer.astype(bool).sum(axis=0)

In [None]:
#from pandas.plotting import scatter_matrix
#scatter_matrix(df_allpoi, alpha=0.2, figsize=(50,50), diagonal = 'kde')


In [None]:
# List the columns currently held by df_allpoi.
df_allpoi.columns

In [None]:
#dest_summary[dest_summary['dest_busid']=='2235']

In [None]:
#df_DestBus600[df_DestBus600['dest_busid']=='3281']
#df_DestBus600[df_DestBus600['dest_busid']=='2959']
#df_DestBus600[df_DestBus600['dest_busid']=='2491']
#df_DestBus600[df_DestBus600['dest_busid']=='2235']