In [7]:
#IMPORTANT
#The setting below makes the notebook display the value of every expression in a cell,
#not only the last one (this is what ast_node_interactivity = "all" switches on)
#Code from Programming for Business Analytics Workbooks

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
#IMPORTING RELEVANT PACKAGES

import os     #importing os (used to read the Google Maps API key from an environment variable)
import re     #importing re (this is used for the manipulation of strings type variables)
from datetime import datetime     #importing datetime function from datetime package

import googlemaps     #importing Google Maps package
import pandas as pd     #importing pandas and naming it as pd

In [11]:
#LOADING THE INPUT DATASETS
#borough centroids are the journey origins, retail sites are the journey destinations

boroughs_path = "01. borough_centroids.csv"
boroughs_df = pd.read_csv(boroughs_path)
print(boroughs_df.head(5))     #preview of the first 5 boroughs

retail_path = "01. destination_retail.csv"
retail_df = pd.read_csv(retail_path)
print(retail_df.head(5))     #preview of the first 5 retail sites

           Borough Name   Latitude  Longitude
0  Barking and Dagenham  51.545272   0.133522
1                Barnet  51.616019  -0.210017
2                Bexley  51.458811   0.140346
3                 Brent  51.558554  -0.267811
4               Bromley  51.371984   0.051538
                                 Name  Postcode   Latitude  Longitude
0           Sainsburys Local SW11 3BX  SW11 3BX  51.470854  -0.170325
1              Tesco Express SE13 6RT  SE13 6RT  51.448463  -0.003753
2             Sainsburys Local W2 1HB    W2 1HB  51.519053  -0.179820
3  Marks and Spencer Hospital NW3 2QG   NW3 2QG  51.553188  -0.165299
4                     Budgens SE6 1TP   SE6 1TP  51.433405   0.009207


In [12]:
#BUILDING EVERY ORIGIN -> DESTINATION PAIR
#Two parallel lists are produced, both in the same order (all destinations for the first borough, then the next borough, ...):
#  origins_to_destinations_names - [borough name, retail name] pairs
#  origins_to_destinations_locs  - [(borough lat, borough lon), (retail lat, retail lon)] pairs

#names of the boroughs (origins) and of the retail sites (destinations)
origins_names = list(boroughs_df["Borough Name"])
destinations_names = list(retail_df["Name"])

#pairing every borough name with every retail name
origins_to_destinations_names = [
    [borough_name, retail_name]
    for borough_name in origins_names
    for retail_name in destinations_names
]
#checking the first 5 pairs
print(origins_to_destinations_names[:5])

#(latitude, longitude) tuple for every borough centroid (origin)
origins_locs = [
    (lat, lon) for lat, lon in zip(boroughs_df["Latitude"], boroughs_df["Longitude"])
]
#(latitude, longitude) tuple for every retail site (destination)
destinations_locs = [
    (lat, lon) for lat, lon in zip(retail_df["Latitude"], retail_df["Longitude"])
]

#pairing every borough location with every retail location
origins_to_destinations_locs = [
    [borough_loc, retail_loc]
    for borough_loc in origins_locs
    for retail_loc in destinations_locs
]
#checking the first 5 pairs
print(origins_to_destinations_locs[:5])

[['Barking and Dagenham', 'Sainsburys Local SW11 3BX'], ['Barking and Dagenham', 'Tesco Express SE13 6RT'], ['Barking and Dagenham', 'Sainsburys Local W2 1HB'], ['Barking and Dagenham', 'Marks and Spencer Hospital NW3 2QG'], ['Barking and Dagenham', 'Budgens SE6 1TP']]
[[(51.54527247, 0.13352213400000001), (51.47085433, -0.170324704)], [(51.54527247, 0.13352213400000001), (51.44846329, -0.0037531970000000002)], [(51.54527247, 0.13352213400000001), (51.51905329, -0.179819814)], [(51.54527247, 0.13352213400000001), (51.5531882, -0.165299286)], [(51.54527247, 0.13352213400000001), (51.43340482, 0.009206876)]]


In [13]:
#CONNECTING TO THE GOOGLE MAPS API
#The API key is read from the GOOGLE_MAPS_API_KEY environment variable so that the secret is never saved in the notebook
#NOTE: the key that used to be hard-coded in this cell has been shared with the notebook and should be revoked/rotated
#a new Google Maps API key can be generated by following the instructions below:
#https://developers.google.com/maps/documentation/javascript/get-api-key
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not api_key:
    #failing here gives a clear message instead of an authentication error on the first API call
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running this cell")
gmaps = googlemaps.Client(key=api_key)

#SETTING TIME FOR DEPARTURE (8am on Monday 9 November 2020)
#this is what all the outputs (distance and time) are based on
#8am chosen as this would be the peak of morning rush hour
#the string is written day/month/year to match the format passed to strptime below
dept_time_str = '09/11/20 08:00:00'
dept_time = datetime.strptime(dept_time_str, '%d/%m/%y %H:%M:%S')

In [14]:
#REAL RUN - GENERATING DISTANCE AND DURATION FIELDS
#One Google Maps directions request is made per origin/destination pair, so this cell is slow and uses API quota

distance = []     #distance text of each journey (e.g. '27.3 km'), in the same order as the origin/destination pairs
duration = []     #duration text of each journey (e.g. '1 hour 24 mins'), in the same order as the origin/destination pairs

#mode = "transit" asks gmaps.directions for public transport directions, departing at dept_time
#the origin and destination are the (latitude, longitude) tuples built in the earlier cell
for origin_loc, destination_loc in origins_to_destinations_locs:
    directions_result = gmaps.directions(
        origin_loc,
        destination_loc,
        mode = "transit",
        departure_time = dept_time,
    )
    #the response holds far more than we need (step by step directions etc.)
    #only the first leg of the first route is read; an empty response raises IndexError here
    first_leg = directions_result[0]['legs'][0]
    distance.append(first_leg['distance']['text'])     #keeps the distance of the journey
    duration.append(first_leg['duration']['text'])     #keeps the duration (time) of the journey

print(distance[:5])     #checking the first 5 entries of the distance list
print(duration[:5])     #checking the first 5 entries of the duration list

['27.3 km', '26.1 km', '24.8 km', '24.6 km', '34.0 km']
['1 hour 24 mins', '1 hour 23 mins', '1 hour 15 mins', '1 hour 15 mins', '1 hour 27 mins']


In [15]:
#CREATING DATAFRAME
#the dataframe (table) holds origin name, destination name, distance and time between the two points, and the activity classification

retail_activity_df = pd.DataFrame(
    origins_to_destinations_names,
    columns=['Origin', 'Destination'],     #first two columns are the origin and destination point names
).assign(**{
    "Distance (km)": distance,     #distance list from the cell above
    "Duration (mins)": duration,     #duration (time) list from the cell above
    "Activity": "Retail",     #every entry is categorized as retail (we are only working with retail destinations)
})
print(retail_activity_df.head())     #checking the first 5 rows of the dataframe

                 Origin                         Destination Distance (km)  \
0  Barking and Dagenham           Sainsburys Local SW11 3BX       27.3 km   
1  Barking and Dagenham              Tesco Express SE13 6RT       26.1 km   
2  Barking and Dagenham             Sainsburys Local W2 1HB       24.8 km   
3  Barking and Dagenham  Marks and Spencer Hospital NW3 2QG       24.6 km   
4  Barking and Dagenham                     Budgens SE6 1TP       34.0 km   

  Duration (mins) Activity  
0  1 hour 24 mins   Retail  
1  1 hour 23 mins   Retail  
2  1 hour 15 mins   Retail  
3  1 hour 15 mins   Retail  
4  1 hour 27 mins   Retail  


In [17]:
#DATA PREP 1 - DISTANCE COLUMN

#converting the distance text (e.g. "27.3 km") into a float number of kilometres
#this will allow the dataframe to be filtered/ordered by distance

def _distance_text_to_km(text):
    """Return the distance described by `text` as a float number of kilometres.

    Accepts text such as "27.3 km", "1,234 km" or "850 m". Values given in
    metres are divided by 1000 so that every entry in the column really is in
    kilometres. A value that is already a number is returned unchanged (as a
    float), which makes this cell safe to run more than once.

    Raises ValueError if the text is not a number followed by "km" or "m".
    """
    if isinstance(text, (int, float)):
        return float(text)
    match = re.fullmatch(r"([\d.,]+)\s*(km|m)", text.strip(), flags=re.IGNORECASE)
    if match is None:
        raise ValueError("Unrecognised distance text: {!r}".format(text))
    number = float(match.group(1).replace(",", ""))     #dropping any thousands separator
    if match.group(2).lower() == "m":
        return number / 1000     #metres -> kilometres
    return number

#the converted values replace the old distance column (the version that contains the unneeded strings)
retail_activity_df["Distance (km)"] = [_distance_text_to_km(x) for x in retail_activity_df["Distance (km)"]]
#checking the first 5 rows of the dataframe
print(retail_activity_df.head())

                 Origin                         Destination  Distance (km)  \
0  Barking and Dagenham           Sainsburys Local SW11 3BX           27.3   
1  Barking and Dagenham              Tesco Express SE13 6RT           26.1   
2  Barking and Dagenham             Sainsburys Local W2 1HB           24.8   
3  Barking and Dagenham  Marks and Spencer Hospital NW3 2QG           24.6   
4  Barking and Dagenham                     Budgens SE6 1TP           34.0   

  Duration (mins) Activity  
0  1 hour 24 mins   Retail  
1  1 hour 23 mins   Retail  
2  1 hour 15 mins   Retail  
3  1 hour 15 mins   Retail  
4  1 hour 27 mins   Retail  


In [18]:
#CREATING A CONVERT TIME FUNCTION
#the last function in this cell "convert time to mins" will be used to modify the Duration column's entries
#this data prepping is done in the cell below

#two sample duration strings (singular and plural "hour") used to try out the functions below
x, y = '1 hour 20 mins', '2 hours 20 mins'

def clean_time(text):
    """Shorten a duration string, e.g. '2 hours 20 mins' becomes '2 h 20 m'.

    Surrounding whitespace is stripped, every letter "s" is removed, then
    "hour" is replaced by "h" and "min" by "m".
    """
    shortened = text.strip()
    #order matters: the "s" is removed first so that "hours"/"mins" become "hour"/"min"
    for old, new in (("s", ""), ("hour", "h"), ("min", "m")):
        shortened = shortened.replace(old, new)
    return shortened

def convert_time_to_mins(text):
    """Convert a duration string into a whole number of minutes.

    Every number in `text` is read together with the unit word that follows
    it, so any combination of day, hour and minute parts is accepted, written
    in singular, plural or abbreviated form: '1 hour 20 mins', '2 hours',
    '45 mins', '1 day 3 hours' and '1 h 20 m' all work.

    Parameters:
        text (str): the duration text to convert.

    Returns:
        int: the total duration in minutes.

    Raises:
        ValueError: if no "<number> <day/hour/min>" part is found in `text`.
    """
    minutes_per_unit = {"d": 24 * 60, "h": 60, "m": 1}
    #only the first letter of the unit word is captured, which covers day(s), hour(s), min(s), h and m
    parts = re.findall(r"(\d+)\s*([dhm])[a-z]*", text.lower())
    if not parts:
        raise ValueError("Unrecognised duration text: {!r}".format(text))
    return sum(int(amount) * minutes_per_unit[unit] for amount, unit in parts)

#trying the conversion on both sample strings (expected output: 80 then 140)
for sample_text in (x, y):
    print(convert_time_to_mins(sample_text))

80
140


In [19]:
#DATA PREP 2 - DURATION COLUMN
#turning the duration text (e.g. "1 hour 24 mins" or "45 mins") into an integer number of minutes
#this will allow the dataframe to be filtered/ordered by duration

#entries containing "hour" go through convert_time_to_mins (defined in the cell above)
#entries without "hour" only have their " min(s)" suffix removed before being converted to an integer
converted_duration = [
    convert_time_to_mins(duration_text) if "hour" in duration_text
    else int(duration_text.replace("s", "").replace(" min", ""))
    for duration_text in retail_activity_df["Duration (mins)"]
]

#the converted values replace the old duration column (the version that contains the unneeded strings)
retail_activity_df["Duration (mins)"] = converted_duration
#checking the first 5 rows of the dataframe
print(retail_activity_df.head())

                 Origin                         Destination  Distance (km)  \
0  Barking and Dagenham           Sainsburys Local SW11 3BX           27.3   
1  Barking and Dagenham              Tesco Express SE13 6RT           26.1   
2  Barking and Dagenham             Sainsburys Local W2 1HB           24.8   
3  Barking and Dagenham  Marks and Spencer Hospital NW3 2QG           24.6   
4  Barking and Dagenham                     Budgens SE6 1TP           34.0   

   Duration (mins) Activity  
0               84   Retail  
1               83   Retail  
2               75   Retail  
3               75   Retail  
4               87   Retail  


In [36]:
#SUMMARY TABLE - descriptive statistics of distance and duration for each origin borough
#Use it to identify the rough max and min distances/durations
#TODO: decide whether to filter the dataframe on this information to remove possible errors

journeys_by_origin = retail_activity_df.groupby("Origin")
check = journeys_by_origin.describe()
check

Unnamed: 0_level_0,Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Barking and Dagenham,1788.0,27.491555,10.498669,1.1,20.3,27.2,35.8,53.5,1788.0,77.186801,21.735573,10.0,63.0,79.0,93.0,143.0
Barnet,1788.0,26.232494,10.1457,2.1,18.5,26.1,34.5,49.0,1788.0,79.986018,20.635763,17.0,65.0,81.0,96.0,130.0
Bexley,1788.0,28.839653,10.271569,0.6,21.8,28.8,36.6,58.9,1788.0,81.602908,21.24855,7.0,69.0,84.0,97.0,143.0
Brent,1788.0,23.754866,10.088914,0.9,16.6,22.6,31.5,51.9,1788.0,67.022931,19.353109,7.0,52.75,68.0,81.0,120.0
Bromley,1788.0,32.401454,10.724258,4.5,24.775,32.3,41.0,59.4,1788.0,87.393177,20.411034,15.0,74.0,87.0,102.0,147.0
Camden,1788.0,17.018121,8.034385,0.5,10.2,17.0,23.1,38.1,1788.0,50.012864,17.418942,4.0,37.0,50.5,63.0,100.0
City of London,1788.0,14.245358,7.415776,0.2,8.3,14.0,19.9,36.5,1788.0,40.996644,15.433957,2.0,30.0,42.0,52.0,87.0
Croydon,1788.0,28.244575,10.785255,1.5,21.8,27.9,35.9,57.2,1788.0,71.440716,19.890927,8.0,58.0,70.0,86.0,135.0
Ealing,1788.0,24.804866,10.135661,0.8,17.2,24.15,32.4,50.4,1788.0,81.319351,21.928416,6.0,67.0,83.0,98.0,134.0
Enfield,1788.0,26.935626,10.142243,0.3,19.3,26.5,35.0,48.5,1788.0,73.965884,20.926082,4.0,60.0,76.0,90.0,132.0


In [38]:
#SAVE AND EXPORT
#Writing the finished retail dataframe to a csv file (the row index is left out)
#After executing this cell the file should be available on the left hand side of the screen
output_path = '03. Retail Activities Table.csv'
retail_activity_df.to_csv(output_path, index=False)