In [2]:
#IMPORTANT
#Code below makes sure all the code in each cell is outputted
#Code from Programming for Business Analytics Workbooks

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
#IMPORTING RELEVANT PACKAGES

import googlemaps     #importing Google Maps package
from datetime import datetime     #importing datetime function from datetime package
import pandas as pd     #importing pandas and naming it as pd
import re     #importing re (this is used for the manipulation of strings type variables)

In [4]:
#IMPORTING RELEVANT DATASETS

boroughs_df = pd.read_csv("01. borough_centroids.csv")
print(boroughs_df.head())
entertainment_df = pd.read_csv("01. destination_entertainment.csv")
print(entertainment_df.head())
postcodes_df = pd.read_csv("01. postcodes_to_boroughs.csv")
print(postcodes_df.head())

           Borough Name   Latitude  Longitude
0  Barking and Dagenham  51.545272   0.133522
1                Barnet  51.616019  -0.210017
2                Bexley  51.458811   0.140346
3                 Brent  51.558554  -0.267811
4               Bromley  51.371984   0.051538
                   Name    Borough   Latitude  Longitude
0       Harlequin (Pub)  Islington  51.530110  -0.106107
1         Peasant (Pub)  Islington  51.526026  -0.103263
2   Blues Kitchen (Pub)    Hackney  51.526565  -0.079972
3  Caraway Lounge (Pub)   Havering  51.585187   0.172747
4             Rye (Pub)  Southwark  51.464652  -0.065397
  Postcode In Use?   Latitude  Longitude          County  Borough
0  BR1 1AA     Yes  51.401546   0.015415  Greater London  Bromley
1  BR1 1AB     Yes  51.406333   0.015208  Greater London  Bromley
2  BR1 1AD      No  51.400057   0.016715  Greater London  Bromley
3  BR1 1AE     Yes  51.404543   0.014195  Greater London  Bromley
4  BR1 1AF     Yes  51.401392   0.014948  Greater Lo

In [5]:
#CREATING 2 SETS OF COLUMNS(lists) from origin data and the educ data
#1st set of 2 columns are the borough (origin) and destination names put together
#2nd set of 2 columns are the borough (origin) and destination latitudes and longitudes put together

origins_names = list(boroughs_df["Borough Name"])     #assigning the values in borough names column to the origin name variable
destinations_names = list(entertainment_df["Name"])     #assigning the values in destination names column to the destination name variable

#using list comprehension to put together the origin and destination names in 2 columns (lists)
origins_to_destinations_names = [[x, y] for x in origins_names for y in destinations_names]
#checking the first 5 rows of the columns
print(origins_to_destinations_names[:5])

#zipping the values in the borough latitude and longitude columns together into one tuple and assign the value to the origin location variable
origins_locs = list(zip(boroughs_df["Latitude"], boroughs_df["Longitude"]))
#zipping the values in the borough latitude and longitude columns together into one tuple and assign the value to the origin location variable
destinations_locs = list(zip(entertainment_df["Latitude"], entertainment_df["Longitude"]))

#using list comprehension to put together the origin and destination latitudes and longitudes in 2 columns (lists)
origins_to_destinations_locs = [[x, y] for x in origins_locs for y in destinations_locs]
#checking the first 5 rows of the columns
print(origins_to_destinations_locs[:5])

[['Barking and Dagenham', 'Harlequin (Pub)'], ['Barking and Dagenham', 'Peasant (Pub)'], ['Barking and Dagenham', 'Blues Kitchen (Pub)'], ['Barking and Dagenham', 'Caraway Lounge (Pub)'], ['Barking and Dagenham', 'Rye (Pub)']]
[[(51.54527247, 0.13352213400000001), (51.53011015, -0.106106585)], [(51.54527247, 0.13352213400000001), (51.52602648, -0.10326310300000001)], [(51.54527247, 0.13352213400000001), (51.52656454, -0.079972313)], [(51.54527247, 0.13352213400000001), (51.58518675, 0.172746725)], [(51.54527247, 0.13352213400000001), (51.46465241, -0.065397387)]]


In [6]:
#ENTERING GREG'S API KEY which allows me to access Google Maps API
#a new Google Maps API key can be generated by following the instructions below:
#https://developers.google.com/maps/documentation/javascript/get-api-key
gmaps = googlemaps.Client(key='AIzaSyArIe2isFfDioQBItMx8XZ4YgqkK9CHfe8')

#SETTING TIME FOR DEPTATURE (8am on Monday)
#this is what all the outputs (distance and time) are based on
#8am chosen as this would be the peak of morning rush hour
dept_time_str = '09/11/20 08:00:00'
dept_time = datetime.strptime(dept_time_str, '%d/%m/%y %H:%M:%S')

In [7]:
#REAL RUN - GENERATING DISTANCE AND DURATION FIELDS

distance = []     #Empty list for distance variable, distance values from gmaps.direction function will be added here
duration = []     #Empty list for duration variable, duration values from gmaps.direction function will be added here

#gmaps.direction function automatically selects the fastest mode of public transport
#the function calculates the duration/distance/directions betweeen origin and destination
#this all done using locations (latitudes and longitudes) in the origins and destinations locations list (5 cells above)
for x, y in origins_to_destinations_locs:
    directions_result = gmaps.directions(x,
                                     y,
                                     mode = "transit",
                                     departure_time = dept_time
                                    )
    try:
        distance.append(directions_result[0]['legs'][0]['distance']['text'])     #appends the distance of the journey to the distance variable
        duration.append(directions_result[0]['legs'][0]['duration']['text'])     #appends the duration (time) of the journey to the duration variable
    #These two lines of code above are needed as the gmaps.direction function outputs lots of information
    #information includes distance, duration, step by step directions etc. and these are not all relevant to what we need
    except IndexError:
        distance.append("NA")
        duration.append("NA")

print(distance[:5])     #checking the first 5 rows of the distance variable (list)
print(duration[:5])     #checking the first 5 rows of the duration variable (list)

['20.8 km', '19.6 km', '17.3 km', '7.9 km', '22.3 km']
['1 hour 5 mins', '1 hour 9 mins', '58 mins', '44 mins', '1 hour 11 mins']


In [8]:
#CREATING DATAFRAME
#orgin name, destination name, distance and time between the two points, activity classification are held in the dataframe (table)

entertainment_activity_df = pd.DataFrame(origins_to_destinations_names, columns =['Origin', 'Destination'])     #creating data frame with its first two columns origin and destination point nmaes
entertainment_activity_df["Distance (km)"]=distance     #adding distance column using distance varialbe (from cell above)
entertainment_activity_df["Duration (mins)"]=duration     #adding duration (time) columm using duration variable (from cell above)
entertainment_activity_df["Activity"]="Entertainment"     #adding activity column - this is through categorizing all the entries as the relevant activity
print(entertainment_activity_df.head())     #checking the first 5 rows of the dataframe

                 Origin           Destination Distance (km) Duration (mins)  \
0  Barking and Dagenham       Harlequin (Pub)       20.8 km   1 hour 5 mins   
1  Barking and Dagenham         Peasant (Pub)       19.6 km   1 hour 9 mins   
2  Barking and Dagenham   Blues Kitchen (Pub)       17.3 km         58 mins   
3  Barking and Dagenham  Caraway Lounge (Pub)        7.9 km         44 mins   
4  Barking and Dagenham             Rye (Pub)       22.3 km  1 hour 11 mins   

        Activity  
0  Entertainment  
1  Entertainment  
2  Entertainment  
3  Entertainment  
4  Entertainment  


In [9]:
#IDENTIFYING AND REMOVING NAs (ERRORS)

entertainment_activity_df.shape     #finding the number of rows in the activity table

error_rows = entertainment_activity_df[entertainment_activity_df["Distance (km)"]=="NA"]     #Subsetting rows in dataframe for rows where NA was generated in the distance/duration column
error_rows.shape     #finding the number of rows that contained error
error_rows_index = list(error_rows.index)     #creating a list of the row numbers containing the errors
entertainment_activity_filtered_df = entertainment_activity_df.drop(error_rows_index)     #filtering out the rows which contain the errors
entertainment_activity_filtered_df.shape     #finding the number of rows in the new table (this should be: original table rows - error rows)

na_in_distance = "NA" in entertainment_activity_filtered_df["Distance (km)"]     #checking if any NAs in the new table's distance column
na_in_distance
na_in_duration = "NA" in entertainment_activity_filtered_df["Duration (mins)"]     #checking if any NAs in the new table's duration column
na_in_duration

(181863, 5)

(8743, 5)

(173120, 5)

False

False

In [10]:
#DATA PREP 1 - DISTANCE COLUMN

#removing the string " km" from the distance entries and converting the strings into a floats
#these actions are executed through list comprehension
#this will allow the dataframe to filtered/ordered by distance

entertainment_activity_filtered_df["Distance (km)"] = [float(((x.replace(" ", "")).replace("k","")).replace("m","")) for x in entertainment_activity_filtered_df["Distance (km)"]]
#checking the first 5 rows of the dataframe
print(entertainment_activity_filtered_df.head())

                 Origin           Destination  Distance (km) Duration (mins)  \
0  Barking and Dagenham       Harlequin (Pub)           20.8   1 hour 5 mins   
1  Barking and Dagenham         Peasant (Pub)           19.6   1 hour 9 mins   
2  Barking and Dagenham   Blues Kitchen (Pub)           17.3         58 mins   
3  Barking and Dagenham  Caraway Lounge (Pub)            7.9         44 mins   
4  Barking and Dagenham             Rye (Pub)           22.3  1 hour 11 mins   

        Activity  
0  Entertainment  
1  Entertainment  
2  Entertainment  
3  Entertainment  
4  Entertainment  


In [11]:
#CREATING A CONVERT TIME FUNCTION
#the last function in this cell "convert time to mins" will be used to modify the Duration column's entries
#this data prepping is done in the cell below

x = '1 hour 20 mins'
y = '2 hours 20 mins'

def clean_time(text):
    text = text.strip()
    step01 = text.replace(r"s", "")
    step02 = step01.replace(r"hour", "h")
    step03 = step02.replace(r"min", "m")
    return step03

def convert_time_to_mins(text):
    step01 = re.split(r' ', clean_time(text))
    hour = int(step01[0])
    mins = int(step01[2])
    hours_to_mins = hour * 60
    full_convert = hours_to_mins + mins
    return full_convert

print(convert_time_to_mins(x))
print(convert_time_to_mins(y))

80
140


In [12]:
#DATA PREP 2 - DURATION COLUMN
#removing the strings " hour(s)" and " min(s)" from the duration entries and converting the strings into a integers
#these actions are executed through a for loop
#this will allow the dataframe to filtered/ordered by distance
converted_duration = []

for x in entertainment_activity_filtered_df["Duration (mins)"]:
    if "hour" not in x:
        converted_duration.append(int((x.replace("s", "")).replace(" min", "")))
    else:
        converted_duration.append(convert_time_to_mins(x))

#the output of the for loop "converted duration" is replaces the old duration column (the version that contains the unneeded strings)
entertainment_activity_filtered_df["Duration (mins)"] = converted_duration
#checking the first 5 rows of the dataframe
print(entertainment_activity_filtered_df.head())

                 Origin           Destination  Distance (km)  Duration (mins)  \
0  Barking and Dagenham       Harlequin (Pub)           20.8               65   
1  Barking and Dagenham         Peasant (Pub)           19.6               69   
2  Barking and Dagenham   Blues Kitchen (Pub)           17.3               58   
3  Barking and Dagenham  Caraway Lounge (Pub)            7.9               44   
4  Barking and Dagenham             Rye (Pub)           22.3               71   

        Activity  
0  Entertainment  
1  Entertainment  
2  Entertainment  
3  Entertainment  
4  Entertainment  


In [13]:
#Use this summary table to identify rough max and min distances/durations
#Off this information filter the dataframe and remove possible errors?

check = entertainment_activity_filtered_df.groupby("Origin").describe()
check

Unnamed: 0_level_0,Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Barking and Dagenham,5246.0,25.636275,9.35387,1.0,19.4,23.0,31.1,58.7,5246.0,73.89249,19.752073,11.0,60.0,71.0,87.0,142.0
Barnet,5246.0,23.861037,9.246777,0.8,16.9,21.4,30.2,52.6,5246.0,73.852459,19.807711,11.0,58.0,72.0,88.0,139.0
Bexley,5246.0,27.000133,9.165635,0.3,21.5,24.7,32.8,56.7,5246.0,77.229318,19.863256,4.0,65.0,74.0,91.0,151.0
Brent,5246.0,21.472207,8.711252,1.0,16.0,19.0,26.3,52.8,5246.0,62.449485,17.609145,8.0,49.0,61.0,74.0,155.0
Bromley,5246.0,30.527221,9.47596,0.7,25.125,28.6,36.2,59.0,5246.0,83.654213,18.848605,8.0,72.0,81.0,95.0,166.0
Camden,5246.0,13.850515,8.320193,0.4,6.9,12.0,19.7,90.0,5246.0,43.405642,18.27445,1.0,29.0,41.0,56.0,117.0
City of London,5246.0,11.130004,7.994359,0.1,4.4,9.2,17.3,94.0,5246.0,34.775448,16.78062,1.0,20.0,33.0,47.0,114.0
Croydon,5246.0,26.679794,8.937751,1.1,22.2,25.2,31.2,61.9,5246.0,67.152497,17.836157,8.0,56.0,63.0,78.0,153.0
Ealing,5246.0,23.003031,8.633931,1.3,17.7,21.8,27.975,65.9,5246.0,76.814716,19.487132,12.0,64.0,76.0,89.0,158.0
Enfield,5246.0,24.548799,9.526943,0.2,18.0,22.1,31.0,53.6,5246.0,68.256576,20.204383,3.0,53.0,65.0,83.0,138.0


In [14]:
#SAVE AND EXPORT
#Saving the finsihed retail dataframe to the retail dummy data csv
#After executing this line the file should be available on the left hand side of the screen
entertainment_activity_filtered_df.to_csv('03. Entertainment Activities Table.csv', index=False)