In [1]:
#IMPORTANT
#The setting below makes IPython display the result of every top-level expression in a cell,
#not just the last one (later cells rely on this to show several values from one cell)
#Code from Programming for Business Analytics Workbooks

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#IMPORTING RELEVANT PACKAGES

import os     #importing os (this is used to read the Google Maps API key from an environment variable)
import re     #importing re (this is used for the manipulation of strings type variables)
from datetime import datetime     #importing datetime function from datetime package

import googlemaps     #importing Google Maps package
import pandas as pd     #importing pandas and naming it as pd

In [3]:
#LOADING THE INPUT DATASETS
#boroughs_df is read from the borough centroids file, education_df from the education destinations file

boroughs_df = pd.read_csv("01. borough_centroids.csv")
education_df = pd.read_csv("01. destination_education.csv")

#previewing the first 5 rows of each table, one after the other
print(boroughs_df.head(), education_df.head(), sep="\n")

           Borough Name   Latitude  Longitude
0  Barking and Dagenham  51.545272   0.133522
1                Barnet  51.616019  -0.210017
2                Bexley  51.458811   0.140346
3                 Brent  51.558554  -0.267811
4               Bromley  51.371984   0.051538
          Name  Postcode   Latitude  Longitude
0   School_566  IG11 8JA  51.541366   0.075290
1   School_587  IG11 8AS  51.536640   0.075507
2   School_589  IG11 7AR  51.534482   0.077039
3  School_1191  SW19 2JY  51.413454  -0.190817
4  School_1194  SW19 1NU  51.416490  -0.197643


In [4]:
#PAIRING EVERY ORIGIN (borough) WITH EVERY DESTINATION (education site)
#two parallel lists are built, both in the same origin-major order:
#  origins_to_destinations_names -> [borough name, destination name]
#  origins_to_destinations_locs  -> [(borough lat, borough lng), (destination lat, destination lng)]

def _pair_each_origin_with_each_destination(origins, destinations):
    """Return [origin, destination] for every origin combined with every destination."""
    return [[origin, destination] for origin in origins for destination in destinations]

#names taken straight from the two input tables
origins_names = boroughs_df["Borough Name"].tolist()
destinations_names = education_df["Name"].tolist()

origins_to_destinations_names = _pair_each_origin_with_each_destination(origins_names, destinations_names)
#checking the first 5 pairings
print(origins_to_destinations_names[:5])

#(latitude, longitude) tuples for the origins (boroughs)...
origins_locs = [(lat, lng) for lat, lng in zip(boroughs_df["Latitude"], boroughs_df["Longitude"])]
#...and for the destinations (education sites)
destinations_locs = [(lat, lng) for lat, lng in zip(education_df["Latitude"], education_df["Longitude"])]

origins_to_destinations_locs = _pair_each_origin_with_each_destination(origins_locs, destinations_locs)
#checking the first 5 pairings
print(origins_to_destinations_locs[:5])

[['Barking and Dagenham', 'School_566'], ['Barking and Dagenham', 'School_587'], ['Barking and Dagenham', 'School_589'], ['Barking and Dagenham', 'School_1191'], ['Barking and Dagenham', 'School_1194']]
[[(51.54527247, 0.13352213400000001), (51.54136554, 0.075289583)], [(51.54527247, 0.13352213400000001), (51.53664047, 0.075507302)], [(51.54527247, 0.13352213400000001), (51.53448207, 0.077038517)], [(51.54527247, 0.13352213400000001), (51.41345369, -0.19081689999999998)], [(51.54527247, 0.13352213400000001), (51.41649047, -0.19764337199999998)]]


In [5]:
#CONNECTING TO THE GOOGLE MAPS API
#the API key is read from the GOOGLE_MAPS_API_KEY environment variable rather than typed into the notebook:
#a key written in a cell is visible to anyone who can open or download the file
#(any key that has previously been saved inside a notebook should be revoked and regenerated)
#a new Google Maps API key can be generated by following the instructions below:
#https://developers.google.com/maps/documentation/javascript/get-api-key
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not api_key:
    #failing here gives a clear message instead of an authentication error part-way through the run
    raise RuntimeError("GOOGLE_MAPS_API_KEY is not set - set it to your Google Maps API key before running this cell")
gmaps = googlemaps.Client(key=api_key)

#SETTING TIME FOR DEPARTURE (8am on Monday 9 November 2020)
#this is what all the outputs (distance and time) are based on
#8am chosen as this would be the peak of morning rush hour
#NOTE(review): dept_time carries no timezone - presumably treated as local time by the client; confirm
dept_time_str = '09/11/20 08:00:00'
dept_time = datetime.strptime(dept_time_str, '%d/%m/%y %H:%M:%S')

In [6]:
#REAL RUN - REQUESTING A JOURNEY FOR EVERY ORIGIN/DESTINATION PAIRING

#one entry is added to each list per pairing, in the same order as origins_to_destinations_locs
distance = []
duration = []

#gmaps.directions is asked for a public transport ("transit") journey leaving at dept_time
#each pairing is an (origin location, destination location) pair of latitude/longitude tuples
for origin_loc, destination_loc in origins_to_destinations_locs:
    routes = gmaps.directions(origin_loc, destination_loc, mode="transit", departure_time=dept_time)
    try:
        #the response contains much more (step by step directions etc.); only the
        #distance and duration text of the first leg of the first route are kept
        first_leg = routes[0]['legs'][0]
        leg_distance = first_leg['distance']['text']
        leg_duration = first_leg['duration']['text']
    except IndexError:
        #no route (or no leg) came back for this pairing - mark it so it can be removed later
        leg_distance = leg_duration = "NA"
    distance.append(leg_distance)
    duration.append(leg_duration)

#checking the first 5 entries of each list
print(distance[:5])
print(duration[:5])

['4.9 km', '4.7 km', '4.8 km', '31.4 km', '31.6 km']
['25 mins', '23 mins', '24 mins', '1 hour 25 mins', '1 hour 28 mins']


In [7]:
#ASSEMBLING THE RESULTS TABLE
#one row per origin/destination pairing: both names, the journey distance and duration, and the activity label

education_activity_df = pd.DataFrame(
    origins_to_destinations_names, columns=['Origin', 'Destination']     #first two columns: origin and destination names
).assign(**{
    "Distance (km)": distance,     #distance list built by the journey loop
    "Duration (mins)": duration,     #duration list built by the journey loop
    "Activity": "Education",     #every row in this table is labelled with the same activity
})

#checking the first 5 rows of the dataframe
print(education_activity_df.head())

                 Origin  Destination Distance (km) Duration (mins)   Activity
0  Barking and Dagenham   School_566        4.9 km         25 mins  Education
1  Barking and Dagenham   School_587        4.7 km         23 mins  Education
2  Barking and Dagenham   School_589        4.8 km         24 mins  Education
3  Barking and Dagenham  School_1191       31.4 km  1 hour 25 mins  Education
4  Barking and Dagenham  School_1194       31.6 km  1 hour 28 mins  Education


In [8]:
#IDENTIFYING AND REMOVING NAs (ERRORS)
#rows where no journey was found carry the string "NA" in the distance and duration columns

education_activity_df.shape     #finding the number of rows in the activity table

error_rows = education_activity_df[education_activity_df["Distance (km)"]=="NA"]     #subsetting the rows where NA was recorded in the distance column
error_rows.shape     #finding the number of rows that contained error
error_rows_index = list(error_rows.index)     #creating a list of the row labels containing the errors

education_activity_filtered_df = education_activity_df.drop(error_rows_index)     #filtering out the rows which contain the errors
education_activity_filtered_df.shape     #finding the number of rows in the new table (this should be: original table rows - error rows)

#checking that no "NA" VALUES remain in either column
#note: `"NA" in some_series` looks at the index labels, not the values, so it cannot be used for this check;
#the values are compared explicitly instead (bool() keeps the displayed result a plain True/False)
na_in_distance = bool(education_activity_filtered_df["Distance (km)"].eq("NA").any())
na_in_distance
na_in_duration = bool(education_activity_filtered_df["Duration (mins)"].eq("NA").any())
na_in_duration

(129822, 5)

(427, 5)

(129395, 5)

False

False

In [9]:
#DATA PREP 1 - DISTANCE COLUMN

#converting the distance strings (e.g. "4.9 km") into floats measured in kilometres
#this will allow the dataframe to be filtered/ordered by distance

def _distance_text_to_km(distance_text):
    """Convert a distance string such as '4.9 km' into a float number of kilometres.

    The unit is read rather than stripped, so that a value expressed in metres
    (e.g. '850 m') becomes 0.85 instead of being mistaken for 850 km.
    Thousands separators (e.g. '1,234 km') are accepted.

    Raises ValueError when the string is not a number followed by 'km' or 'm'.
    """
    match = re.fullmatch(r"\s*([\d.,]+)\s*(km|m)\s*", distance_text)
    if match is None:
        raise ValueError("Unrecognised distance text: {!r}".format(distance_text))
    amount = float(match.group(1).replace(",", ""))
    return amount if match.group(2) == "km" else amount / 1000

#the converted values replace the old distance column (the version that contains the unit strings)
education_activity_filtered_df["Distance (km)"] = [_distance_text_to_km(x) for x in education_activity_filtered_df["Distance (km)"]]
#checking the first 5 rows of the dataframe
print(education_activity_filtered_df.head())

                 Origin  Destination  Distance (km) Duration (mins)   Activity
0  Barking and Dagenham   School_566            4.9         25 mins  Education
1  Barking and Dagenham   School_587            4.7         23 mins  Education
2  Barking and Dagenham   School_589            4.8         24 mins  Education
3  Barking and Dagenham  School_1191           31.4  1 hour 25 mins  Education
4  Barking and Dagenham  School_1194           31.6  1 hour 28 mins  Education


In [10]:
#CREATING A CONVERT TIME FUNCTION
#"convert_time_to_mins" (defined further down this cell) is the function used to modify the Duration column's entries
#that data prepping is done in the next cell
#x and y are sample duration strings (singular "hour" and plural "hours") for trying the functions out

x, y = '1 hour 20 mins', '2 hours 20 mins'

def clean_time(text):
    """Abbreviate a duration string, e.g. '2 hours 20 mins' -> '2 h 20 m'.

    Surrounding whitespace is removed, then three replacements are applied in
    order: every 's' is dropped, 'hour' becomes 'h' and 'min' becomes 'm'.
    """
    shortened = text.strip()
    #order matters: the plural 's' is removed first so only 'hour' and 'min' remain to abbreviate
    for old, new in (("s", ""), ("hour", "h"), ("min", "m")):
        shortened = shortened.replace(old, new)
    return shortened

def convert_time_to_mins(text):
    """Convert a duration string into a whole number of minutes.

    Every "<number> <unit>" part of the string is read and added up, where the
    unit is day(s), hour(s) or min(s). For example:
        '1 hour 20 mins' -> 80
        '2 hours'        -> 120   (no minutes part needed)
        '25 mins'        -> 25    (no hours part needed)
        '1 day 2 hours'  -> 1560

    Raises ValueError when no such part is found in the string.
    """
    minutes_per_unit = {"day": 24 * 60, "hour": 60, "min": 1}
    #the pattern matches the singular form; a trailing plural "s" is simply left unmatched
    parts = re.findall(r"(\d+)\s*(day|hour|min)", text)
    if not parts:
        raise ValueError("No duration found in {!r}".format(text))
    return sum(int(amount) * minutes_per_unit[unit] for amount, unit in parts)

#trying the conversion on both sample strings, one result per line
print(convert_time_to_mins(x), convert_time_to_mins(y), sep="\n")

80
140


In [11]:
#DATA PREP 2 - DURATION COLUMN
#turning the duration strings (e.g. "25 mins", "1 hour 25 mins") into integers measured in minutes
#this will allow the dataframe to be filtered/ordered by duration
#entries mentioning an hour go through convert_time_to_mins; minutes-only entries are stripped of their unit directly
converted_duration = [
    convert_time_to_mins(duration_text)
    if "hour" in duration_text
    else int(duration_text.replace("s", "").replace(" min", ""))
    for duration_text in education_activity_filtered_df["Duration (mins)"]
]

#the converted values replace the old duration column (the version that contains the unit strings)
education_activity_filtered_df["Duration (mins)"] = converted_duration
#checking the first 5 rows of the dataframe
print(education_activity_filtered_df.head())

                 Origin  Destination  Distance (km)  Duration (mins)  \
0  Barking and Dagenham   School_566            4.9               25   
1  Barking and Dagenham   School_587            4.7               23   
2  Barking and Dagenham   School_589            4.8               24   
3  Barking and Dagenham  School_1191           31.4               85   
4  Barking and Dagenham  School_1194           31.6               88   

    Activity  
0  Education  
1  Education  
2  Education  
3  Education  
4  Education  


In [12]:
#SUMMARY STATISTICS PER ORIGIN
#use this table to identify the rough minimum and maximum distances/durations for each borough
#TODO: decide whether to filter the dataframe on these ranges to remove possible errors

journeys_by_origin = education_activity_filtered_df.groupby("Origin")
check = journeys_by_origin.describe()
check

Unnamed: 0_level_0,Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Distance (km),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins),Duration (mins)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Barking and Dagenham,3921.0,31.381586,13.316556,0.6,21.7,30.6,41.1,85.0,3921.0,86.459322,25.3198,8.0,70.0,88.0,104.0,161.0
Barnet,3920.0,29.714847,13.106227,0.7,19.8,29.5,39.1,92.5,3920.0,89.323724,23.475026,8.0,74.0,91.0,106.0,164.0
Bexley,3921.0,32.838613,13.082087,0.6,23.9,32.5,41.5,87.6,3921.0,92.051007,24.272664,7.0,78.0,94.0,108.0,183.0
Brent,3921.0,27.547539,13.015053,1.0,17.8,26.9,36.4,93.9,3921.0,76.267024,22.141633,12.0,61.0,77.0,93.0,153.0
Bromley,3921.0,36.018771,13.616086,1.2,26.2,35.9,45.2,94.0,3921.0,96.268809,24.037294,13.0,81.0,98.0,113.0,169.0
Camden,3921.0,21.298036,10.787378,0.2,13.1,20.3,28.2,74.7,3921.0,59.999235,19.951977,1.0,46.0,60.0,74.0,129.0
City of London,3922.0,18.838985,10.077346,0.6,11.0,17.6,24.8,71.3,3922.0,51.427078,17.662494,7.0,39.0,51.0,63.0,114.0
Croydon,3921.0,32.113466,13.428095,0.4,23.4,32.2,40.3,91.9,3921.0,81.346085,22.853746,5.0,65.0,81.0,97.0,167.0
Ealing,3921.0,28.284621,12.574965,0.2,19.1,27.6,36.7,99.6,3921.0,90.738587,24.974165,3.0,75.0,92.0,108.0,169.0
Enfield,3921.0,30.0823,13.153243,0.4,20.6,29.2,40.4,87.5,3921.0,82.747258,24.559219,6.0,66.0,85.0,101.0,151.0


In [15]:
#SAVE AND EXPORT
#writing the finished education activities dataframe to a csv file (the row index is left out)
#after executing this cell the file should be available in the notebook's working folder
output_path = '03. Education Activities Table.csv'
education_activity_filtered_df.to_csv(output_path, index=False)