In [1]:
!pip install googlemaps
!pip install beautifulsoup4
!pip install lxml
!pip install requests



In [2]:
# Loading Relevant Packages
import os

import pandas as pd
import numpy as np
import googlemaps # Requires install
from bs4 import BeautifulSoup # Requires install
import requests # Requires install

In [3]:
# Reading the station dataset from CSV and previewing its first five rows
station_data_location = "Transport Data.csv"
station_df = pd.read_csv(filepath_or_buffer=station_data_location)
station_df.head(5)

Unnamed: 0,Station,OS X,OS Y,Latitude,Longitude,Zone,Postcode
0,Abbey Road,539081,183352,51.531952,0.003723,3,E15 3NB
1,Abbey Wood,547297,179002,51.490784,0.120272,4,SE2 9RH
2,Acton Central,520613,180299,51.508758,-0.26343,2,W3 6BH
3,Acton Main Line,520296,181196,51.516887,-0.26769,3,W3 9EH
4,Acton Town,519457,179639,51.503071,-0.280303,3,W3 8HN


In [4]:
# Dictionary Mapping (dummy indicator value -> readable label)
yes_no = {1: "Yes", 0: "No"}

# Creating Dummy Variables for Zones.
# "Zone" holds comma-separated zone labels, so one indicator column is produced per label.
# NOTE: pop() removes "Zone" from station_df, so this cell can only be run once per load.
zone_dummies = station_df.pop("Zone").str.get_dummies(",")

# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
zone_dummies = zone_dummies.apply(lambda column: column.map(yes_no))

# Renaming Zone Columns for readability ("1" -> "Zone 1", ...). Prefixing every dummy
# column covers whatever zone labels the data contains rather than a fixed "1".."9" list.
zone_dummies = zone_dummies.add_prefix("Zone ")

station_df = station_df.join(zone_dummies)

In [5]:
# Building one (station name, latitude, longitude) tuple per station row.
# Note the order: latitude comes before longitude, despite the variable's name.
station_columns = ["Station", "Latitude", "Longitude"]
stationname_long_lat = list(zip(*(station_df[column] for column in station_columns)))
print(stationname_long_lat[:4])

[('Abbey Road', 51.53195199, 0.003723371), ('Abbey Wood', 51.4907841, 0.12027197), ('Acton Central', 51.50875778, -0.263430199), ('Acton Main Line', 51.51688693, -0.267689952)]


In [6]:
# Define Google Maps Client
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so the secret is
# never stored in the notebook or its outputs. Any key that was ever committed in a
# notebook should be treated as compromised and rotated.
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not GOOGLE_MAPS_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook."
    )

gmaps = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)

In [7]:
# Querying Google Maps Places API for Addresses

def find_station_address(client, name, lat, lng):
    """Look up the formatted address of a station through the Google Places API.

    A text search for ``name`` near ``lat``/``lng`` is first restricted to places of
    type "transit_station"; if that returns no results, the same search is repeated
    without the type restriction. The first hit's place id is then resolved through a
    place-details request.

    Parameters:
        client: googlemaps client exposing ``places`` and ``place``.
        name: station name used as the search query.
        lat, lng: coordinates used as the search location.

    Returns:
        The ``formatted_address`` of the first matching place, or None when neither
        search returns any result.
    """
    location = str(lat) + "," + str(lng)

    # place_type=None is the unrestricted fallback search
    for place_type in ("transit_station", None):
        places_result = client.places(query=name,
                                      location=location,
                                      radius=1,
                                      type=place_type)
        results = places_result['results']
        if results:
            place_details = client.place(place_id=results[0]['place_id'])
            return place_details['result']['formatted_address']

    return None


# List of station addresses to be added to station dataframe.
# Exactly one entry is produced per station (None when no place was found), so the list
# always lines up with the dataframe rows and one failed lookup cannot abort the whole run.
station_addresses = [
    find_station_address(gmaps, name, lat, long)
    for name, lat, long in stationname_long_lat
]


In [8]:
# Attaching the looked-up addresses as an "Address" column, then discarding the
# OS X, OS Y and original Postcode columns
station_df = (
    station_df
    .assign(Address=station_addresses)
    .drop(columns=["OS X", "OS Y", "Postcode"])
)

In [9]:
# Reshaping to one row per (station, zone column), sorted by station and zone
station_df_unpivoted = station_df.melt(
    id_vars=['Station', "Latitude", "Longitude", "Address"],
    var_name='Transport Zone',
    value_name='Available',
).sort_values(["Station", "Transport Zone"])

# Keeping only the rows marked "Yes"; the helper "Available" column (the last of the
# six melted columns) is then no longer needed
is_available = station_df_unpivoted["Available"] == "Yes"
station_df_unpivoted = (
    station_df_unpivoted.loc[is_available]
    .drop(columns="Available")
    .reset_index(drop=True)
)
station_df_unpivoted.head()

Unnamed: 0,Station,Latitude,Longitude,Address,Transport Zone
0,Abbey Road,51.531952,0.003723,"London E15 3EB, UK",Zone 3
1,Abbey Wood,51.490784,0.120272,"Abbey Wood, London SE2, UK",Zone 4
2,Acton Central,51.508758,-0.26343,"Churchfield Rd, Acton, London W3 6BS, UK",Zone 2
3,Acton Main Line,51.516887,-0.26769,"Acton, London W3, UK",Zone 3
4,Acton Town,51.503071,-0.280303,"Gunnersbury Ln, Acton, London W3 8HN, UK",Zone 3


In [10]:
# True if any cell of the unpivoted station dataframe is missing
station_df_unpivoted.isna().to_numpy().any()

False

In [11]:
# Setting Webpage to be scraped
# The timeout stops the cell from hanging on a stalled connection, and raise_for_status()
# makes an HTTP error fail here instead of as a confusing parsing error further down.
response = requests.get("https://en.wikipedia.org/wiki/List_of_London_Underground_stations", timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, "lxml")

# Assigning table with data to relevant variable; `rows` holds every link (<a> tag) in it
table = soup.find("table", class_="wikitable sortable plainrowheaders")
if table is None:
    raise ValueError("Station table not found; the Wikipedia page layout may have changed.")
rows = table.find_all("a")

# Collecting the title attribute of every link (None for links without a title)
table_data = [row.get("title") for row in rows]

# Selecting relevant entries from the table.
# NOTE: the [5:-2] offsets are tied to the page layout at the time of writing — verify
# against the live page if the results look shifted.
table_data = table_data[5:-2]

# Removing N/A elements from table data list
table_data = [data for data in table_data if data is not None]

# Defining Function to split Table Data
# Split is applied on the last element of each row (elements containing Travelcard)
def list_divider(list_):
    """Split a flat list of strings into groups, using 'Travelcard' entries as separators.

    Items are accumulated into the current group until an item containing the
    substring 'Travelcard' is met; that item closes the group and is itself dropped.

    Parameters:
        list_: iterable of strings.

    Yields:
        Each non-empty group as a list, in input order. Empty groups (consecutive
        separators, a leading separator, a trailing separator or an empty input)
        are never yielded.
    """
    list_x = []
    for i in list_:
        if 'Travelcard' in i:
            if list_x:
                yield list_x
            list_x = []
        else:
            list_x.append(i)

    # Emit the trailing group only when it has content, consistent with how empty
    # groups are skipped inside the loop.
    if list_x:
        yield list_x

# Running the generator over the scraped titles and collecting its groups into a list
y = list_divider(table_data)
divided_table = [*y]

In [12]:
# The first entry of every group is taken as the station name (a Series named "Station")
station_names = pd.DataFrame(divided_table)[0].rename("Station")

In [13]:
# Substrings used to pick tube-line entries out of each group:
# an entry is kept when it contains `term` and contains neither `term2` nor `term3`
term = " line"
term2 = "line)"
term3 = "lines)"

# Filtering every group in a single pass
tube_lines = [
    [ele for ele in sub if term in ele and term2 not in ele and term3 not in ele]
    for sub in divided_table
]

# One dataframe row per group; shorter groups are padded with missing values
tube_lines = pd.DataFrame(tube_lines)

In [14]:
# Concatenating Station Names and Tube Lines Dataframes
station_df2 = pd.concat([station_names, tube_lines], axis=1)
station_df2 = station_df2.fillna("")

# Joining every tube line column into one ";"-separated string per station.
# The columns are taken from tube_lines itself, so this works for any number of line
# columns (a fixed 0..5 list fails with fewer columns and silently ignores extra ones).
line_columns = list(tube_lines.columns)
station_df2["Lines"] = station_df2[line_columns].agg(";".join, axis=1)

# Dictionary Mapping (dummy indicator value -> readable label)
yes_no = {1: "Yes", 0: "No"}

# Creating Dummy Variables for Tube Lines.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
line_dummies = station_df2.pop("Lines").str.get_dummies(";")
station_df2 = station_df2.join(line_dummies.apply(lambda column: column.map(yes_no)))

# Removing the raw tube line columns, now replaced by the dummy columns
station_df2 = station_df2.drop(columns=line_columns)

In [15]:
# Reshaping to one row per (station, tube line column), sorted by station and line
station_df2_unpivoted = station_df2.melt(
    id_vars=['Station'],
    var_name='Tube Line',
    value_name='Available',
).sort_values(["Station", "Tube Line"])

# Keeping only the rows marked "Yes"; the helper "Available" column (the last of the
# three melted columns) is then no longer needed
serves_line = station_df2_unpivoted["Available"] == "Yes"
station_df2_unpivoted = (
    station_df2_unpivoted.loc[serves_line]
    .drop(columns="Available")
    .reset_index(drop=True)
)
station_df2_unpivoted.head()

Unnamed: 0,Station,Tube Line
0,Acton Town tube station,District line
1,Acton Town tube station,Piccadilly line
2,Aldgate East tube station,District line
3,Aldgate East tube station,Hammersmith & City line
4,Aldgate tube station,Circle line (London Underground)


In [16]:
# True if any cell of the unpivoted tube line dataframe is missing
station_df2_unpivoted.isna().to_numpy().any()

False

In [19]:
# Reading the name-mapping CSV and keeping only the two name columns
station_names_mapping_location = "Station Names Mapping.csv"
mapping_columns = ["Wikipedia Station Names", "Google API Station Names"]
station_mapping_df = pd.read_csv(station_names_mapping_location)[mapping_columns]
station_mapping_df

Unnamed: 0,Wikipedia Station Names,Google API Station Names
0,Acton Town tube station,Acton Town
1,Aldgate tube station,Aldgate
2,Aldgate East tube station,Aldgate East
3,Alperton tube station,Alperton
4,Amersham station,Amersham
...,...,...
684,N/A (Pier),N/A (Pier)
685,N/A (Pier),N/A (Pier)
686,N/A (Pier),N/A (Pier)
687,N/A (Pier),N/A (Pier)


In [20]:
# Switching Wikipedia Station Names for Google API Station Names
map_dict = dict(zip(station_mapping_df["Wikipedia Station Names"], station_mapping_df["Google API Station Names"]))

# Names without a mapping entry are kept as they are instead of being turned into NaN.
# This keeps unmapped stations visible, and means that re-running the cell does not wipe
# the already-translated names (as long as no Google name is itself a key of the mapping).
current_names = station_df2_unpivoted["Station"]
station_df2_unpivoted["Station"] = current_names.map(map_dict).fillna(current_names)

In [27]:
# Left-joining the tube lines onto the station/zone rows (every station/zone row is kept),
# then putting the columns into their final order
output_columns = ["Station", "Tube Line", "Transport Zone", "Address", "Longitude", "Latitude"]
merged_station_df = station_df_unpivoted.merge(station_df2_unpivoted, how="left", on="Station")
merged_station_df = merged_station_df[output_columns]

# Rows that matched no tube line are labelled as the overground line
merged_station_df["Tube Line"] = merged_station_df["Tube Line"].fillna("Overground line")
merged_station_df.loc[merged_station_df["Tube Line"] == "Overground line"]

Unnamed: 0,Station,Tube Line,Transport Zone,Address,Longitude,Latitude
0,Abbey Road,Overground line,Zone 3,"London E15 3EB, UK",0.003723,51.531952
1,Abbey Wood,Overground line,Zone 4,"Abbey Wood, London SE2, UK",0.120272,51.490784
2,Acton Central,Overground line,Zone 2,"Churchfield Rd, Acton, London W3 6BS, UK",-0.263430,51.508758
3,Acton Main Line,Overground line,Zone 3,"Acton, London W3, UK",-0.267690,51.516887
6,Addington Village,Overground line,Zone 3,"Croydon CR0 9BA, UK",-0.032665,51.356239
...,...,...,...,...,...,...
887,Woodside,Overground line,Zone 6,"Spring Ln, Croydon SE25 5DP, UK",-0.065347,51.387098
889,Woolwich,Overground line,Zone 4,"Woolwich New Rd, Woolwich, London SE18 6EU, UK",0.071819,51.491578
890,Woolwich Arsenal,Overground line,Zone 4,"Woolwich New Rd, Woolwich, London SE18 6EU, UK",0.069194,51.489907
891,Woolwich Dockyard,Overground line,Zone 3,"Woolwich, London SE18 5JY, UK",0.054612,51.491108


In [31]:
# # Exporting Dataset as a CSV
# merged_station_df.to_csv("station_data.csv")

# # Exporting Dataset as a JSON
# merged_station_df.to_json("station_data.json")
