In [41]:
import pandas as pd
from haversine import haversine, Unit
import time
import numpy as np

In [42]:
# Load the data
properties = pd.read_csv('../data/Airport Distance/bq-unique-property-postcodes.csv')
properties_stations_shortest_distance = properties.copy()

# Load the station dataset
stations = pd.read_csv("../data/Train Distance/Location_of_UK_stations.csv")
stations = stations[['stationName', 'lat', 'long']]

# Load the regions dataset
regions = pd.read_csv("../data/Train Distance/bq-train-station-region.csv")
regions = regions[['Station name', 'Region']]
regions

Unnamed: 0,Station name,Region
0,Abbey Wood,London
1,Aber,Wales
2,Abercynon,Wales
3,Aberdare,Wales
4,Aberdeen,Scotland
...,...,...
2566,York,Yorkshire and The Humber
2567,Yorton,West Midlands
2568,Ystrad Mynach,Wales
2569,Ystrad Rhondda,Wales


In [43]:
stations

Unnamed: 0,stationName,lat,long
0,Abbey Wood,51.490719,0.120343
1,Aber,51.575363,-3.230890
2,Abercynon,51.642620,-3.329549
3,Aberdare,51.715019,-3.443130
4,Aberdeen,57.143127,-2.097464
...,...,...,...
2607,Yoker,55.892792,-4.387464
2608,York,53.957966,-1.093159
2609,Yorton,52.809009,-2.736450
2610,Ystrad Mynach,51.640884,-3.241342


In [44]:
# Merge the datasets on station name
merged = pd.merge(stations, regions, left_on='stationName', right_on='Station name', how='left')
merged

Unnamed: 0,stationName,lat,long,Station name,Region
0,Abbey Wood,51.490719,0.120343,Abbey Wood,London
1,Aber,51.575363,-3.230890,Aber,Wales
2,Abercynon,51.642620,-3.329549,Abercynon,Wales
3,Aberdare,51.715019,-3.443130,Aberdare,Wales
4,Aberdeen,57.143127,-2.097464,Aberdeen,Scotland
...,...,...,...,...,...
2607,Yoker,55.892792,-4.387464,Yoker,Scotland
2608,York,53.957966,-1.093159,York,Yorkshire and The Humber
2609,Yorton,52.809009,-2.736450,Yorton,West Midlands
2610,Ystrad Mynach,51.640884,-3.241342,Ystrad Mynach,Wales


In [45]:

regions_to_exclude = ['Wales', 'Scotland', np.nan]  # list the regions to exclude
merged = merged[~merged['Region'].isin(regions_to_exclude)]

# regions = regions.dropna(subset=['Region'])
merged

Unnamed: 0,stationName,lat,long,Station name,Region
0,Abbey Wood,51.490719,0.120343,Abbey Wood,London
11,Accrington,53.753193,-2.370016,Accrington,North West
15,Acklington,55.307129,-1.651816,Acklington,North East
16,Acle,52.634647,1.543981,Acle,East of England
17,Acocks Green,52.449291,-1.818980,Acocks Green,West Midlands
...,...,...,...,...,...
2603,Yeovil Junction,50.924686,-2.613198,Yeovil Junction,South West
2604,Yeovil Pen Mill,50.944466,-2.613461,Yeovil Pen Mill,South West
2605,Yetminster,50.896114,-2.573004,Yetminster,South West
2608,York,53.957966,-1.093159,York,Yorkshire and The Humber


In [46]:
stations = merged[['stationName', 'lat', 'long']]
stations


Unnamed: 0,stationName,lat,long
0,Abbey Wood,51.490719,0.120343
11,Accrington,53.753193,-2.370016
15,Acklington,55.307129,-1.651816
16,Acle,52.634647,1.543981
17,Acocks Green,52.449291,-1.818980
...,...,...,...
2603,Yeovil Junction,50.924686,-2.613198
2604,Yeovil Pen Mill,50.944466,-2.613461
2605,Yetminster,50.896114,-2.573004
2608,York,53.957966,-1.093159


In [47]:
# Prepare a dataframe for the distances
distances = pd.DataFrame()

In [48]:
# Check the range of Lat and Long in properties data
print(properties['Lat'].min(), properties['Lat'].max())
print(properties['Long'].min(), properties['Long'].max())



50.149653 99.999999
-5.243127 1.431448


In [49]:
# Remove any rows with invalid Lat and Long values
properties = properties[(properties['Lat'] >= -90) & (properties['Lat'] <= 90)]
properties = properties[(properties['Long'] >= -180) & (properties['Long'] <= 180)]


In [50]:
properties

Unnamed: 0,Postcode,Lat,Long
0,AL1 1AJ,51.744498,-0.328599
1,AL1 1AR,51.739727,-0.317492
2,AL1 1AS,51.749073,-0.335471
3,AL1 1AT,51.742011,-0.319421
4,AL1 1AU,51.741475,-0.319273
...,...,...,...
670098,YO8 9XP,53.772109,-1.072308
670099,YO8 9YA,53.770419,-1.158828
670100,YO8 9YB,53.770000,-1.159352
670101,YO8 9YD,53.769733,-1.171313


In [51]:
# Iterate over each station
for i, station in stations.iterrows():
    station_loc = (station['lat'], station['long'])
    # Start timing the operation
    start_time = time.time()
    
    distances[station['stationName']] = properties.apply(lambda x: haversine((x['Lat'], x['Long']), station_loc), axis=1)
    
    # Calculate and print the elapsed time
    elapsed_time = time.time() - start_time
    print(f"Elapsed time for {station['stationName']}: {elapsed_time} seconds")

Elapsed time for Abbey Wood: 2.5235977172851562 seconds
Elapsed time for Accrington: 2.4679720401763916 seconds
Elapsed time for Acklington: 2.4760990142822266 seconds
Elapsed time for Acle: 2.4372410774230957 seconds
Elapsed time for Acocks Green: 2.440910816192627 seconds
Elapsed time for Acton Central: 2.4671900272369385 seconds
Elapsed time for Acton Main Line: 2.4553608894348145 seconds
Elapsed time for Adderley Park: 2.4539880752563477 seconds
Elapsed time for Addlestone: 2.4445300102233887 seconds
Elapsed time for Adisham: 2.440247058868408 seconds
Elapsed time for Adlington (Cheshire): 2.4607937335968018 seconds
Elapsed time for Adlington (Lancashire): 2.448356866836548 seconds
Elapsed time for Adwick: 2.4517099857330322 seconds
Elapsed time for Aigburth: 2.4520771503448486 seconds
Elapsed time for Ainsdale: 2.431140184402466 seconds
Elapsed time for Aintree: 2.4458348751068115 seconds
Elapsed time for Albany Park: 2.4494240283966064 seconds
Elapsed time for Albrighton: 2.44556

KeyboardInterrupt: 

In [None]:

# Join the distances to the properties dataframe
properties = pd.concat([properties, distances], axis=1)


In [None]:
properties


Unnamed: 0,Postcode,Lat,Long
0,AL1 1AJ,51.744498,-0.328599
1,AL1 1AR,51.739727,-0.317492
2,AL1 1AS,51.749073,-0.335471
3,AL1 1AT,51.742011,-0.319421
4,AL1 1AU,51.741475,-0.319273
...,...,...,...
670098,YO8 9XP,53.772109,-1.072308
670099,YO8 9YA,53.770419,-1.158828
670100,YO8 9YB,53.770000,-1.159352
670101,YO8 9YD,53.769733,-1.171313


In [None]:
properties_stations_shortest_distance['Nearest_Station_Distance'] = distances.min(axis=1)
properties_stations_shortest_distance['Nearest_Station'] = distances.idxmin(axis=1)
properties_stations_shortest_distance

Unnamed: 0,Postcode,Lat,Long,Nearest_Station_Distance,Nearest_Station
0,AL1 1AJ,51.744498,-0.328599,4.838672,Bricket Wood
1,AL1 1AR,51.739727,-0.317492,4.778527,Bricket Wood
2,AL1 1AS,51.749073,-0.335471,5.134822,Bricket Wood
3,AL1 1AT,51.742011,-0.319421,4.909981,Bricket Wood
4,AL1 1AU,51.741475,-0.319273,4.866150,Bricket Wood
...,...,...,...,...,...
670098,YO8 9XP,53.772109,-1.072308,23.278911,Adwick
670099,YO8 9YA,53.770419,-1.158828,22.003275,Adwick
670100,YO8 9YB,53.770000,-1.159352,21.954475,Adwick
670101,YO8 9YD,53.769733,-1.171313,21.886628,Adwick


In [None]:
properties_stations_shortest_distance[properties_stations_shortest_distance["Postcode"] == "WA4 2EQ"]

Unnamed: 0,Postcode,Lat,Long,Nearest_Station_Distance,Nearest_Station
632701,WA4 2EQ,53.374607,-2.557951,4.600269,Birchwood
