In [1]:
import pandas as pd
from haversine import haversine, Unit
import time
import numpy as np
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so any deprecation or performance
# warnings that later cells would raise are hidden too; consider narrowing it.
warnings.simplefilter("ignore")

In [2]:
# Load the property postcodes. The later cells use the 'Postcode', 'Lat' and
# 'Long' columns of this frame.
# NOTE(review): this file is read from the 'Airport Distance' folder while the
# station files below come from 'Train Distance' - confirm that is intended.
# The earlier `properties.copy()` into `properties_stations_shortest_distance`
# was dropped: that name is reassigned from scratch by the nearest-station cell
# before anything reads it, so the copy only cost memory.
properties = pd.read_csv('../data/Airport Distance/bq-unique-property-postcodes.csv')

# Load the station dataset, keeping only the name and coordinate columns
stations = pd.read_csv("../data/Train Distance/Location_of_UK_stations.csv")
stations = stations[['stationName', 'lat', 'long']]

# Load the regions dataset, keeping only the station name and its region
regions = pd.read_csv("../data/Train Distance/bq-train-station-region.csv")
regions = regions[['Station name', 'Region']]
regions

Unnamed: 0,Station name,Region
0,Abbey Wood,London
1,Aber,Wales
2,Abercynon,Wales
3,Aberdare,Wales
4,Aberdeen,Scotland
...,...,...
2566,York,Yorkshire and The Humber
2567,Yorton,West Midlands
2568,Ystrad Mynach,Wales
2569,Ystrad Rhondda,Wales


In [3]:
# Preview the station table (name and coordinate columns only)
stations

Unnamed: 0,stationName,lat,long
0,Abbey Wood,51.490719,0.120343
1,Aber,51.575363,-3.230890
2,Abercynon,51.642620,-3.329549
3,Aberdare,51.715019,-3.443130
4,Aberdeen,57.143127,-2.097464
...,...,...,...
2607,Yoker,55.892792,-4.387464
2608,York,53.957966,-1.093159
2609,Yorton,52.809009,-2.736450
2610,Ystrad Mynach,51.640884,-3.241342


In [4]:
# Attach a region to every station with a left join on the station name:
# all stations are kept, and those without a region entry get a missing Region.
merged = stations.merge(
    regions,
    how='left',
    left_on='stationName',
    right_on='Station name',
)
merged

Unnamed: 0,stationName,lat,long,Station name,Region
0,Abbey Wood,51.490719,0.120343,Abbey Wood,London
1,Aber,51.575363,-3.230890,Aber,Wales
2,Abercynon,51.642620,-3.329549,Abercynon,Wales
3,Aberdare,51.715019,-3.443130,Aberdare,Wales
4,Aberdeen,57.143127,-2.097464,Aberdeen,Scotland
...,...,...,...,...,...
2607,Yoker,55.892792,-4.387464,Yoker,Scotland
2608,York,53.957966,-1.093159,York,Yorkshire and The Humber
2609,Yorton,52.809009,-2.736450,Yorton,West Midlands
2610,Ystrad Mynach,51.640884,-3.241342,Ystrad Mynach,Wales


In [5]:

# Regions to leave out of the analysis; np.nan also removes stations
# that found no region in the merge.
regions_to_exclude = ['Wales', 'Scotland', np.nan]

is_excluded = merged['Region'].isin(regions_to_exclude)
merged = merged.loc[~is_excluded]

merged

Unnamed: 0,stationName,lat,long,Station name,Region
0,Abbey Wood,51.490719,0.120343,Abbey Wood,London
11,Accrington,53.753193,-2.370016,Accrington,North West
15,Acklington,55.307129,-1.651816,Acklington,North East
16,Acle,52.634647,1.543981,Acle,East of England
17,Acocks Green,52.449291,-1.818980,Acocks Green,West Midlands
...,...,...,...,...,...
2603,Yeovil Junction,50.924686,-2.613198,Yeovil Junction,South West
2604,Yeovil Pen Mill,50.944466,-2.613461,Yeovil Pen Mill,South West
2605,Yetminster,50.896114,-2.573004,Yetminster,South West
2608,York,53.957966,-1.093159,York,Yorkshire and The Humber


In [6]:
# Reduce the region-filtered stations back to name and coordinates
stations = merged.loc[:, ['stationName', 'lat', 'long']]
stations


Unnamed: 0,stationName,lat,long
0,Abbey Wood,51.490719,0.120343
11,Accrington,53.753193,-2.370016
15,Acklington,55.307129,-1.651816
16,Acle,52.634647,1.543981
17,Acocks Green,52.449291,-1.818980
...,...,...,...
2603,Yeovil Junction,50.924686,-2.613198
2604,Yeovil Pen Mill,50.944466,-2.613461
2605,Yetminster,50.896114,-2.573004
2608,York,53.957966,-1.093159


In [7]:
# Empty frame that will hold the property-to-station distances
# (one column per station name, one row per property).
distances = pd.DataFrame()

In [8]:
# Print the observed minimum and maximum of each coordinate column
# (latitude first, then longitude), one line per column.
for coordinate_column in ('Lat', 'Long'):
    coordinate_values = properties[coordinate_column]
    print(coordinate_values.min(), coordinate_values.max())



49.895171 55.797415
-6.352647 1.762773


In [9]:
# Keep only rows whose latitude lies in [-90, 90] and whose longitude lies in
# [-180, 180]; both bounds are inclusive, and rows with a missing coordinate
# fail the check and are dropped.
lat_in_range = properties['Lat'].between(-90, 90)
long_in_range = properties['Long'].between(-180, 180)
properties = properties[lat_in_range & long_in_range]


In [10]:
# Preview the properties left after the coordinate range filter
properties

Unnamed: 0,Postcode,Lat,Long
0,GL8 8HA,51.640620,-2.152272
1,SS2 4RJ,51.547842,0.739400
2,DA14 6BQ,51.422746,0.099688
3,KT6 6HR,51.393117,-0.299780
4,ME7 4EB,51.380130,0.557023
...,...,...,...
1192273,WS13 6SY,52.688117,-1.806688
1192274,LE17 6LB,52.454033,-1.057775
1192275,CB5 8FS,52.209282,0.136840
1192276,HD9 7BE,53.589470,-1.762828


In [11]:
# Haversine function to calculate distance
def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367):
    """Great-circle distance between two sets of points (haversine formula).

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
        Inputs are combined element-wise, so a scalar on one side is applied
        against every element on the other.
    radius_km : float, optional
        Sphere radius used to scale the central angle into a distance.
        Defaults to 6367 so existing results are unchanged.
        NOTE(review): the commonly cited mean Earth radius is about 6371 km.

    Returns
    -------
    scalar or array-like
        Distance(s) in the same unit as ``radius_km`` (kilometres by default).
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; clamp the upper bound only.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Start timing the operation
start_time = time.time()

# Inserting one column at a time into a DataFrame fragments it, so the
# distances are written into a preallocated matrix and wrapped in a DataFrame
# once at the end.
# If a station name occurs more than once, only its last occurrence is kept,
# which is what repeated assignment to the same column name would produce.
unique_stations = stations.drop_duplicates(subset='stationName', keep='last')

property_lon = properties['Long'].to_numpy()
property_lat = properties['Lat'].to_numpy()

# Column-major layout keeps each station's column contiguous in memory.
distance_matrix = np.empty((len(properties), len(unique_stations)), order='F')

# Iterate over each station
for column_position, station in enumerate(unique_stations.itertuples(index=False)):
    distance_matrix[:, column_position] = haversine_np(
        property_lon, property_lat, station.long, station.lat
    )

# One row per property (same index as `properties`), one column per station name
distances = pd.DataFrame(
    distance_matrix,
    index=properties.index,
    columns=unique_stations['stationName'].to_numpy(),
    copy=False,
)

# Calculate and print the elapsed time
elapsed_time = time.time() - start_time
print(f"Elapsed time for stations distances: {elapsed_time} seconds")


Elapsed time for stations distances: 68.32654118537903 seconds


In [12]:
# Join the distances to the properties dataframe (aligned on the row index).
# Any distance columns already present are dropped first so that re-running
# this cell replaces them instead of adding a duplicate set of station columns.
properties = pd.concat(
    [properties.drop(columns=distances.columns, errors='ignore'), distances],
    axis=1,
)


In [13]:
# Preview `properties` now that the per-station distance columns are joined on
properties

Unnamed: 0,Postcode,Lat,Long,Abbey Wood,Accrington,Acklington,Acle,Acocks Green,Acton Central,Acton Main Line,...,Yardley Wood,Yarm,Yate,Yatton,Yeoford,Yeovil Junction,Yeovil Pen Mill,Yetminster,York,Yorton
0,GL8 8HA,51.640620,-2.152272,157.860844,235.217136,408.780586,275.198430,92.705591,131.255660,130.870040,...,89.130489,322.830139,22.317199,54.332366,145.660482,85.766107,83.735553,87.752204,267.157928,135.789248
1,SS2 4RJ,51.547842,0.739400,43.274358,322.432237,446.680828,132.675674,201.658801,69.477169,69.697673,...,202.312281,357.284828,219.196860,247.524705,322.713976,243.298528,242.651829,241.620305,294.801144,275.165889
2,DA14 6BQ,51.422746,0.099688,7.687715,307.963215,446.966523,166.989611,174.038899,26.903486,27.493176,...,173.921975,356.202013,175.721982,202.946759,276.418323,196.933233,196.305668,195.230897,292.947966,247.326404
3,KT6 6HR,51.393117,-0.299780,31.055366,297.174006,444.072321,186.900296,156.887896,13.088109,13.930658,...,156.256892,353.075371,148.546217,175.287333,248.759889,169.416357,168.738633,167.817479,289.983682,228.933803
4,ME7 4EB,51.380130,0.557023,32.651852,329.531482,460.275414,154.891618,201.578878,58.605835,59.097178,...,201.798661,370.207610,207.740579,234.722773,306.400909,226.676936,226.168216,224.751559,307.267572,275.217893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192273,WS13 6SY,52.688117,-1.806688,187.116255,124.148488,291.213572,225.893175,26.552549,168.138262,167.255086,...,29.805100,204.198993,134.483383,160.152185,250.100455,203.642798,201.530240,205.981179,148.845757,63.967078
1192274,LE17 6LB,52.454033,-1.057775,134.025837,168.833650,319.426362,176.959736,51.553569,118.276564,117.343874,...,54.087470,228.888319,138.385181,169.315880,261.952422,200.897346,199.038754,202.163990,167.141312,119.891677
1192275,CB5 8FS,52.209282,0.136840,79.858424,239.901816,363.720304,106.429991,135.468624,82.558546,81.797629,...,137.304686,273.749119,191.285972,223.083873,311.018637,237.591089,236.258909,237.362856,210.945683,205.426578
1192276,HD9 7BE,53.589470,-1.762828,265.670973,43.918155,191.009704,244.735539,126.758097,252.429061,251.487128,...,129.937558,105.263633,232.133730,254.707198,339.943117,301.714952,299.558969,304.328487,60.090754,108.268103


In [14]:
# Preview the distance table on its own (one column per station)
distances

Unnamed: 0,Abbey Wood,Accrington,Acklington,Acle,Acocks Green,Acton Central,Acton Main Line,Adderley Park,Addlestone,Adisham,...,Yardley Wood,Yarm,Yate,Yatton,Yeoford,Yeovil Junction,Yeovil Pen Mill,Yetminster,York,Yorton
0,157.860844,235.217136,408.780586,275.198430,92.705591,131.255660,130.870040,95.853909,119.059063,236.329571,...,89.130489,322.830139,22.317199,54.332366,145.660482,85.766107,83.735553,87.752204,267.157928,135.789248
1,43.274358,322.432237,446.680828,132.675674,201.658801,69.477169,69.697673,205.588910,87.033740,46.655387,...,202.312281,357.284828,219.196860,247.524705,322.713976,243.298528,242.651829,241.620305,294.801144,275.165889
2,7.687715,307.963215,446.966523,166.989611,174.038899,26.903486,27.493176,178.318061,40.970483,78.961622,...,173.921975,356.202013,175.721982,202.946759,276.418323,196.933233,196.305668,195.230897,292.947966,247.326404
3,31.055366,297.174006,444.072321,186.900296,156.887896,13.088109,13.930658,161.307434,13.098756,105.470720,...,156.256892,353.075371,148.546217,175.287333,248.759889,169.416357,168.738633,167.817479,289.983682,228.933803
4,32.651852,329.531482,460.275414,154.891618,201.578878,58.605835,59.097178,205.724050,72.333283,47.203526,...,201.798661,370.207610,207.740579,234.722773,306.400909,226.676936,226.168216,224.751559,307.267572,275.217893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192273,187.116255,124.148488,291.213572,225.893175,26.552549,168.138262,167.255086,22.962549,171.845803,261.129664,...,29.805100,204.198993,134.483383,160.152185,250.100455,203.642798,201.530240,205.981179,148.845757,63.967078
1192274,134.025837,168.833650,319.426362,176.959736,51.553569,118.276564,117.343874,54.013390,126.429033,205.328877,...,54.087470,228.888319,138.385181,169.315880,261.952422,200.897346,199.038754,202.163990,167.141312,119.891677
1192275,79.858424,239.901816,363.720304,106.429991,135.468624,82.558546,81.797629,138.547968,102.368448,130.065128,...,137.304686,273.749119,191.285972,223.083873,311.018637,237.591089,236.258909,237.362856,210.945683,205.426578
1192276,265.670973,43.918155,191.009704,244.735539,126.758097,252.429061,251.487128,123.051966,261.080889,329.173023,...,129.937558,105.263633,232.133730,254.707198,339.943117,301.714952,299.558969,304.328487,60.090754,108.268103


In [15]:
# Display `properties` again; no cell has modified it since the distances were joined
properties

Unnamed: 0,Postcode,Lat,Long,Abbey Wood,Accrington,Acklington,Acle,Acocks Green,Acton Central,Acton Main Line,...,Yardley Wood,Yarm,Yate,Yatton,Yeoford,Yeovil Junction,Yeovil Pen Mill,Yetminster,York,Yorton
0,GL8 8HA,51.640620,-2.152272,157.860844,235.217136,408.780586,275.198430,92.705591,131.255660,130.870040,...,89.130489,322.830139,22.317199,54.332366,145.660482,85.766107,83.735553,87.752204,267.157928,135.789248
1,SS2 4RJ,51.547842,0.739400,43.274358,322.432237,446.680828,132.675674,201.658801,69.477169,69.697673,...,202.312281,357.284828,219.196860,247.524705,322.713976,243.298528,242.651829,241.620305,294.801144,275.165889
2,DA14 6BQ,51.422746,0.099688,7.687715,307.963215,446.966523,166.989611,174.038899,26.903486,27.493176,...,173.921975,356.202013,175.721982,202.946759,276.418323,196.933233,196.305668,195.230897,292.947966,247.326404
3,KT6 6HR,51.393117,-0.299780,31.055366,297.174006,444.072321,186.900296,156.887896,13.088109,13.930658,...,156.256892,353.075371,148.546217,175.287333,248.759889,169.416357,168.738633,167.817479,289.983682,228.933803
4,ME7 4EB,51.380130,0.557023,32.651852,329.531482,460.275414,154.891618,201.578878,58.605835,59.097178,...,201.798661,370.207610,207.740579,234.722773,306.400909,226.676936,226.168216,224.751559,307.267572,275.217893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192273,WS13 6SY,52.688117,-1.806688,187.116255,124.148488,291.213572,225.893175,26.552549,168.138262,167.255086,...,29.805100,204.198993,134.483383,160.152185,250.100455,203.642798,201.530240,205.981179,148.845757,63.967078
1192274,LE17 6LB,52.454033,-1.057775,134.025837,168.833650,319.426362,176.959736,51.553569,118.276564,117.343874,...,54.087470,228.888319,138.385181,169.315880,261.952422,200.897346,199.038754,202.163990,167.141312,119.891677
1192275,CB5 8FS,52.209282,0.136840,79.858424,239.901816,363.720304,106.429991,135.468624,82.558546,81.797629,...,137.304686,273.749119,191.285972,223.083873,311.018637,237.591089,236.258909,237.362856,210.945683,205.426578
1192276,HD9 7BE,53.589470,-1.762828,265.670973,43.918155,191.009704,244.735539,126.758097,252.429061,251.487128,...,129.937558,105.263633,232.133730,254.707198,339.943117,301.714952,299.558969,304.328487,60.090754,108.268103


In [16]:
# # Drop 'Nearest_Station' and 'Nearest_Station_Distance' columns before conversion and calculation
# properties = properties.drop(['Nearest_Station', 'Nearest_Station_Distance'], axis=1)


In [17]:
# Display `properties` once more; the drop in the previous cell is commented out,
# so the frame is unchanged here
properties

Unnamed: 0,Postcode,Lat,Long,Abbey Wood,Accrington,Acklington,Acle,Acocks Green,Acton Central,Acton Main Line,...,Yardley Wood,Yarm,Yate,Yatton,Yeoford,Yeovil Junction,Yeovil Pen Mill,Yetminster,York,Yorton
0,GL8 8HA,51.640620,-2.152272,157.860844,235.217136,408.780586,275.198430,92.705591,131.255660,130.870040,...,89.130489,322.830139,22.317199,54.332366,145.660482,85.766107,83.735553,87.752204,267.157928,135.789248
1,SS2 4RJ,51.547842,0.739400,43.274358,322.432237,446.680828,132.675674,201.658801,69.477169,69.697673,...,202.312281,357.284828,219.196860,247.524705,322.713976,243.298528,242.651829,241.620305,294.801144,275.165889
2,DA14 6BQ,51.422746,0.099688,7.687715,307.963215,446.966523,166.989611,174.038899,26.903486,27.493176,...,173.921975,356.202013,175.721982,202.946759,276.418323,196.933233,196.305668,195.230897,292.947966,247.326404
3,KT6 6HR,51.393117,-0.299780,31.055366,297.174006,444.072321,186.900296,156.887896,13.088109,13.930658,...,156.256892,353.075371,148.546217,175.287333,248.759889,169.416357,168.738633,167.817479,289.983682,228.933803
4,ME7 4EB,51.380130,0.557023,32.651852,329.531482,460.275414,154.891618,201.578878,58.605835,59.097178,...,201.798661,370.207610,207.740579,234.722773,306.400909,226.676936,226.168216,224.751559,307.267572,275.217893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192273,WS13 6SY,52.688117,-1.806688,187.116255,124.148488,291.213572,225.893175,26.552549,168.138262,167.255086,...,29.805100,204.198993,134.483383,160.152185,250.100455,203.642798,201.530240,205.981179,148.845757,63.967078
1192274,LE17 6LB,52.454033,-1.057775,134.025837,168.833650,319.426362,176.959736,51.553569,118.276564,117.343874,...,54.087470,228.888319,138.385181,169.315880,261.952422,200.897346,199.038754,202.163990,167.141312,119.891677
1192275,CB5 8FS,52.209282,0.136840,79.858424,239.901816,363.720304,106.429991,135.468624,82.558546,81.797629,...,137.304686,273.749119,191.285972,223.083873,311.018637,237.591089,236.258909,237.362856,210.945683,205.426578
1192276,HD9 7BE,53.589470,-1.762828,265.670973,43.918155,191.009704,244.735539,126.758097,252.429061,251.487128,...,129.937558,105.263633,232.133730,254.707198,339.943117,301.714952,299.558969,304.328487,60.090754,108.268103


In [18]:
chunk_size = 5000
total_chunks = int(np.ceil(len(properties) / chunk_size))
start_time = time.time()

# Select the station-distance columns by name rather than by position.
# A positional slice such as `iloc[:, 3:-2]` only lines up once the two
# 'Nearest_*' columns exist; on a fresh run it silently dropped the last two
# stations from the first chunk.
non_station_columns = {'Postcode', 'Lat', 'Long', 'Nearest_Station_Distance', 'Nearest_Station'}
station_columns = [column for column in properties.columns if column not in non_station_columns]

# Per-chunk results are collected here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the frame each call).
chunk_results = []

for i in range(0, len(properties), chunk_size):
    # Time the operation
    operation_start_time = time.time()

    # Get the chunk of data and its station distances as floats
    properties_chunk = properties.iloc[i:i + chunk_size]
    chunk_distances = properties_chunk[station_columns].astype(float)

    # Minimum distance and the name of the station (column label) that achieves it
    chunk_results.append(pd.DataFrame({
        'Postcode': properties_chunk['Postcode'],
        'Nearest_Station_Distance': chunk_distances.min(axis=1),
        'Nearest_Station': chunk_distances.idxmin(axis=1),
    }))

    # Calculate and print the elapsed time for the operation and progress
    operation_elapsed_time = time.time() - operation_start_time
    total_elapsed_time = time.time() - start_time
    print(f'Processed chunk {i // chunk_size + 1} / {total_chunks}, operation time: {operation_elapsed_time} seconds, total elapsed time: {total_elapsed_time} seconds')

# Assemble the final result in one step; it keeps the row index of `properties`
if chunk_results:
    properties_stations_shortest_distance = pd.concat(chunk_results)
else:
    properties_stations_shortest_distance = pd.DataFrame(
        columns=['Postcode', 'Nearest_Station_Distance', 'Nearest_Station']
    )

# Also expose the results on `properties`, as the per-chunk writes used to do.
# Chunks are consecutive row slices, so the rows are already in the same order
# and can be assigned positionally.
properties['Nearest_Station_Distance'] = properties_stations_shortest_distance['Nearest_Station_Distance'].to_numpy()
properties['Nearest_Station'] = properties_stations_shortest_distance['Nearest_Station'].to_numpy()


Processed chunk 1 / 239, operation time: 0.9358251094818115 seconds, total elapsed time: 0.9544351100921631 seconds
Processed chunk 2 / 239, operation time: 0.417741060256958 seconds, total elapsed time: 1.3722429275512695 seconds
Processed chunk 3 / 239, operation time: 0.5525329113006592 seconds, total elapsed time: 1.9248378276824951 seconds
Processed chunk 4 / 239, operation time: 0.36725401878356934 seconds, total elapsed time: 2.2921669483184814 seconds
Processed chunk 5 / 239, operation time: 0.676314115524292 seconds, total elapsed time: 2.968546152114868 seconds
Processed chunk 6 / 239, operation time: 0.9976696968078613 seconds, total elapsed time: 3.96628999710083 seconds
Processed chunk 7 / 239, operation time: 0.783595085144043 seconds, total elapsed time: 4.749984979629517 seconds
Processed chunk 8 / 239, operation time: 0.5386979579925537 seconds, total elapsed time: 5.288750886917114 seconds
Processed chunk 9 / 239, operation time: 0.40950989723205566 seconds, total ela

In [19]:
# properties_stations_shortest_distance = properties[['Postcode', 'Nearest_Station_Distance', 'Nearest_Station']]
# properties_stations_shortest_distance

In [20]:
# Preview the per-postcode nearest-station results built by the chunked loop
properties_stations_shortest_distance

Unnamed: 0,Postcode,Nearest_Station_Distance,Nearest_Station
0,GL8 8HA,9.742920,Kemble
1,SS2 4RJ,1.111976,Southend East
2,DA14 6BQ,1.315313,Sidcup
3,KT6 6HR,0.299212,Surbiton
4,ME7 4EB,0.904389,Gillingham (Kent)
...,...,...,...
1192273,WS13 6SY,0.469461,Lichfield Trent Valley
1192274,LE17 6LB,10.497000,Market Harborough
1192275,CB5 8FS,1.641322,Cambridge
1192276,HD9 7BE,0.985377,Brockholes


In [21]:
# Spot-check the result for a single postcode
is_target_postcode = properties_stations_shortest_distance["Postcode"] == "SW2 3BQ"
properties_stations_shortest_distance[is_target_postcode]

Unnamed: 0,Postcode,Nearest_Station_Distance,Nearest_Station
425421,SW2 3BQ,0.48038,Streatham Hill


In [22]:
# Destination of the nearest-station results
output_file_path = '../data/properties_trains_shortest_distance.csv'

# Save one row per postcode (postcode, nearest-station distance, nearest
# station) without writing the row index.
properties_stations_shortest_distance.to_csv(path_or_buf=output_file_path, index=False)