# Computing Distance

## Import

In [1]:
import os
import pandas as pd
import itertools
import numpy as np
from functions.geoutilities import computeHaversine, computeHaversineVector

## Load Data

In [2]:
# Folder (relative to the notebook's working directory) holding the input files.
main_path = 'inputs'
file_path = os.path.join(main_path, 'zipcodes.csv')

# Read the zip-code lookup table into memory.
# NOTE(review): the displayed zip values (e.g. 210, 501) suggest the column is
# parsed as a number, which drops leading zeros -- confirm whether it should be
# read as text if zip codes are ever used as identifiers.
zipcodes = pd.read_csv(filepath_or_buffer=file_path)
zipcodes.head(n=10)

Unnamed: 0,zip,city,state,latitude,longitude,timezone,dst
0,210,Portsmouth,NH,43.005895,-71.013202,-5,1
1,211,Portsmouth,NH,43.005895,-71.013202,-5,1
2,212,Portsmouth,NH,43.005895,-71.013202,-5,1
3,213,Portsmouth,NH,43.005895,-71.013202,-5,1
4,214,Portsmouth,NH,43.005895,-71.013202,-5,1
5,215,Portsmouth,NH,43.005895,-71.013202,-5,1
6,501,Holtsville,NY,40.922326,-72.637078,-5,1
7,544,Holtsville,NY,40.922326,-72.637078,-5,1
8,601,Adjuntas,PR,18.180103,-66.74947,-4,0
9,602,Aguada,PR,18.363285,-67.18024,-4,0


Filter some cities

In [3]:
# Cities of interest. Matching is done on the city name only (the state column
# is not consulted), so identically named cities in other states also match.
cities = ("Chicago", "Los Angeles", "New York")
rows_cities = zipcodes['city'].isin(cities)

# A city spans many zip codes; keep only the first matching row per city name,
# so each city is represented by the coordinates of its first listed zip code.
# Chaining (rather than inplace=True) avoids mutating a slice of `zipcodes`
# and keeps the cell safe to re-run.
df_cities = (
    zipcodes
    .loc[rows_cities, ['city', 'latitude', 'longitude']]
    .drop_duplicates(subset=['city'], keep='first')
)
df_cities.head(10)

Unnamed: 0,city,latitude,longitude
3443,New York,40.750742,-73.99653
27062,Chicago,41.886456,-87.62325
38821,Los Angeles,33.972914,-118.24878


Create all possible pairwise combinations of cities (each unordered pair once) to compute distances

In [4]:
# Every unordered pair of distinct cities, as a list of (city1, city2) tuples.
combo = list(itertools.combinations(df_cities['city'], 2))

# Build the frame straight from the pairs with explicit column names. Unlike
# unpacking `zip(*combo)`, this also works when fewer than two cities were
# found: the result is then an empty frame with the expected columns.
df_dist = pd.DataFrame(combo, columns=['city1', 'city2'])
df_dist.head(10)

Unnamed: 0,city1,city2
0,New York,Chicago
1,New York,Los Angeles
2,Chicago,Los Angeles


Add latitude/longitude for both cities in each pair

In [5]:
# Attach the coordinates of each city in the pair.
# - Start from the two key columns only, so re-running this cell does not
#   collide with latitude/longitude columns added by a previous run.
# - Suffix the lookup table explicitly (city -> city1, latitude -> latitude1, ...)
#   instead of relying on merge's `suffixes` to rename overlapping columns.
# - validate='many_to_one' fails loudly if df_cities ever holds a city twice,
#   which would otherwise silently duplicate pairs.
df_dist = (
    df_dist[['city1', 'city2']]
    .merge(df_cities.add_suffix('1'), how='left', on='city1', validate='many_to_one')
    .merge(df_cities.add_suffix('2'), how='left', on='city2', validate='many_to_one')
)
df_dist.head(10)

Unnamed: 0,city1,city2,latitude1,longitude1,latitude2,longitude2
0,New York,Chicago,40.750742,-73.99653,41.886456,-87.62325
1,New York,Los Angeles,40.750742,-73.99653,33.972914,-118.24878
2,Chicago,Los Angeles,41.886456,-87.62325,33.972914,-118.24878


Compute distance

In [6]:
def _row_distance(row):
    """Apply computeHaversine to the two coordinate pairs stored in one row of df_dist."""
    return computeHaversine(
        row['latitude1'], row['longitude1'],
        row['latitude2'], row['longitude2'],
    )


# Row-by-row evaluation with the scalar helper (one call per city pair).
# NOTE(review): the unit of the result is defined by computeHaversine, which is
# not visible here -- confirm before labelling the column.
df_dist['distance'] = df_dist.apply(_row_distance, axis=1)
df_dist.head(10)

Unnamed: 0,city1,city2,latitude1,longitude1,latitude2,longitude2,distance
0,New York,Chicago,40.750742,-73.99653,41.886456,-87.62325,710.747081
1,New York,Los Angeles,40.750742,-73.99653,33.972914,-118.24878,2448.548498
2,Chicago,Los Angeles,41.886456,-87.62325,33.972914,-118.24878,1745.59311


Compute distance using vectorized function

In [7]:
# Same computation as the row-wise cell, but handing whole columns to the
# vectorized helper in a single call (origin lat/long first, then destination).
coord_columns = ['latitude1', 'longitude1', 'latitude2', 'longitude2']
df_dist['distance2'] = computeHaversineVector(*(df_dist[col] for col in coord_columns))
df_dist.head(10)


Unnamed: 0,city1,city2,latitude1,longitude1,latitude2,longitude2,distance,distance2
0,New York,Chicago,40.750742,-73.99653,41.886456,-87.62325,710.747081,710.747081
1,New York,Los Angeles,40.750742,-73.99653,33.972914,-118.24878,2448.548498,2448.548498
2,Chicago,Los Angeles,41.886456,-87.62325,33.972914,-118.24878,1745.59311,1745.59311
