In [12]:
### Importing libraries
import numpy as np
import pandas as pd
import cProfile
import timeit

In [13]:
## Reading the data file
df = pd.read_csv("clinics.csv")
df.head()

Unnamed: 0,bizID,bizCat,bizCatSub,bizName,bizAddr,bizCity,bizState,bizZip,bizPhone,bizFax,...,bizURL,locAreaCode,locFIPS,locTimeZone,locDST,locLat,locLong,locMSA,locPMSA,locCounty
0,1,Clinics,Clinics,Hino Ronald H MD,98-151 Pali Momi Street Suite 142,Aiea,HI,96701,(808)487-2477,,...,,808,15003,PST-2,N,21.398,-157.8981,3320.0,,Honolulu
1,2,Clinics,Clinics,Farmer Joesph F Md,1225 Breckenridge Drive,Little Rock,AR,72205,(501)225-2594,,...,,501,5119,CST,Y,34.7495,-92.3533,4400.0,,Pulaski
2,3,Clinics,Clinics & Medical Centers,Najjar Fadi Md,1155 West Linda Avenue Suite B,Hermiston,OR,97838,(541)289-1122,,...,,541,41059,PST,Y,45.8456,-119.2817,,,Umatilla
3,4,Clinics,Clinics & Medical Centers,Kittson Memorial Upper Level Nursing Home,1010 South Birch Avenue,Hallock,MN,56728,(218)843-2525,,...,,218,27069,CST,Y,48.7954,-97.009,,,Kittson
4,5,Clinics,Clinics & Medical Centers,Thompson Robert B Md,100 North Eagle Creek Drive,Lexington,KY,40509,(859)258-4000,,...,www.lexingtonclinic.com,859,21067,EST,Y,37.9935,-84.3712,4280.0,,Fayette


In [14]:
## Function to calculate haversine distance using latitude and longitude values
def haversine(lat1, lon1, lat2, lon2):
    MILES = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    total_miles = MILES * c
    return total_miles

In [15]:
%%timeit
## Implementing haversine distance calculation using a for loop based approach
def haversine_looping(df):
    distance_list = []
    for i in range(0, len(df)):
        d = haversine(40.671, -73.985, df.iloc[i]['locLat'], df.iloc[i]['locLong'])
        distance_list.append(d)
    return distance_list

46 ns ± 2.44 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [16]:
## cProfile for implementing haversine distance calculation using a for loop based approach
def haversine_looping(df):
    distance_list = []
    for i in range(0, len(df)):
        d = haversine(40.671, -73.985, df.iloc[i]['locLat'], df.iloc[i]['locLong'])
        distance_list.append(d)
    return distance_list
cProfile.run("df['distance'] = haversine_looping(df)")

         32540 function calls (32163 primitive calls) in 0.035 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(append)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(atleast_2d)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(bincount)
        4    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(concatenate)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(copyto)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(ndim)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(ravel)
      189    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
       30    0.001    0.000    0.001    0.000 <ipython-input-14-9bfa63cadb72>:2(haversine)
        1    0.001 

In [20]:
%%timeit
### profiling using iterows function
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['locLat'], row['locLong']))
df['distance'] = haversine_series

3.54 ms ± 349 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
### cprofile using iterrows function
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['locLat'], row['locLong']))
cProfile.run("df['distance'] = haversine_series")

         129 function calls (123 primitive calls) in 0.000 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(atleast_2d)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(copyto)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 _asarray.py:16(asarray)
        1    0.000    0.000    0.000    0.000 _asarray.py:88(asanyarray)
        1    0.000    0.000    0.000    0.000 base.py:2637(get_loc)
        2    0.000    0.000    0.000    0.000 base.py:3898(__contains__)
        1    0.000    0.000    0.000    0.000 base.py:615(__len__)
        1    0.000    0.000    0.000    0.000 blocks.py:2089(should_store)
        1    0.000    0.000    0.000    0.000 blocks.py:339(dtype)
        1    0.000    0.000    0.000    0.000 blocks.py:368(set)
        1    0.000    0.000

In [21]:
%%timeit
## Calculating haversine distance using apply() function
def lambda_haversine():
    df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['locLat'], row['locLong']), \
                          axis=1)
    return df["distance"]

45.3 ns ± 0.923 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [22]:
## cProfile for haversine distance using apply() function
def lambda_haversine(df):
    df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['locLat'], row['locLong']), \
                          axis=1)
    return df["distance"]
cProfile.run("lambda_haversine(df)")

         5594 function calls (5421 primitive calls) in 0.014 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(atleast_2d)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(copyto)
       21    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:103(release)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:143(__init__)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:147(__enter__)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:151(__exit__)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:157(_get_module_lock)
        1    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:176(cb)
        1

In [23]:
%%timeit
### Vect0rozied implementation of haversine distance calculation
df['distance'] = haversine(40.671, -73.985, df['locLat'], df['locLat'])


1.71 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
## cProfile for vectorized implementaion
cProfile.run("df['distance'] = haversine(40.671, -73.985, df['locLat'], df['locLat'])")

         5352 function calls (5293 primitive calls) in 0.008 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(atleast_2d)
       11    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(prod)
       40    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
        1    0.000    0.000    0.007    0.007 <ipython-input-14-9bfa63cadb72>:2(haversine)
        1    0.000    0.000    0.007    0.007 <string>:1(<module>)
        2    0.000    0.000    0.000    0.000 __init__.py:121(_maybe_match_name)
       11    0.000    0.000    0.000    0.000 __init__.py:156(maybe_upcast_for_op)
       11    0.000    0.000    0.000    0.000 __init__.py:424(_align_method_SERIES)
       11    0.000    0.000    0.002    0.000 __init__.py:445(_construct_result)
       11    0.000    0.000    0.005    0.000 __init__.py:492(wrap

In [25]:
%%timeit
def haversine_numpy(df):
    ## Vectorized implementation of haversine function using numpy array
    df['distance'] = haversine(40.671, -73.985, df['locLat'].values, df['locLat'].values)
haversine_numpy(df)


107 µs ± 6.16 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [26]:
## cProfile for vectorized implementation of haversine function using numpy array

cProfile.run("df['distance'] = haversine(40.671, -73.985, df['locLat'].values, df['locLat'].values)")

         202 function calls (196 primitive calls) in 0.001 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(atleast_2d)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-9bfa63cadb72>:2(haversine)
        1    0.000    0.000    0.001    0.001 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 _asarray.py:16(asarray)
        1    0.000    0.000    0.000    0.000 _asarray.py:88(asanyarray)
        2    0.000    0.000    0.000    0.000 base.py:1383(nlevels)
        3    0.000    0.000    0.000    0.000 base.py:2637(get_loc)
        4    0.000    0.000    0.000    0.000 base.py:3898(__contains__)
        1    0.000    0.000    0.000    0.000 base.py:3912(__getitem__)
        1    0.000    0.000    0.000    0.000 base.py:5294(ensure_index)
        1    0.000    0.000    0.000    0.000 base.py:615(__len__)
        1    0.00