In [1]:
import numpy as np
import pandas as pd
from math import *
%load_ext line_profiler
%load_ext memory_profiler

# Optimizando Pandas

1. Bucle sobre las filas
2. Bucle sobre los índices
3. Bucle con iterrows()
4. Bucle con apply()
5. Vectorización con Pandas Series
6. Vectorización con NumPy arrays

# Funciones

In [10]:
# Great-circle distance between two points on Earth using the haversine
# formula: https://en.wikipedia.org/wiki/Haversine_formula
def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Return the great-circle distance between two points on a sphere.

    Every coordinate may be a scalar, a NumPy array or a pandas Series; the
    arithmetic is done with NumPy ufuncs, so array-like inputs are processed
    element-wise and the result has the broadcast shape of the inputs.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : latitude and longitude of the second point, in decimal degrees.
    radius : radius of the sphere. The result is expressed in the same unit.
        Defaults to 3959 (Earth's radius in miles), the value the function
        originally hard-coded.

    Returns
    -------
    The distance along the surface of the sphere, in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = (np.deg2rad(v) for v in (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points; sqrt(a) is then outside arcsin's domain and the result would be
    # NaN. Clamp to 1 (np.minimum keeps genuine NaN inputs as NaN).
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

# Baseline for the benchmark: loop over row positions of a DataFrame and
# return a list of distances from every row to a reference point.
def haversine_looping(df, ref_lat=40.671, ref_lon=-73.985):
    """Compute the haversine distance from each row of ``df`` to a reference point.

    Parameters
    ----------
    df : DataFrame with 'latitude' and 'longitude' columns (decimal degrees).
    ref_lat, ref_lon : coordinates of the reference point in decimal degrees.
        They default to the point this notebook originally hard-coded.

    Returns
    -------
    list
        One distance per row, in row order (a plain Python list, not a Series).
    """
    distance_list = []
    for i in range(len(df)):
        # Two separate positional row lookups per iteration: this is the slow
        # access pattern that section 1 of the notebook measures, so it is kept.
        d = haversine(ref_lat, ref_lon, df.iloc[i]['latitude'], df.iloc[i]['longitude'])
        distance_list.append(d)
    return distance_list

# Loop over the index labels of a DataFrame and return a list of distances
# from every row to a reference point.
def haversine_looping_index(df, ref_lat=40.671, ref_lon=-73.985):
    """Compute the haversine distance from each row of ``df`` to a reference point.

    Rows are addressed by index label with ``df.loc``.

    Parameters
    ----------
    df : DataFrame with 'latitude' and 'longitude' columns (decimal degrees).
    ref_lat, ref_lon : coordinates of the reference point in decimal degrees.
        They default to the point this notebook originally hard-coded.

    Returns
    -------
    list
        One entry per index label, in index order (a plain Python list, not a
        Series).
    """
    distance_list = []
    for idx in df.index:
        # NOTE(review): with duplicate index labels, df.loc[idx, col] yields a
        # Series instead of a scalar -- this loop assumes a unique index.
        d = haversine(ref_lat, ref_lon, df.loc[idx, 'latitude'], df.loc[idx, 'longitude'])
        distance_list.append(d)
    return distance_list

In [3]:
# Load the hotel data set used by every benchmark below. The file is read
# with the cp1252 (Windows-1252) codec.
df = pd.read_csv('new_york_hotels.csv',encoding='cp1252')
# Show the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


In [71]:
df.shape

(1631, 12)

In [72]:
df.dtypes

ean_hotel_id        int64
name               object
address1           object
city               object
state_province     object
postal_code        object
latitude          float64
longitude         float64
star_rating       float64
high_rate         float64
low_rate          float64
distance          float64
dtype: object

## 1. Bucle sobre los registros

In [8]:
%%timeit
# Benchmark 1: positional row loop (df.iloc). Each run overwrites the
# 'distance' column of df in place.
df['distance'] = haversine_looping(df)

1 loop, best of 3: 541 ms per loop


## 2. Bucle sobre los indices

In [11]:
%%timeit
# Benchmark 2: label-based loop (df.loc over df.index). Each run overwrites
# the 'distance' column of df in place.
df['distance'] = haversine_looping_index(df)

10 loops, best of 3: 54.3 ms per loop


## 3. Bucle sobre iterrows

In [34]:
%%timeit
# Haversine aplicado a las filas via iterrows
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985,\
                                      row['latitude'], row['longitude']))
df['distance'] = haversine_series

10 loops, best of 3: 148 ms per loop


## 4. Bucle con apply

In [11]:
%%timeit
# Benchmark 4: row-wise apply (axis=1) calls the lambda once per row.
# The result is not stored here; only the timing is of interest.
df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

10 loops, best of 3: 65.9 ms per loop


In [4]:
# Line-by-line profile of haversine while it is called once per row via apply().
%lprun -f haversine df.apply(lambda row: haversine(40.671, -73.985, row["latitude"], row["longitude"]), axis=1)

## 5. Vectorización con Pandas Series

In [12]:
%%timeit
# Benchmark 5: pass whole pandas Series, so haversine is called once and its
# NumPy operations run over every row at the same time.
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

1000 loops, best of 3: 1.54 ms per loop


In [13]:
# Line-by-line profile of haversine for a single vectorized call on pandas Series.
%lprun -f haversine haversine(40.671, -73.985, df["latitude"], df["longitude"])

## 6. Vectorización con NumPy arrays

In [14]:
%%timeit
# Benchmark 6: same vectorized call, but `.values` hands haversine the
# underlying NumPy arrays instead of pandas Series.
df['distance'] = haversine(40.671, -73.985,df['latitude'].values, df['longitude'].values)

The slowest run took 6.92 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 264 µs per loop


In [8]:
%%timeit
# Convert the pandas columns to NumPy ndarrays. Timed on its own to show the
# cost of the `.values` access separately from the distance computation.
np_lat = df['latitude'].values
np_lon = df['longitude'].values

The slowest run took 80.56 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 6.26 µs per loop


In [15]:
# Line-by-line profile of haversine for a single vectorized call on NumPy arrays.
%lprun -f haversine df["distance"] = haversine(40.671, -73.985, df["latitude"].values, df["longitude"].values)

### Memory usage

In [68]:
# Memory profile of haversine. This fails while haversine is defined inside
# the notebook: %mprun needs the function's source to live in a physical file
# (see the error output below).
%mprun -f haversine df["distance"] = haversine(40.671, -73.985, df["latitude"].values, df["longitude"].values)

ERROR: Could not find file <ipython-input-67-d2b5c58ef814>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.



In [65]:
# Import haversine from a module file so that %mprun can locate its source.
# NOTE(review): presumably a local haversine.py holding the same function as
# the notebook -- confirm it is not a third-party package with a different
# signature. This import rebinds the name `haversine` for every cell run
# after it.
from haversine import haversine
%mprun -f haversine df["distance"] = haversine(40.671, -73.985, df["latitude"].values, df["longitude"].values)


