Compare speed between native and cythonized math functions

In [1]:
# Register the Cython IPython extension so the %%cython cell magic used below is available.
%load_ext Cython

In [2]:
%%cython
from libc.math cimport log, sqrt
def log_c(double x):
    """Return the base-10 logarithm of ``x`` computed with libc's ``log``.

    ``x`` is declared as a C ``double`` so the incoming Python float keeps
    its full 64-bit precision; declaring it ``float`` would narrow it to
    single precision before the logarithm is taken.
    """
    # 2.302585092994046 is ln(10): log10(x) == ln(x) / ln(10)
    return log(x) / 2.302585092994046
def sqrt_c(double x):
    """Return the square root of ``x`` computed with libc's ``sqrt``.

    ``x`` is declared as a C ``double`` so the incoming Python float keeps
    its full 64-bit precision; declaring it ``float`` would narrow it to
    single precision before the root is taken.
    """
    return sqrt(x)

In [3]:
import os
from os.path import expanduser
import pandas as pd
import pandas.io.sql as pd_sql
import math
from functions.auth.connections import postgres_connection

# Connection for the 'mountain_project' database; it is passed to read_sql
# when the route data is loaded further down.
# NOTE(review): presumably a Postgres connection URI, going by the helper and
# variable names — confirm against functions.auth.connections.
connection_uri = postgres_connection('mountain_project')

In [4]:
def transform_features(df):
    """Add log10 and sqrt feature columns to ``df`` using the stdlib ``math`` module.

    The columns are assigned directly onto ``df`` (the frame is modified in
    place) and that same object is returned.

    Args:
        df: pandas DataFrame holding the numeric columns ``star_ratings``,
            ``ticks``, ``avg_stars``, ``length_``, ``grade`` and
            ``on_to_do_lists``.

    Returns:
        The same ``df`` with the ``log_*`` and ``sqrt_*`` columns added (or
        overwritten if they already exist).

    Raises:
        KeyError: if one of the required source columns is missing.
        ValueError: raised by ``math`` when a shifted value is outside the
            function's domain (<= 0 for log10, < 0 for sqrt).
    """
    # (output column, source column, offset added before taking log10).
    # NOTE(review): grade gets a larger offset, presumably because it can be
    # as low as -1 — confirm against the data.
    log_columns = (
        ('log_star_ratings', 'star_ratings', 1),
        ('log_ticks', 'ticks', 1),
        ('log_avg_stars', 'avg_stars', 1),
        ('log_length', 'length_', 1),
        ('log_grade', 'grade', 2),
        ('log_on_to_do_lists', 'on_to_do_lists', 1),  # Target
    )
    # (output column, source column) for the un-shifted square roots.
    sqrt_columns = (
        ('sqrt_star_ratings', 'star_ratings'),
        ('sqrt_ticks', 'ticks'),
        ('sqrt_avg_stars', 'avg_stars'),
        ('sqrt_length', 'length_'),
    )

    # add log values for ols linear regression
    for out_col, src_col, offset in log_columns:
        # math.log10 instead of math.log(x, 10): the two-argument form is
        # evaluated as ln(x)/ln(10) and is inexact for exact powers of ten
        # (e.g. math.log(1000, 10) == 2.9999999999999996).
        # `shift=offset` binds the current offset at lambda creation time.
        df[out_col] = df[src_col].apply(lambda x, shift=offset: math.log10(x + shift))

    # add sqrt values for Poisson regression
    for out_col, src_col in sqrt_columns:
        # Kept as a lambda (rather than passing math.sqrt directly) so the
        # per-element call overhead matches transform_features_cythonized,
        # which this function is benchmarked against.
        df[out_col] = df[src_col].apply(lambda x: math.sqrt(x))
    df['sqrt_grade'] = df['grade'].apply(lambda x: math.sqrt(x + 1))

    return df


def transform_features_cythonized(df):
    """Add the log and sqrt feature columns via the Cython helpers.

    Every value goes through ``log_c`` / ``sqrt_c`` (compiled in the
    ``%%cython`` cell) instead of the ``math`` module. Columns are written
    straight onto ``df`` and the same frame is returned.
    """
    # (target column, source column, shift applied before log_c)
    shifted_logs = (
        ('log_star_ratings', 'star_ratings', 1),
        ('log_ticks', 'ticks', 1),
        ('log_avg_stars', 'avg_stars', 1),
        ('log_length', 'length_', 1),
        ('log_grade', 'grade', 2),
        ('log_on_to_do_lists', 'on_to_do_lists', 1),
    )
    # (target column, source column) for roots taken without a shift
    plain_roots = (
        ('sqrt_star_ratings', 'star_ratings'),
        ('sqrt_ticks', 'ticks'),
        ('sqrt_avg_stars', 'avg_stars'),
        ('sqrt_length', 'length_'),
    )

    # log features, used for the ols linear regression
    for target, source, shift in shifted_logs:
        # bind the current shift as a default so each lambda keeps its own value
        df[target] = df[source].apply(lambda value, shift=shift: log_c(value + shift))

    # sqrt features, used for the Poisson regression
    for target, source in plain_roots:
        df[target] = df[source].apply(lambda value: sqrt_c(value))
    df['sqrt_grade'] = df['grade'].apply(lambda value: sqrt_c(value + 1))

    return df

In [5]:
# Load the benchmark data: route attributes from `routes` joined to the
# per-route counts in `ratings`, limited to the 'buttermilks' area and to rows
# whose length_ is present.
# The LEFT JOIN keeps routes that have no matching ratings row, so the r.*
# columns can come back NULL for those routes.
# NOTE(review): `length_` in the WHERE clause is unqualified — presumably it
# only exists on `routes`; confirm against the schema.
query = """
SELECT b.avg_stars, b.your_stars, b.length_, b.grade,
       r.star_ratings, r.suggested_ratings, r.on_to_do_lists, r.ticks
FROM routes b
LEFT JOIN ratings r ON b.url_id = r.url_id
WHERE b.area_name = 'buttermilks'
  AND length_ IS NOT NULL
"""
df = pd_sql.read_sql(query, connection_uri) # run the query and load the result as a DataFrame
df.head() # preview the first rows

Unnamed: 0,avg_stars,your_stars,length_,grade,star_ratings,suggested_ratings,on_to_do_lists,ticks
0,1.3,-1,10.0,-1.0,30,5,8,52
1,2.0,-1,15.0,-1.0,1,1,0,1
2,2.2,-1,18.0,-0.25,41,6,11,87
3,2.8,-1,18.0,-0.25,84,20,49,168
4,1.8,-1,12.0,-0.25,46,15,12,102


In [6]:
%%timeit
# Baseline timing: math-module functions applied element-wise via Series.apply.
# transform_features assigns its columns onto df itself, so every timing loop
# rewrites the same log_*/sqrt_* columns on the shared frame.
transform_features(df)

3.15 ms ± 55.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
%%timeit
# Same workload, but routed through the Cython-compiled log_c / sqrt_c helpers.
# NOTE(review): the small gap to the baseline is presumably because the
# per-element Series.apply + lambda overhead dominates — confirm by profiling.
transform_features_cythonized(df) # Cythonized math functions are only barely faster

2.98 ms ± 31.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
