In [1]:
import pandas as pd
import pandas.io.sql as pd_sql
import psycopg2 as pg
from sqlalchemy import create_engine
from math import log, sqrt

# psycopg2 connection parameters (password is None, i.e. no password is configured)
DB_NAME = 'mountain_project'
params = {'user': 'postgres', 'host': 'localhost', 'port': 5432, 'password': None}
connection = pg.connect(**params, dbname=DB_NAME) # Connect

# sqlalchemy connection arguments, derived from the same params so both handles agree.
# The scheme must be 'postgresql': SQLAlchemy 1.4+ no longer accepts the 'postgres' alias.
# The ':password' part is left out when no password is set; formatting None into the
# URL would submit the literal string 'None' as the password.
# NOTE: a password containing URL-reserved characters must be percent-encoded first.
credentials = params['user'] if params['password'] is None else f"{params['user']}:{params['password']}"
connection_str = f"postgresql://{credentials}@{params['host']}:{params['port']}/{DB_NAME}"
engine = create_engine(connection_str)

In [2]:
%load_ext Cython

In [3]:
%%cython
from libc.math cimport log10, sqrt # Importing cython math functions

# Arguments are typed `double`, not `float`: a Cython `float` is a C single-precision
# value, so a Python float would be truncated before the math routine ever sees it.

def clog(double x):
    """Return the base-10 logarithm of ``x`` using the C math library.

    Follows C semantics rather than ``math.log``: no exception is raised for
    non-positive input (the C routine returns -inf or nan instead).
    """
    return log10(x)

def csqrt(double x):
    """Return the square root of ``x`` using the C math library.

    Follows C semantics: a negative argument yields nan instead of raising.
    """
    return sqrt(x)

In [4]:
# Python Math Functions
def add_log_values(df): # For Linear Regression
    """Add base-10 log-transformed versions of the feature and target columns.

    Every source value has an offset added before the logarithm is taken:
    2 for ``grade`` and 1 for every other column.

    Args:
        df: DataFrame with ``star_ratings``, ``ticks``, ``avg_stars``,
            ``length_``, ``grade`` and ``on_to_do_lists`` columns.

    Returns:
        The same ``df`` object; the ``log_*`` columns are written into it
        in place, not into a copy.
    """
    # (new column, source column, offset added before taking the log)
    transforms = (
        ('log_star_ratings', 'star_ratings', 1),
        ('log_ticks', 'ticks', 1),
        ('log_avg_stars', 'avg_stars', 1),
        ('log_length', 'length_', 1),
        ('log_grade', 'grade', 2),  # presumably grade can be negative - confirm
        ('log_on_to_do_lists', 'on_to_do_lists', 1), # Target
    )
    for target, source, offset in transforms:
        # Attribute-style lookup, matching how the columns have always been read here.
        column = getattr(df, source)
        df[target] = column.apply(lambda value, offset=offset: log(value + offset, 10))
    return df

def add_sqrt_values(df): # For Poisson Regression
    """Add square-root-transformed versions of the feature columns.

    ``grade`` is shifted up by 1 before the root is taken; the other columns
    are used as they are.

    Args:
        df: DataFrame with ``star_ratings``, ``ticks``, ``avg_stars``,
            ``length_`` and ``grade`` columns.

    Returns:
        The same ``df`` object; the ``sqrt_*`` columns are written into it
        in place, not into a copy.
    """
    unshifted = (
        ('star_ratings', 'sqrt_star_ratings'),
        ('ticks', 'sqrt_ticks'),
        ('avg_stars', 'sqrt_avg_stars'),
        ('length_', 'sqrt_length'),
    )
    for source, target in unshifted:
        df[target] = getattr(df, source).apply(lambda value: sqrt(value))

    # grade is the only column that is offset before the root is taken
    df['sqrt_grade'] = df.grade.apply(lambda value: sqrt(value + 1))
    return df

In [5]:
# Cythonized functions
def c_add_log_values(df): # For Linear Regression
    """Add base-10 log-transformed columns, computed with the Cython ``clog`` helper.

    Produces the same set of ``log_*`` columns as ``add_log_values``: an offset
    of 2 is added to ``grade`` and 1 to every other column before ``clog`` is
    applied.

    Args:
        df: DataFrame with ``star_ratings``, ``ticks``, ``avg_stars``,
            ``length_``, ``grade`` and ``on_to_do_lists`` columns.

    Returns:
        The same ``df`` object; the ``log_*`` columns are written into it
        in place, not into a copy.
    """
    # source column -> (new column, offset added before taking the log)
    plan = {
        'star_ratings': ('log_star_ratings', 1),
        'ticks': ('log_ticks', 1),
        'avg_stars': ('log_avg_stars', 1),
        'length_': ('log_length', 1),
        'grade': ('log_grade', 2),
        'on_to_do_lists': ('log_on_to_do_lists', 1), # Target
    }
    for source, (target, offset) in plan.items():
        series = getattr(df, source)
        df[target] = series.apply(lambda value, offset=offset: clog(value + offset))
    return df

def c_add_sqrt_values(df): # For Poisson Regression
    """Add square-root-transformed columns, computed with the Cython ``csqrt`` helper.

    Produces the same set of ``sqrt_*`` columns as ``add_sqrt_values``:
    ``grade`` is shifted up by 1 before the root is taken, the other columns
    are used as they are.

    Args:
        df: DataFrame with ``star_ratings``, ``ticks``, ``avg_stars``,
            ``length_`` and ``grade`` columns.

    Returns:
        The same ``df`` object; the ``sqrt_*`` columns are written into it
        in place, not into a copy.
    """
    for source in ('star_ratings', 'ticks', 'avg_stars'):
        df['sqrt_' + source] = getattr(df, source).apply(lambda value: csqrt(value))

    # length_ loses its trailing underscore in the derived column name
    df['sqrt_length'] = df.length_.apply(lambda value: csqrt(value))

    # grade is the only column that is offset before the root is taken
    df['sqrt_grade'] = df.grade.apply(lambda value: csqrt(value + 1))
    return df

In [6]:
# Join buttermilks (alias f) with buttermilks_scrape (alias s) on their shared id_.
query = """
SELECT f.id_, f.avg_stars, f.length_, f.grade, s.star_ratings, s.suggested_ratings, s.on_to_do_lists, s.ticks
FROM buttermilks f, buttermilks_scrape s
WHERE f.id_=s.id_
"""
buttermilks_df = (
    pd_sql.read_sql(query, connection) # grab data as a dataframe
    .sort_values('id_') # row order from the database is not relied upon
    .reset_index(drop=True) # enforce index
)

In [7]:
%%timeit
# Baseline timing: log transform using math.log from the standard library.
# NOTE(review): add_log_values writes its columns into buttermilks_df itself, so every
# timed loop after the first overwrites existing log_* columns instead of creating them.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept in the
# notebook namespace - confirm before relying on buttermilks_log_df later.
buttermilks_log_df = add_log_values(buttermilks_df)

8.62 ms ± 211 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
%%timeit
# Same transform through the Cython-compiled clog, for comparison with the baseline.
# NOTE(review): this also writes into buttermilks_df in place, replacing the log_*
# columns left there by any earlier call.
buttermilks_log_df = c_add_log_values(buttermilks_df) # Cythonized math functions are faster (roughly 10% in the recorded run)

7.76 ms ± 65.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
