In [1]:
import pandas as pd
import math

import pandas.io.sql as pd_sql
import psycopg2 as pg
from sqlalchemy import create_engine

# psycopg2 connection parameters
params = {'user': 'postgres', 'host': 'localhost', 'port': 5432, 'password': None}
connection = pg.connect(**params, dbname='mountain_project') # Connect
cursor = connection.cursor()

# sqlalchemy connection arguments
# The URL is built from `params` so both connections point at the same server.
# The scheme is 'postgresql://' because SQLAlchemy 1.4+ no longer accepts the
# 'postgres://' alias. No password segment is emitted: interpolating None into
# the URL would send the literal string 'None' as the password.
connection_str = f"postgresql://{params['user']}@{params['host']}:{params['port']}/mountain_project"
engine = create_engine(connection_str)

In [2]:
# For ordinary-least-squares linear regression
def add_log_values(df):
    """Add base-10 log-transformed versions of the feature and target columns.

    The new columns are written onto ``df`` itself (the input frame is modified
    in place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``star_ratings``, ``ticks``, ``avg_stars``,
        ``length_``, ``grade`` and ``on_to_do_lists``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``log_star_ratings``, ``log_ticks``, ``log_avg_stars``,
        ``log_length``, ``log_grade`` and ``log_on_to_do_lists`` (the target)
        added, in that order.

    Raises
    ------
    ValueError
        If any value plus its offset is not strictly positive.
    """
    # (source column, new column, offset added before taking the log).
    # The +1 offset maps a value of 0 to log10(1) == 0. Grade uses +2,
    # presumably because grades can be as low as -1 -- TODO confirm.
    log_specs = (
        ('star_ratings', 'log_star_ratings', 1),
        ('ticks', 'log_ticks', 1),
        ('avg_stars', 'log_avg_stars', 1),
        ('length_', 'log_length', 1),
        ('grade', 'log_grade', 2),
        ('on_to_do_lists', 'log_on_to_do_lists', 1),  # Target
    )
    for source_column, new_column, offset in log_specs:
        # math.log10 is more accurate than math.log(x, 10), which computes
        # ln(x) / ln(10) and e.g. yields 2.9999999999999996 for x == 1000.
        df[new_column] = df[source_column].apply(
            lambda value, offset=offset: math.log10(value + offset)
        )
    return df

# For Poisson regression
def add_sqrt_values(df):
    """Add square-root-transformed versions of the feature columns.

    The new columns are written onto ``df`` itself (the input frame is modified
    in place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``star_ratings``, ``ticks``, ``avg_stars``,
        ``length_`` and ``grade``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``sqrt_star_ratings``, ``sqrt_ticks``, ``sqrt_avg_stars``,
        ``sqrt_length`` and ``sqrt_grade`` added, in that order.
    """
    # New column -> source column, for the features rooted without an offset.
    plain_roots = {
        'sqrt_star_ratings': 'star_ratings',
        'sqrt_ticks': 'ticks',
        'sqrt_avg_stars': 'avg_stars',
        'sqrt_length': 'length_',
    }
    for new_column, source_column in plain_roots.items():
        df[new_column] = df[source_column].apply(math.sqrt)

    # Grade is shifted by one before the root is taken.
    df['sqrt_grade'] = df['grade'].apply(lambda grade: math.sqrt(grade + 1))
    return df

In [3]:
# Load the buttermilks rows joined with their scraped counts, then add the
# log and sqrt transform columns.
query = """
SELECT f.id_, f.avg_stars, f.length_, f.grade, s.star_ratings, s.suggested_ratings, s.on_to_do_lists, s.ticks
FROM buttermilks f, buttermilks_scrape s
WHERE f.id_=s.id_
"""
buttermilks_df = (
    pd_sql.read_sql(query, connection)  # grab data as a dataframe
    .sort_values('id_')
    .reset_index(drop=True)  # enforce index
)
# add_log_values / add_sqrt_values add columns to the frame they receive and
# return that same frame, so the three names refer to one DataFrame object.
buttermilks_log_df = add_log_values(buttermilks_df)
buttermilks_log_sqrt_df = add_sqrt_values(buttermilks_log_df)
# Export to the 'buttermilks_transform' table is currently disabled:
#buttermilks_log_sqrt_df[['id_', 'log_star_ratings', 'log_ticks', 'log_avg_stars', 'log_length', 'log_grade', 'log_on_to_do_lists',
#                       'sqrt_star_ratings', 'sqrt_ticks', 'sqrt_avg_stars', 'sqrt_length', 'sqrt_grade']].to_sql('buttermilks_transform', engine, index=False)

In [69]:
# Load the druid_stones rows joined with their scraped counts, add the log and
# sqrt transform columns, and export the transformed columns to Postgres.
query = """
SELECT f.id_, f.avg_stars, f.length_, f.grade, s.star_ratings, s.suggested_ratings, s.on_to_do_lists, s.ticks
FROM druid_stones f, druid_stones_scrape s
WHERE f.id_=s.id_
"""
druid_stones_df = (
    pd_sql.read_sql(query, connection)  # grab data as a dataframe
    .sort_values('id_')
    .reset_index(drop=True)  # enforce index
)
# add_log_values / add_sqrt_values add columns to the frame they receive and
# return that same frame, so the three names refer to one DataFrame object.
druid_stones_log_df = add_log_values(druid_stones_df)
druid_stones_log_sqrt_df = add_sqrt_values(druid_stones_log_df)
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# to_sql raises ValueError once the table exists from an earlier run.
druid_stones_log_sqrt_df[[
    'id_', 'log_star_ratings', 'log_ticks', 'log_avg_stars', 'log_length', 'log_grade', 'log_on_to_do_lists',
    'sqrt_star_ratings', 'sqrt_ticks', 'sqrt_avg_stars', 'sqrt_length', 'sqrt_grade',
]].to_sql('druid_stones_transform', engine, index=False, if_exists='replace')

In [71]:
# Load the happy_boulders rows joined with their scraped counts, add the log
# and sqrt transform columns, and export the transformed columns to Postgres.
query = """
SELECT f.id_, f.avg_stars, f.length_, f.grade, s.star_ratings, s.suggested_ratings, s.on_to_do_lists, s.ticks
FROM happy_boulders f, happy_boulders_scrape s
WHERE f.id_=s.id_
"""
happy_boulders_df = (
    pd_sql.read_sql(query, connection)  # grab data as a dataframe
    .sort_values('id_')
    .reset_index(drop=True)  # enforce index
)
# add_log_values / add_sqrt_values add columns to the frame they receive and
# return that same frame, so the three names refer to one DataFrame object.
happy_boulders_log_df = add_log_values(happy_boulders_df)
happy_boulders_log_sqrt_df = add_sqrt_values(happy_boulders_log_df)
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# to_sql raises ValueError once the table exists from an earlier run.
happy_boulders_log_sqrt_df[[
    'id_', 'log_star_ratings', 'log_ticks', 'log_avg_stars', 'log_length', 'log_grade', 'log_on_to_do_lists',
    'sqrt_star_ratings', 'sqrt_ticks', 'sqrt_avg_stars', 'sqrt_length', 'sqrt_grade',
]].to_sql('happy_boulders_transform', engine, index=False, if_exists='replace')

In [72]:
# Load the sad_boulders rows joined with their scraped counts, add the log and
# sqrt transform columns, and export the transformed columns to Postgres.
query = """
SELECT f.id_, f.avg_stars, f.length_, f.grade, s.star_ratings, s.suggested_ratings, s.on_to_do_lists, s.ticks
FROM sad_boulders f, sad_boulders_scrape s
WHERE f.id_=s.id_
"""
sad_boulders_df = (
    pd_sql.read_sql(query, connection)  # grab data as a dataframe
    .sort_values('id_')
    .reset_index(drop=True)  # enforce index
)
# add_log_values / add_sqrt_values add columns to the frame they receive and
# return that same frame, so the three names refer to one DataFrame object.
sad_boulders_log_df = add_log_values(sad_boulders_df)
sad_boulders_log_sqrt_df = add_sqrt_values(sad_boulders_log_df)
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# to_sql raises ValueError once the table exists from an earlier run.
sad_boulders_log_sqrt_df[[
    'id_', 'log_star_ratings', 'log_ticks', 'log_avg_stars', 'log_length', 'log_grade', 'log_on_to_do_lists',
    'sqrt_star_ratings', 'sqrt_ticks', 'sqrt_avg_stars', 'sqrt_length', 'sqrt_grade',
]].to_sql('sad_boulders_transform', engine, index=False, if_exists='replace')

In [8]:
# Load the joshua_tree rows joined with their scraped counts, then add the
# log and sqrt transform columns.
query = """
SELECT f.id_, f.avg_stars, f.length_, f.grade, s.star_ratings, s.suggested_ratings, s.on_to_do_lists, s.ticks
FROM joshua_tree f, joshua_tree_scrape s
WHERE f.id_=s.id_
"""
joshua_tree_df = (
    pd_sql.read_sql(query, connection)  # grab data as a dataframe
    .sort_values('id_')
    .reset_index(drop=True)  # enforce index
)
# add_log_values / add_sqrt_values add columns to the frame they receive and
# return that same frame, so the three names refer to one DataFrame object.
joshua_tree_log_df = add_log_values(joshua_tree_df)
joshua_tree_log_sqrt_df = add_sqrt_values(joshua_tree_log_df)
# Export to the 'joshua_tree_transform' table is currently disabled:
#joshua_tree_log_sqrt_df[['id_', 'log_star_ratings', 'log_ticks', 'log_avg_stars', 'log_length', 'log_grade', 'log_on_to_do_lists',
#                       'sqrt_star_ratings', 'sqrt_ticks', 'sqrt_avg_stars', 'sqrt_length', 'sqrt_grade']].to_sql('joshua_tree_transform', engine, index=False)