In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [2]:
# pandas display: up to 100 columns, 100-character wide output, 2 decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option('display.width', 100)
pd.set_option("display.precision", 2)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling: fivethirtyeight theme with a larger default font.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 16, 'font.family': 'sans'})

In [3]:
def load_dataframe_from_yelp_2(query):
    """
    Connects to yelp_2 database on Postgres and
    loads a Pandas dataframe based off sql query.

    The connection URL is read from the YELP_2_DATABASE_URL
    environment variable so real credentials can stay out of the
    notebook. When the variable is unset, the original local
    development URL is used.

    Args:
        query (string): Sql query to select data from yelp_2.

    Returns:
        Dataframe: Pandas dataframe of records
                    from sql query of yelp_2 database.
    """
    import os

    default_connect = 'postgresql+psycopg2://postgres:password@localhost:5432/yelp_2'
    connect = os.environ.get('YELP_2_DATABASE_URL', default_connect)
    engine = create_engine(connect)
    try:
        # read_sql already returns a fresh frame, so no extra copy is needed.
        return pd.read_sql(query, con=engine)
    finally:
        # Release pooled connections; otherwise each call leaves an engine behind.
        engine.dispose()

def counter(x):
    """
    Counts the comma separated entries in a string.

    Args:
        x (string): Comma separated values, or 'None'/None/''
                    when there is nothing to count.

    Returns:
        int: Number of comma separated entries, 0 for the
                    'None'/None/'' placeholders.
    """
    if x in ('None', None, ''):
        return 0
    entries = x.split(',')
    return len(entries)

In [4]:
# Pull a 10,000 row sample of the all_features table; the LIMIT keeps the load small.
query = '''
        SELECT *
        FROM all_features
        LIMIT 10000
        ;
        '''
df = load_dataframe_from_yelp_2(query)

In [5]:
# Timestamp treated as the dataset release date; the time-discount helpers measure elapsed days up to it.
dataset_release_date = pd.to_datetime('2020-3-25 19:13:01')

In [6]:
# review_date is parsed as epoch milliseconds (unit='ms').
df['review_date'] = pd.to_datetime(df['review_date'], unit='ms')
# user_yelping_since is parsed with pandas' default format inference.
df['user_yelping_since'] = pd.to_datetime(df['user_yelping_since'])

# Targets
Each target is derived from the evenly weighted sum of a review's useful, funny, and cool vote counts.

In [7]:
# No discounting or scaling
# T1: row-wise total of the three review vote counts.
df['T1_REG_ufc_total'] = df[['review_useful', 'review_funny', 'review_cool']].sum(axis=1)
# T2: True when the review has at least one vote.
df['T2_CLS_ufc_votes_or_not'] = df['T1_REG_ufc_total'] > 0
def usefulness_level(x):
    """
    Buckets a vote count into a usefulness label.

    Args:
        x (number): Vote count (raw or per year).

    Returns:
        string: 'zero' when x == 0, 'low' below 10, 'medium' below 100,
                    'high' from 100 up, and 'unknown' when no comparison
                    holds (e.g. NaN).
    """
    if x == 0:
        return 'zero'
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((10, 'low'), (100, 'medium')):
        if x < upper_bound:
            return label
    # NaN fails every comparison, so it falls through to 'unknown'.
    return 'high' if x >= 100 else 'unknown'
# T3: bucket the raw vote total into zero/low/medium/high with usefulness_level.
df['T3_CLS_ufc_level'] = df['T1_REG_ufc_total'].apply(usefulness_level)
# Time discounted
def target_time_discount(ufc_total, review_date):
    """
    Converts a review's vote total into votes per year.

    Args:
        ufc_total (number): Total useful/funny/cool votes on the review.
        review_date (Timestamp): Date the review was written.

    Returns:
        number: ufc_total divided by the whole days between review_date
                    and the module-level dataset_release_date,
                    multiplied by 365.
    """
    days_since_review = (dataset_release_date - review_date).days
    votes_per_day = ufc_total / days_since_review
    return votes_per_day * 365

# T4: votes per year for each review; T5 buckets T4 with usefulness_level.
df['T4_REG_ufc_per_year'] = df.apply(
    lambda row: target_time_discount(row.T1_REG_ufc_total, row.review_date),
    axis=1,
)
df['T5_CLS_ufc_per_year_level'] = df['T4_REG_ufc_per_year'].apply(usefulness_level)
# Time and Business Popularity Discounted
df['T6_REG_ufc_per_year_bus_disc'] = df['T4_REG_ufc_per_year'] / df['business_review_count']

# The raw vote columns are now folded into the targets, so remove them from the frame.
df.drop(columns=['review_useful', 'review_funny', 'review_cool'], inplace=True)

# Features 

## Nan/Null Values
Only an extremely small percentage of values are NaN/null,
    so every row containing a NaN/null is dropped.
Other NaN/nulls were already avoided during the SQL joins and eda_prep.
Because of the large number of records available
    and the relative completeness of the data,
    I am being liberal with dropping incomplete records.

In [8]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

## Combining Columns
Columns that represent similar data
    and have little value individually
    are summed together.

In [9]:
# Collapse the per-type compliment counters into a single total per user.
compliment_columns = [
    'user_compliment_hot',
    'user_compliment_more',
    'user_compliment_profile',
    'user_compliment_cute',
    'user_compliment_list',
    'user_compliment_note',
    'user_compliment_plain',
    'user_compliment_cool',
    'user_compliment_funny',
    'user_compliment_writer',
    'user_compliment_photos',
]
df['user_compliments'] = df[compliment_columns].sum(axis=1)
df.drop(columns=compliment_columns, inplace=True)

# Same treatment for the votes a user has received across all of their reviews.
user_vote_columns = ['user_useful', 'user_funny', 'user_cool']
df['user_total_ufc'] = df[user_vote_columns].sum(axis=1)
df.drop(columns=user_vote_columns, inplace=True)

## User Elite

In [10]:
def count_elite(x):
    """
    Counts the elite years listed for a user.

    Args:
        x (string): Comma separated elite years, or 'None'/None/''
                    when the user has no elite years.

    Returns:
        int: Number of comma separated entries, 0 for the
                    'None'/None/'' placeholders.
    """
    never_elite = x in ('None', None, '')
    return 0 if never_elite else len(x.split(','))

# Apply count_elite to every user_elite value to build the user_elite_count column.
df['user_elite_count'] = df.user_elite.apply(count_elite)

def years_since_most_recent_elite(x):
    """
    Years between 2020 and a user's most recent elite year.

    Args:
        x (string): Comma separated elite years, or 'None'/None/''
                    when the user has no elite years.

    Returns:
        int: 2020 minus the latest elite year. With no elite years
                    the latest year is taken as 0, so 2020 is returned.
    """
    if x in ('None', None, ''):
        latest_elite_year = 0
    else:
        latest_elite_year = max(pd.to_numeric(x.split(',')))
    return 2020 - latest_elite_year

# Apply years_since_most_recent_elite to every user_elite value; users with no elite years get 2020.
df['user_elite_most_recent'] = df.user_elite.apply(years_since_most_recent_elite)

## Time Discounting

### User Elite

In [11]:
def count_elite_td(user_elite, review_date):
    """
    Counts the elite years a user had before a review was written.

    Args:
        user_elite (string): Comma separated elite years, or
                    'None'/None/'' when the user has no elite years.
        review_date (Timestamp): Date the review was written.

    Returns:
        int: Number of elite years whose parsed date is strictly
                    earlier than review_date.
    """
    if user_elite in ('None', None, ''):
        return 0
    # Each year string is parsed as a date and compared with the review date.
    return sum(
        1 for elite_year in user_elite.split(',')
        if pd.to_datetime(elite_year) < review_date
    )

# Row-wise: count only the elite years that precede each review's date.
df['user_elite_count_td'] = df.apply(lambda x: count_elite_td(x.user_elite, x.review_date), axis=1)

def years_since_most_recent_elite_td(user_elite, review_date):
    """
    Years between a review and the reviewer's most recent elite year
    that precedes the review.

    Args:
        user_elite (string): Comma separated elite years, or
                    'None'/None/'' when the user has no elite years.
        review_date (Timestamp): Date the review was written.

    Returns:
        int: review_date.year minus the latest elite year whose parsed
                    date is strictly earlier than review_date. When no
                    elite year qualifies the latest year is taken as 0,
                    so review_date.year is returned.
    """
    if user_elite in ['None', None, '']:
        return review_date.year
    # Parse each year from its string form, the same way count_elite_td does.
    # Converting to integers first makes pd.to_datetime read e.g. 2015 as
    # nanoseconds after 1970, so every elite year (even ones after the
    # review) would pass the "before the review" filter.
    elites_pre_review = [
        int(elite) for elite in user_elite.split(',')
        if pd.to_datetime(elite) < review_date
    ]
    most_recent = max(elites_pre_review, default=0)
    return review_date.year - most_recent
# Row-wise: years between each review and the reviewer's most recent elite year, via years_since_most_recent_elite_td.
df['user_elite_most_recent_td'] = df.apply(lambda x: years_since_most_recent_elite_td(x.user_elite, x.review_date), axis=1)

### All Other User Features

In [12]:
user_features_needing_time_discounting = [
    'user_total_ufc',
    'user_compliments',
    'user_review_count',
    'user_fans',
    'user_friend_count',
]


def user_time_discount(count_feature, user_yelping_since, review_date):
    """
    Scales a user's count feature to the share of their account
    lifetime that had elapsed when the review was written.

    Args:
        count_feature (number): User count as of dataset_release_date.
        user_yelping_since (Timestamp): Date the user joined.
        review_date (Timestamp): Date the review was written.

    Returns:
        number: count_feature divided by the whole days between
                    user_yelping_since and the module-level
                    dataset_release_date, multiplied by the whole days
                    between user_yelping_since and review_date.
    """
    account_age_days = (dataset_release_date - user_yelping_since).days
    count_per_day = count_feature / account_age_days
    return count_per_day * ((review_date - user_yelping_since).days)


# Add a `<feature>_td` column for every user feature listed above.
for feature in user_features_needing_time_discounting:
    df[f'{feature}_td'] = df.apply(
        lambda row: user_time_discount(row[feature], row.user_yelping_since, row.review_date),
        axis=1,
    )

### Business Checkin and Review Count

In [13]:
business_features_needing_time_discounting = [
    'business_review_count',
    'business_checkin_count',
]


def business_time_discount(count_feature, oldest_checkin, review_date):
    """
    Scales a business count feature to the share of its observed
    lifetime that had elapsed when the review was written.

    Args:
        count_feature (number): Business count as of dataset_release_date.
        oldest_checkin (Timestamp): Oldest checkin recorded for the
                    business; presumably already a datetime when loaded
                    from SQL (TODO confirm).
        review_date (Timestamp): Date the review was written.

    Returns:
        number: count_feature divided by the whole days between
                    oldest_checkin and the module-level
                    dataset_release_date, multiplied by the whole days
                    between oldest_checkin and review_date.
    """
    observed_days = (dataset_release_date - oldest_checkin).days
    count_per_day = count_feature / observed_days
    return count_per_day * ((review_date - oldest_checkin).days)


# Add a `<feature>_td` column for every business feature listed above.
for feature in business_features_needing_time_discounting:
    df[f'{feature}_td'] = df.apply(
        lambda row: business_time_discount(row[feature], row.business_oldest_checkin, row.review_date),
        axis=1,
    )

In [16]:
# Show the columns of the frame after all targets and features have been added.
df.columns

Index(['review_id', 'user_id', 'business_id', 'review_stars', 'review_date', 'business_latitude',
       'business_longitude', 'business_avg_stars', 'business_review_count', 'business_is_open',
       'business_categories', 'business_checkin_count', 'business_oldest_checkin',
       'business_newest_checkin', 'user_avg_stars', 'user_review_count', 'user_yelping_since',
       'user_fans', 'user_friend_count', 'user_elite', 'T1_REG_ufc_total',
       'T2_CLS_ufc_votes_or_not', 'T3_CLS_ufc_level', 'T4_REG_ufc_per_year',
       'T5_CLS_ufc_per_year_level', 'T6_REG_ufc_per_year_bus_disc', 'user_compliments',
       'user_total_ufc', 'user_elite_count', 'user_elite_most_recent', 'user_elite_count_td',
       'user_elite_most_recent_td', 'user_total_ufc_td', 'user_compliments_td',
       'user_review_count_td', 'user_fans_td', 'user_friend_count_td', 'business_review_count_td',
       'business_checkin_count_td'],
      dtype='object')