In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [2]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.precision", 2)
%matplotlib inline
plt.style.use('ggplot')

In [3]:
def load_dataframe_from_yelp_2(query):
    """
    Connects to yelp_2 database on Postgres and
    loads a Pandas dataframe based off sql query.

    The connection URL is read from the YELP_2_DATABASE_URL environment
    variable when it is set; otherwise the original local development URL
    is used so existing setups keep working.

    Args:
        query (string): Sql query to select data from yelp_2.

    Returns:
        Dataframe: Pandas dataframe of records
                    from sql query of yelp_2 database.
    """
    # NOTE(review): the fallback URL still embeds a password. Prefer setting
    # YELP_2_DATABASE_URL so credentials stay out of the notebook.
    connect = os.environ.get(
        'YELP_2_DATABASE_URL',
        'postgresql+psycopg2://postgres:password@localhost:5432/yelp_2',
    )
    engine = create_engine(connect)
    try:
        # read_sql already returns a fresh DataFrame, so no extra copy is needed.
        return pd.read_sql(query, con=engine)
    finally:
        # Release pooled connections even when the query raises.
        engine.dispose()

def counter(x):
    """
    Counts the comma-separated entries in a string.

    Args:
        x (string): Comma-separated values. Missing values (None, NaN or
            any other non-string), blank strings and the literal 'None'
            are treated as containing no entries.

    Returns:
        int: Number of comma-separated entries in x, or 0 when x is missing.
    """
    # Non-strings cover None as well as NaN / pd.NA, which have no .split()
    # and cannot be safely compared with `in`.
    if not isinstance(x, str):
        return 0
    if x.strip() in ('', 'None'):
        return 0
    return len(x.split(','))

In [5]:
# Select every column of every row from the checkin table. The query text is
# handed unchanged to load_dataframe_from_yelp_2, which returns the result as
# the DataFrame `df` used by the following cells.
query = '''
        SELECT *
        FROM checkin
        ;
        '''
df = load_dataframe_from_yelp_2(query)

In [7]:
# Summary statistics for every column, including non-numeric ones.
df.describe(include='all')

In [None]:
# Number of comma-separated checkin timestamps recorded in each row.
df['checkin_count'] = df['date'].map(counter)

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
def _parse_checkin_dates(raw):
    """
    Converts one raw comma-separated checkin string into a DatetimeIndex.

    Args:
        raw (string): Comma-separated timestamps from the `date` column.

    Returns:
        DatetimeIndex: Parsed timestamps. Missing values (non-strings,
            blank strings, or the literal 'None') give an empty
            DatetimeIndex, so the result can always be iterated and compared.
    """
    if not isinstance(raw, str) or raw.strip() in ('', 'None'):
        return pd.DatetimeIndex([])
    return pd.to_datetime(raw.split(','))


# One DatetimeIndex per row, assigned positionally like the raw column.
df['date_list'] = [_parse_checkin_dates(raw) for raw in df['date']]

In [None]:
# The raw comma-separated string is superseded by date_list; drop it and
# preview the resulting frame.
df = df.drop(columns='date')
df.head()

In [None]:
# Month abbreviations in calendar order; position + 1 is the month number.
month_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Parallel lists: date_column_list[i] names the column whose cutoff is
# date_comparison_list[i], the first day of that month, for every month
# from Jan 2010 through Dec 2020.
date_column_list = []
date_comparison_list = []
for year in range(2010, 2021):
    for month_number, month in enumerate(month_list, start=1):
        date_column_list.append(f'checkins_before_{month}_{year}')
        # Build the cutoff from numeric components rather than parsing a
        # free-form string, and avoid binding a temporary called `datetime`,
        # which would shadow the standard-library module name.
        cutoff = pd.Timestamp(year=year, month=month_number, day=1)
        date_comparison_list.append(cutoff)

In [None]:
# For every cutoff month, count each row's checkins that happened strictly
# before the cutoff and express that count as a fraction of the row's total.
# The columns are collected first and attached with a single concat, because
# inserting 264 columns one at a time fragments the DataFrame and triggers
# pandas' PerformanceWarning.
new_columns = {}
for column_name, cutoff in zip(date_column_list, date_comparison_list):
    # Each date_list entry is expected to be a DatetimeIndex, so the
    # comparison is vectorized; NaT entries compare False and are not counted.
    counts_before = pd.Series(
        [(dates < cutoff).sum() for dates in df['date_list']],
        index=df.index,
        dtype='int64',
    )
    new_columns[column_name] = counts_before
    # Rows with checkin_count == 0 give 0 / 0, i.e. NaN.
    new_columns[f'percent_of_{column_name}'] = counts_before / df['checkin_count']

derived_columns = pd.DataFrame(new_columns, index=df.index)
# Drop any same-named columns left over from a previous run so that
# re-executing this cell replaces them instead of duplicating them.
df = pd.concat(
    [df.drop(columns=derived_columns.columns, errors='ignore'), derived_columns],
    axis=1,
)