In [None]:

%logstop
%logstart -t -r -q ipython_command_log.py global

#- IRONHACKS RESEARCH TRACKING CODE
#----------------------------------
# The following code is used to help our research team understand how you use
# our notebook environment. We do not collect any personal information with
# the following code, it is used to measure when and how often you work on
# your submission files.

import os
from datetime import datetime
import IPython.core.history as history

# Fetch the most recent entry from the IPython history; its first two fields
# are used as the session id and the command id.
ha = history.HistoryAccessor()
ha_tail = ha.get_tail(1)
ha_cmd = next(ha_tail)
session_id = str(ha_cmd[0])
command_id = str(ha_cmd[1])
timestamp = datetime.utcnow().isoformat()
history_line = ','.join([session_id, command_id, timestamp]) + '\n'
# Append one "session,command,timestamp" row. The context manager closes the
# file even if the write raises, so the handle cannot leak.
with open(os.environ['HOME']+'/ipython_session_log.csv', 'a') as logfile:
    logfile.write(history_line)

In [None]:
#- IMPORT THE LIBRARIES 
#-----------------------

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics

Functions

In [None]:
def query(querys):
    """
    Run every SQL statement against BigQuery and collect the results.

    For each statement a one-line summary (position, row count, column count)
    is printed.

    input: querys - iterable of SQL query strings
    output: tuple of pandas dataframes, one per query, in the same order
    """
    BIGQUERY_PROJECT = 'ironhacks-covid19-data'
    BIGQUERY_KEYPATH = '../home/jovyan/key.json'

    # Expose the key file path through GOOGLE_APPLICATION_CREDENTIALS before
    # the client is created.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = BIGQUERY_KEYPATH
    bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

    frames = []
    for position, statement in enumerate(querys):
        frame = bigquery_client.query(statement).to_dataframe()
        n_rows, n_cols = frame.shape
        print("Query {a} has {b} elements with {c} features".format(a=position, c=n_cols, b=n_rows))
        frames.append(frame)
    return tuple(frames)

In [None]:
def rollout_dataframe(df, var_cols, rolling_epoch, predict=False):
    """
    Convert a time-series formatted dataframe into a flat dataframe that can be
    used to train a tabular model (random forest, SVM, boosting, ...).

    Every point of interest (distinct ``poi_id`` in ``df``) is processed on its
    own. Windows do not overlap: with 44 time steps and ``rolling_epoch=4`` the
    first row holds steps 1-4 with step 5 as the label, the second row holds
    steps 6-9 with step 10 as the label, and so on. A trailing window that has
    no following step to use as a label is dropped.

    With ``predict=True`` exactly one row per POI is produced instead. It holds
    the last ``rolling_epoch`` steps, is prefixed with the ``poi_id`` and has
    no label column.

    NOTE(review): the rows of each POI are consumed in the order they appear in
    ``df``; this assumes they are already in chronological order - confirm
    against the caller (sort by the week column first if they are not).

    input:
        df            - dataframe with the columns 'poi_id', 'latitude',
                        'longitude', 'raw_visit_counts' (used as the label)
                        and every column named in var_cols
        var_cols      - names of the variate columns to roll out (one or many)
        rolling_epoch - number of time steps per output row (must be >= 1)
        predict       - build the prediction frame instead of the training one
    returns: pandas dataframe with the columns
        training:   latitude, longitude, <var><1..rolling_epoch>..., label
        prediction: poi_id, latitude, longitude, <var><1..rolling_epoch>...
        Rows follow the first-appearance order of poi_id in df.
    raises: ValueError if rolling_epoch < 1, or if predict=True and a POI has
        fewer than rolling_epoch rows.
    """
    if rolling_epoch < 1:
        raise ValueError("rolling_epoch must be a positive integer")

    var_cols = list(var_cols)
    feature_cols = [name + str(step)
                    for name in var_cols
                    for step in range(1, rolling_epoch + 1)]
    roll_cols = ['latitude', 'longitude'] + feature_cols
    if predict:
        roll_cols = ['poi_id'] + roll_cols
    else:
        roll_cols = roll_cols + ['label']

    # Collect plain row lists and build the frame once at the end; growing a
    # dataframe row by row is quadratic and leaves every column as object dtype.
    records = []
    for poi_id, tmp in df.groupby('poi_id', sort=False):
        n_steps = len(tmp)
        latitude = float(tmp['latitude'].iloc[0])
        longitude = float(tmp['longitude'].iloc[0])

        if predict:
            # A negative start index would silently wrap around and build a
            # window out of the wrong rows, so fail loudly instead.
            if n_steps < rolling_epoch:
                raise ValueError(
                    "poi_id {a} has only {b} time steps, need at least {c}".format(
                        a=poi_id, b=n_steps, c=rolling_epoch))
            row = [poi_id, latitude, longitude]
            for var in var_cols:
                row.extend(tmp[var].iloc[n_steps - rolling_epoch:].tolist())
            records.append(row)
        else:
            # Step by rolling_epoch + 1 so the label step is never reused as a
            # feature; stop while a label step still exists after the window.
            for start in range(0, n_steps - rolling_epoch, rolling_epoch + 1):
                stop = start + rolling_epoch
                row = [latitude, longitude]
                for var in var_cols:
                    row.extend(tmp[var].iloc[start:stop].tolist())
                row.append(tmp['raw_visit_counts'].iloc[stop])
                records.append(row)

    return pd.DataFrame(records, columns=roll_cols)

Getting data

In [None]:
"""
There have to be two queries here because I found that the social distancing table has lots and lots of duplicate rows (tens of millions). 
This makes any query where I try to join in SQL too large, so it fails. Therefore, I removed the duplicates and merged the tables with pandas.
"""
query1 = """
         SELECT *
         FROM `ironhacks_covid19_competition`.`weekly_patterns`
         """
query2 = """
         SELECT *
         FROM `ironhacks_covid19_competition`.`cbg_social_distancing`
         WHERE cbg in (SELECT DISTINCT(poi_cbg) FROM `ironhacks_covid19_competition`.`weekly_patterns`) 
         """
df_weekly, df_social_dist = query([query1, query2])

# Keep only the first copy of every repeated social distancing row.
df_social_dist = df_social_dist.drop_duplicates(keep="first")

# Give the cbg id column the same name in both tables so they can be joined.
df_weekly = df_weekly.rename(columns={'poi_cbg': 'cbg'})

# Inner join on cbg and week_number to get the final per-week merged table.
df = df_weekly.merge(df_social_dist, on=["cbg", "week_number"], how="inner")

# Quick look at the merged table to make sure it was built correctly.
print("Columns:")
print('\n'.join(df.columns))
print("\nResults:")
print(df.head())

In [None]:
# get a list of all the distinct poi_ids present in the merged table
# (the merged table is `df`; no `df_rf` is ever defined in this notebook)
poi_ids = list(set(df["poi_id"].tolist()))

# check that there are no missing values: number of NaNs per column
print(df.isna().sum())

# a quick investigation to see if every poi has the same number of weeks present
print(df.groupby(by='poi_id').agg('count')['week_number'])

In [None]:
# roll out the dataframe so it can be use by things like randomforest, svm adaboost, gradient boost
# Columns whose per-week values are rolled out into one feature per time step.
variation_cols = [
    'raw_visit_counts',
    'visits_concentration',
    'distance_from_home',
    'median_dwell',
    'device_count_week',
    'completely_home_device_count_per_week',
    'median_home_dwell_time_per_week',
    'median_non_home_dwell_time_per_week',
]
# Number of consecutive time steps packed into each training row.
rolling_epoch = 4

df_rolling = rollout_dataframe(df, variation_cols, rolling_epoch)
num_rows, num_cols = df_rolling.shape
print("Dataframe has {b} elements with {c} features".format(c=num_cols, b=num_rows))
print(df_rolling.head())

In [None]:
# Report the size of the rolled-out frame and preview its first rows.
num_rows, num_cols = df_rolling.shape
print("Dataframe has {b} elements with {c} features".format(c=num_cols, b=num_rows))
print(df_rolling.head())

Building model and hyper parameter search

In [None]:
# do some hyperparameter searching over the number of trees
parameters = {'n_estimators': [100, 150, 200]}

# every column except the last is a feature; the last column is the label
X, y = df_rolling.iloc[:, :-1], df_rolling.iloc[:, -1]
# Use the class imported at the top of the notebook: the bare name `sklearn`
# is never bound, so `sklearn.ensemble...` raises NameError. The forest is
# seeded so the search is reproducible, like the seeded ShuffleSplit below.
rfr = RandomForestRegressor(random_state=0)
rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
clf = GridSearchCV(rfr, param_grid=parameters, cv=rs)
clf.fit(X, y)
clf.cv_results_

Making predictions

In [None]:
# assemble the feature rows used to predict week 44 (no label column)
predict_df = rollout_dataframe(
    df=df,
    var_cols=variation_cols,
    rolling_epoch=rolling_epoch,
    predict=True,
)

In [None]:
# first column holds the poi_ids, the remaining columns are the model features
labels = predict_df.iloc[:, 0]
pred_X = predict_df.iloc[:, 1:]
# predictions are not whole numbers, so round them
preds = clf.predict(pred_X).round()

Writing results

In [None]:
# write to csv file for submission: a header line, then one "poi_id,count" row
# per prediction. The with-block closes the file, so no explicit close is needed.
with open("results_rf.csv", "w") as f:
    f.write("poi_id,raw_visit_counts\n")
    for poi_id, count in zip(labels, preds):
        f.write("{a},{b}\n".format(a=poi_id, b=int(count)))