In [None]:
%%capture
%logstop
%logstart -t -r -q ipython_command_log.py global

#- IRONHACKS RESEARCH TRACKING CODE
#----------------------------------
# The following code is used to help our research team understand how you use
# our notebook environment. We do not collect any personal information with
# the following code; it is used to measure when and how often you work on
# your submission files.

import os
from datetime import datetime
import IPython.core.history as history

# Fetch the most recent entry from IPython's history database; its first two
# fields are used below as the session id and the command id.
ha = history.HistoryAccessor()
ha_tail = ha.get_tail(1)
ha_cmd = next(ha_tail)
session_id = str(ha_cmd[0])
command_id = str(ha_cmd[1])
timestamp = datetime.utcnow().isoformat()
history_line = ','.join([session_id, command_id, timestamp]) + '\n'

# Append one "session,command,timestamp" row to the CSV log under $HOME.
# The context manager closes the handle even if the write raises.
with open(os.environ['HOME']+'/ipython_session_log.csv', 'a') as logfile:
    logfile.write(history_line)

In [None]:
!pip install mpld3
!pip install folium

In [None]:
import numpy as np
import pandas as pd
import warnings
import requests
import random

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from scipy.stats.stats import pearsonr

from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from ipywidgets import *
from ipyleaflet import *
import pyarrow

import folium
from folium import plugins
from folium.plugins import HeatMap, HeatMapWithTime

import matplotlib.pyplot as plt
import seaborn as sns
import mpld3

In [None]:
# Ignore every warning (no category or module restriction is given) from this
# point on; presumably done to keep library warnings out of the cell outputs.
warnings.filterwarnings('ignore')

In [None]:
# BigQuery connection settings: the project passed to the client and the path
# of the service-account key file (relative to the working directory).
BIGQUERY_PROJECT = 'ironhacks-covid19-data'
BIGQUERY_KEYPATH = '../service-account.json'

# Export the key path before the client is constructed.
# NOTE(review): the client presumably reads its credentials from this variable — confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = BIGQUERY_KEYPATH
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# Selected weekly-pattern columns for all rows of the weekly_patterns table.
# The query is only submitted here; its rows are fetched later through
# `query_job.to_dataframe()`.
query = """
SELECT poi_id, poi_cbg, location_name, week_number, date_start, raw_visit_counts, visits_concentration, distance_from_home, median_dwell
FROM ironhacks_covid19_competition.weekly_patterns;
"""

query_job = bigquery_client.query(query)

In [None]:
# All columns of the prediction_list_poi table.
# NOTE(review): presumably the list of POIs that predictions are required for — confirm.
poi_lists_query = """
SELECT *
FROM ironhacks_covid19_competition.prediction_list_poi
"""

query_job_poi = bigquery_client.query(poi_lists_query)

In [None]:
# Fetch the results of both submitted queries into DataFrames.
poi_list = query_job_poi.to_dataframe()
weekly_patterns = query_job.to_dataframe()

Modelling functions

In [None]:
def arima(poi_id):
    """Fit an ARIMA(1, 0, 0) model to one POI's visit counts and forecast one step.

    Reads the module-level ``weekly_patterns`` frame.

    Args:
        poi_id: value matched against the ``poi_id`` column.

    Returns:
        tuple: ``(history, forecast)`` where ``history`` is the POI's rows
        ordered by ``week_number`` and reduced to the ``week_number`` and
        ``raw_visit_counts`` columns, and ``forecast`` is the first value of
        the one-step-ahead forecast.
    """
    poi_rows = weekly_patterns.loc[weekly_patterns['poi_id'] == poi_id]
    history = poi_rows.sort_values(by='week_number')

    # The model is fitted on the raw counts in week order.
    fitted_model = ARIMA(history['raw_visit_counts'].values, order=(1, 0, 0)).fit()
    next_value = fitted_model.forecast(steps=1)[0]

    return history[['week_number', 'raw_visit_counts']], next_value

In [None]:
def convert_fig_to_html(fig):
    """Render a Matplotlib figure as an HTML ``<img>`` tag with an inline PNG.

    The figure is drawn through an Agg canvas, the PNG bytes are
    base64-encoded, and the encoded text is percent-quoted before being placed
    in a ``data:image/png;base64,`` URI.

    Args:
        fig: Matplotlib figure to render.

    Returns:
        str: ``'<img src="data:image/png;base64,<payload>">'``.
    """
    # The previous implementation used Python 2-only APIs (the ``StringIO``
    # module, ``str.encode('base64')`` and ``urllib.quote``) and could not
    # even be imported on Python 3; these are the Python 3 equivalents.
    import base64
    import io
    from urllib.parse import quote
    from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

    canvas = FigureCanvas(fig)
    png_output = io.BytesIO()  # PNG output is binary, so a bytes buffer is needed
    canvas.print_png(png_output)
    # b64encode emits no line breaks, so there is no trailing newline to strip.
    data = base64.b64encode(png_output.getvalue()).decode('ascii')

    return '<img src="data:image/png;base64,{}">'.format(quote(data))

In [None]:
# Distinct POI ids found in `weekly_patterns`, in order of first appearance.
locations = weekly_patterns['poi_id'].unique()

Modelling time series data using an autoregressive model

In [None]:
# Fit one model per POI, collecting each POI's history and its forecast in
# lists that stay index-aligned with `locations`.
data = []
forecasts = []

print("Modelling started")

# Take the total from the data instead of hard-coding it in the progress message.
n_locations = len(locations)

for idx, location in enumerate(locations):
    try:
        d, forecast = arima(location)
    except Exception as e:
        # Stop at the first failure: `data` and `forecasts` are later indexed by
        # position against `locations`, so skipping an entry would misalign them.
        print("Modelling failed for POI {}: {}".format(location, e))
        break

    data.append(d)
    forecasts.append(forecast)

    if idx % 500 == 0:
        print("Finished modelling {}/{} locations".format(idx, n_locations))

In [None]:
# retrieving top 50 busiest POIs from the predicted values

forecasts_ = forecasts  # alias kept so any code referring to `forecasts_` still works
data_50 = []
forecasts_50 = []
locations_50 = []

# Rank positions, not values. Looking a value up again with list.index()
# returns the first match, so POIs with equal forecasts would all resolve to
# the same entry, and each lookup rescans the whole list. The sort is stable,
# so ties keep their original relative order.
top_positions = sorted(
    range(len(forecasts_)),
    key=lambda pos: forecasts_[pos],
    reverse=True,
)[:50]

for pos in top_positions:
    forecasts_50.append(forecasts_[pos])
    data_50.append(data[pos])
    # Display name: first distinct location_name recorded for this POI id.
    poi_rows = weekly_patterns.loc[weekly_patterns['poi_id'] == locations[pos]]
    locations_50.append(poi_rows['location_name'].unique()[0])

In [None]:
# plotting top 50 busiest POIs

# 10 x 5 grid of panels; zip() pairs each panel with one entry of the *_50 lists.
fig, axes = plt.subplots(10,5, figsize=(26,13))

for ax, datum, forecast, location in zip(axes.flatten(), data_50, forecasts_50, locations_50):
    ax.plot(datum['week_number'], datum['raw_visit_counts'], label="Historic data")
    # The forecast is drawn as a single marker at x = 44.
    ax.plot(44, forecast, 'o', label="Forecast for week 44")
    ax.set_title(location)

# Shared axis captions, placed slightly outside the figure (negative coordinates).
fig.text(0.5, -0.04, 'Week number', ha='center', fontsize=20)
fig.text(-0.02, 0.5, 'Visit counts', va='center', rotation='vertical', fontsize=20)
plt.tight_layout()
# plt.legend() acts on pyplot's current axes; bbox_to_anchor moves it outside that panel.
plt.legend(bbox_to_anchor=(2.1, 1.5))

plt.savefig('visit_counts_viz.png', dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# calculating general statistics like average visit concentration and median dwell time

concs, dists, dwells = [], [], []

# Visit the weeks in ascending order. `.unique()` alone yields them in order of
# first appearance in the query result (the query has no ORDER BY), while the
# resulting lists are plotted against an ascending week axis.
for week in sorted(weekly_patterns['week_number'].dropna().unique()):
    temp = weekly_patterns[weekly_patterns['week_number'] == week]
    concs.append(temp['visits_concentration'].mean())
    dists.append(temp['distance_from_home'].mean())
    dwells.append(temp['median_dwell'].mean())

In [None]:
# plotting general statistics

# Take the x axis from the data instead of a hard-coded range(11, 42), so the
# cell keeps working when the set of weeks in `weekly_patterns` changes.
# NOTE(review): assumes `concs` / `dwells` hold one value per distinct week, in
# ascending week order — confirm against the cell that builds them.
weeks = sorted(weekly_patterns['week_number'].dropna().unique())

fig, axes = plt.subplots(1,2, figsize=(10,4))

axes[0].plot(weeks, concs, label="Visit concentration")
axes[0].set_xlabel("Week Number")
axes[0].set_ylabel("Visit concentration")
axes[0].set_title("Avg. visit concentration per week")

axes[1].plot(weeks, dwells, label="Median dwell")
axes[1].set_xlabel("Week Number")
axes[1].set_ylabel("Median dwell (hrs)")
axes[1].set_title("Avg. median dwell per week")

plt.savefig('stats_viz.png', dpi=300, bbox_inches='tight')
plt.show()

Preparing maps with 'folium'

In [None]:
# preparing datasets

counts = pd.read_csv('counts.csv')
df = pd.read_csv('df.csv').sort_values(by='poi_id')

# Read the submission file from the working directory, like the two inputs
# above, instead of a machine-specific absolute Windows path.
output = pd.read_csv('submission_prediction_output.csv')

# Attach df's remaining columns to each prediction row by poi_id. The listed
# columns are dropped from df before the merge, and drop_duplicates() removes
# the repeated rows that result from df having several rows per poi_id.
poi_attributes = df.drop(['week_number', 'raw_visit_counts', 'poi_cbg'], axis=1)
output = output.merge(poi_attributes, how='left', on='poi_id').drop_duplicates()

In [None]:
# CONSTANTS
# Latitude/longitude pair used as the `location` of the folium maps.
MAP_LAT, MAP_LON = 40.402, -86.902
MAP_CENTER = MAP_LAT, MAP_LON

In [None]:
# preparing data for heatmap animation
# retrieving latitude, longitude, raw visit counts and week number from the dataset

heat_data = []
inds = []

# groupby iterates the weeks in ascending order, so the animation frames are
# chronological; `.unique()` would follow row order, and `df` is sorted by
# poi_id rather than by week.
for week, week_rows in df.groupby('week_number'):
    # One [latitude, longitude, weight] triple per row, built in a single
    # vectorised step. This also avoids rebinding the global `data`, which
    # holds the per-POI modelling results.
    frame_points = week_rows[['latitude', 'longitude', 'raw_visit_counts']].values.tolist()

    heat_data.append(frame_points)
    inds.append('week ' + str(week))

In [None]:
# preparing data for percent change map
# percent change = (visits in week 43 - visits in week 40) / (visits in week 40) * 100
# NOTE(review): this comment used to say "week 44" while the code compares
# week 43 with week 40 — confirm which week is intended.

BASE_WEEK = 40
LATEST_WEEK = 43

diff_data = []

# sort=False keeps the locations in order of first appearance in `df`.
for poi, poi_rows in df.groupby('location_name', sort=False):
    base_visits = poi_rows.loc[poi_rows['week_number'] == BASE_WEEK, 'raw_visit_counts']
    latest_visits = poi_rows.loc[poi_rows['week_number'] == LATEST_WEEK, 'raw_visit_counts']

    # Skip locations missing either week explicitly, rather than relying on a
    # bare `except` that would also hide unrelated errors.
    if base_visits.empty or latest_visits.empty:
        continue

    # When a name matches several rows for a week, the first one is used.
    base_value = base_visits.iloc[0]
    pct_change = (latest_visits.iloc[0] - base_value) / base_value * 100

    diff_data.append([
        poi_rows['latitude'].unique()[0],
        poi_rows['longitude'].unique()[0],
        pct_change,
        poi,
    ])

In [None]:
# preparing data for prediction data map

pred_data = []

# Read the values from `output` (the prediction file merged with POI
# attributes). The loop previously iterated the names in `output` but looked
# the visit count up in `df`, so the map showed a historic count instead of
# the predicted one.
# NOTE(review): assumes `output` carries the predicted `raw_visit_counts`
# column from the submission file — confirm.
# sort=False keeps the locations in order of first appearance in `output`.
for poi, poi_rows in output.groupby('location_name', sort=False):
    pred_data.append([
        poi_rows['latitude'].unique()[0],
        poi_rows['longitude'].unique()[0],
        poi_rows['raw_visit_counts'].unique()[0],
        poi,
    ])

In [None]:
# generating heatmap animation

# Colour stops handed to the heat layer as its gradient.
heat_gradient = {.3: 'blue', .66: 'lime', 1: 'red'}

m = folium.Map(location=MAP_CENTER, zoom_start=12, height='80%', width='80%')

# One animation frame per entry of `heat_data`, labelled by the matching entry of `inds`.
heat_layer = HeatMapWithTime(
    heat_data,
    index=inds,
    use_local_extrema=True,
    gradient=heat_gradient,
    radius=10,
)
heat_layer.add_to(m)

m.save('heatmap_animation.html')

In [None]:
# generating percent change map

m1 = folium.Map(location=MAP_CENTER,
                    zoom_start = 12, height='80%', width='80%')

# Each diff_data row is [latitude, longitude, percent change, location name].
diff_marker_locations = [[row[0], row[1]] for row in diff_data]
diff_marker_popups = ["% change in {}: {:.2f}".format(row[3], row[2]) for row in diff_data]

plugins.MarkerCluster(
        locations=diff_marker_locations,
        popups=diff_marker_popups).add_to(m1)

# Save into the working directory, like heatmap_animation.html, instead of a
# machine-specific absolute Windows path.
m1.save('marker_cluster_diff.html')

In [None]:
# generating prediction values map

m2 = folium.Map(location=MAP_CENTER,
                    zoom_start = 12, height='80%', width='80%')

# Each pred_data row is [latitude, longitude, visit count, location name].
pred_marker_locations = [[row[0], row[1]] for row in pred_data]
pred_marker_popups = ["Week 44 visits in {}: {}".format(row[3], int(row[2])) for row in pred_data]

plugins.MarkerCluster(
        locations=pred_marker_locations,
        popups=pred_marker_popups).add_to(m2)

# Save into the working directory, like heatmap_animation.html, instead of a
# machine-specific absolute Windows path.
m2.save('marker_cluster_predictions.html')