# **Heuristic Benchmark**

### ***Loading Libraries***

In [1]:
!pip install hypertune

Collecting hypertune
  Downloading hypertune-1.1.0-py3-none-any.whl (22 kB)
Installing collected packages: hypertune
Successfully installed hypertune-1.1.0


In [2]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [3]:
!pip install tensorflow-io

Collecting tensorflow-io
  Downloading tensorflow_io-0.37.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.37.0


In [4]:
!pip install shap

Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [5]:
!pip install witwidget

Collecting witwidget
  Downloading witwidget-1.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets>=7.0.0->witwidget)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, witwidget
Successfully installed jedi-0.19.1 witwidget-1.8.1


In [6]:
# Operating Systems
import os
import sys
import shutil
import pathlib
import argparse
import datetime
import collections

# SHAP
import shap

# Numerical Computing
import numpy as np

# Data Manipuation
import pandas as pd

# SciPy
import scipy
from scipy import stats
from scipy import signal
from scipy.io import wavfile

# Data Visualization
import itertools
import seaborn as sns
from PIL import Image
import matplotlib.pyplot as plt

# Tabulate
from tabulate import tabulate

# Hyperparameter Tuning
import hypertune

# Tools
from setuptools import setup
from setuptools import find_packages

# BigQuery
from google.cloud import bigquery
from google.colab import auth

# Scikit-Learn
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_recall_fscore_support as score

# Extreme Gradient Boosting
import xgboost as xgb

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim

# TensorFlow
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import Model
import tensorflow_datasets as tfds
from tensorflow_hub import KerasLayer
from tensorflow import feature_column as fc
from tensorflow.python.framework import dtypes
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical
from tensorflow_io.bigquery import BigQueryClient
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, layers, models, utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Conv2D, MaxPooling2D, Lambda

# Keras API
import keras_tuner as kt

# WidGet
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

# Apache
# import apache_beam as beam
# import apache_beam.runners.interactive.interactive_beam as ib
# from apache_beam.runners.interactive import interactive_runner

# print("Apache Beam Properly Imported")

In [7]:
# User Authentication
auth.authenticate_user()

# BigQuery Library
# !pip install --upgrade google-cloud-bigquery

In [8]:
project_id = 'core-catalyst-425922-v9'
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

# BigQuery Client Config
client = bigquery.Client(project=project_id)

### ***1. Regression on Poorly Understood Features***



In [9]:
%%bigquery
SELECT
  bqutil.fn.median(ARRAY_AGG(TIMESTAMP_DIFF(a.creation_date, q.creation_date, SECOND))) AS time_to_answer
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,time_to_answer
0,2293.0


In [10]:
%%bigquery
WITH benchmark_eval AS (
SELECT
  2120 - TIMESTAMP_DIFF(a.creation_date, q.creation_date, SECOND) AS error
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id
)

SELECT
   AVG(ABS(error)) AS mean_absolute_error
FROM
   benchmark_eval

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,mean_absolute_error
0,937346.038415


### ***2. Classification on poorly understood features***

In [11]:
%%bigquery
SELECT
  AVG(IF(a.last_edit_date IS NULL, 0, 1)) AS prob_edited
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,prob_edited
0,0.365446


In [12]:
%%bigquery
SELECT
  COUNTIF(ENDS_WITH(u.location, 'France')) / COUNT(u.location) AS from_france,
  COUNTIF(ENDS_WITH(u.location, 'India')) / COUNT(u.location) AS from_india
FROM `bigquery-public-data.stackoverflow.posts_questions` q
JOIN `bigquery-public-data.stackoverflow.posts_answers` a
ON q.accepted_answer_id = a.id
JOIN `bigquery-public-data.stackoverflow.users` u
ON u.id = a.owner_user_id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,from_france,from_india
0,0.031066,0.081208


### ***3. Regression with one good numeric feature***

In [13]:
%%bigquery
With trips AS (
SELECT
  total_amount,
  ST_Distance(ST_GeogPoint(pickup_longitude, pickup_latitude),
              ST_GeogPoint(dropoff_longitude, dropoff_latitude))/1000 AS dist
FROM `bigquery-public-data.new_york.tlc_yellow_trips_2015`
WHERE pickup_latitude BETWEEN 35 and 45
AND dropoff_latitude BETWEEN 35 and 45
AND pickup_longitude BETWEEN -80 and -70
AND dropoff_longitude BETWEEN -80 and -70
AND total_amount IS NOT NULL
)

SELECT AVG(total_amount)/AVG(dist)
FROM trips

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_
0,4.644356


###  ***4. Regression with one or two important features***

In [14]:
%%bigquery
CREATE TEMPORARY FUNCTION is_peak_hour(start_date TIMESTAMP) aS
(EXTRACT(DAYOFWEEK FROM start_date) BETWEEN 2 AND 6 -- weekday
    AND (
       EXTRACT(HOUR FROM start_date) BETWEEN 6 AND 10
       OR
       EXTRACT(HOUR FROM start_date) BETWEEN 15 AND 18))
;

SELECT
   start_station_name,
   is_peak_hour(start_date) AS is_peak,
   AVG(duration) AS predicted_duration,
FROM `bigquery-public-data.london_bicycles.cycle_hire`
GROUP BY 1, 2
ORDER BY predicted_duration DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,start_station_name,is_peak,predicted_duration
0,"Contact Centre, Southbury House",False,7012.5
1,"Stewart's Road, Nine Elms",False,6401.018182
2,"Brandon Street, Walworth",False,4662.567401
3,"Speakers' Corner 2, Hyde Park",True,4455.441717
4,"Burgess Park Albany Road, Walworth",False,3983.575369
5,"Speakers' Corner 2, Hyde Park",False,3785.754375
6,"South Bermondsey Station, Bermondsey",False,3774.0
7,"Speakers' Corner 1, Hyde Park",True,3728.008525
8,"Stewart's Road, Nine Elms",True,3727.42268
9,"Speakers' Corner 1, Hyde Park",False,3702.115147


In [15]:
%%bigquery
CREATE TEMPORARY FUNCTION is_peak_hour(start_date TIMESTAMP) aS
(EXTRACT(DAYOFWEEK FROM start_date) BETWEEN 2 AND 6 -- weekday
    AND (
       EXTRACT(HOUR FROM start_date) BETWEEN 6 AND 10
       OR
       EXTRACT(HOUR FROM start_date) BETWEEN 15 AND 18))
;

WITH benchmark AS (
SELECT
   start_station_name,
   is_peak_hour(start_date) AS is_peak,
   AVG(duration) AS predicted_duration,
FROM `bigquery-public-data.london_bicycles.cycle_hire`
GROUP BY 1, 2
)

SELECT
   SQRT( SUM( (duration - predicted_duration)*(duration - predicted_duration)) / COUNT(duration) ) AS rmse
FROM `bigquery-public-data.london_bicycles.cycle_hire` c
JOIN benchmark b
ON c.start_station_name = b.start_station_name AND is_peak_hour(c.start_date) = b.is_peak

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rmse
0,7357.85606
