# GBQ Script

## get df

In [1]:
import pandas as pd

# read file temp (from data script)
def read_local(path_filename):
    """
    read_local(path_filename)
    reads the csv file and parses date col as date, setting the date as the index
    returns the df
    """
    df = pd.read_csv(path_filename)
    df['Date'] = pd.to_datetime(df['Date'])
    return df.set_index('Date')
    
df = read_local("play.csv")
df


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,28.950001,29.082500,28.690001,29.037500,27.332472,115127600
2017-01-04,28.962500,29.127501,28.937500,29.004999,27.301876,84472400
2017-01-05,28.980000,29.215000,28.952499,29.152500,27.440716,88774400
2017-01-06,29.195000,29.540001,29.117500,29.477501,27.746637,127007600
2017-01-09,29.487499,29.857500,29.485001,29.747499,28.000778,134247600
...,...,...,...,...,...,...
2020-12-24,131.320007,133.460007,131.100006,131.970001,131.161407,54930100
2020-12-28,133.990005,137.339996,133.509995,136.690002,135.852509,124486200
2020-12-29,138.050003,138.789993,134.339996,134.869995,134.043640,121047300
2020-12-30,135.580002,135.990005,133.399994,133.720001,132.900696,96452100


## get splits

In [2]:
#splits from time split
from datetime import datetime as dt

class train_val_split:
    
    def __init__(self, df, duration=30, window=1, prediction_period=5, start = '2018-01-01', end = '2020-03-31'):
        self.duration = duration # training period, for dates
        self.window = window # rolling window freq, for dates
        self.prediction_period = prediction_period # prediciton horizon, for dates
        self.start = start
        self.end = end
        self.df = df.loc[(df.index >= start) & (df.index <= end)].reset_index()
        self.start_ind = self.df[self.df.Date>=self.start].index.min()
           
    
    @staticmethod
    def _strfdate(date):
        return dt.strftime(date, '%Y-%m-%d')
           
    
    def split_by_date(self):
        self.end_ind = self.start_ind + self.duration
        
        dates = []
        
        while self.end_ind <=  len(self.df) - self.prediction_period:
            date_start = self._strfdate(self.df.Date[self.start_ind])
            date_end = self._strfdate(self.df.Date[self.end_ind])
            
            dates.append([date_start, date_end])
            
            self.start_ind += self.window
            self.end_ind = self.start_ind + self.duration
            
        return dates
    
    
    def get_val_map(self, start='2020-06-01', end='2020-12-31'):
        start_ind = self.df[self.df.Date>=start].index.min()
        end_ind = self.df[self.df.Date<=end].index.max()
        
        return {self._strfdate(self.df.Date[n-5]):self._strfdate(self.df.Date[n]) for n in range(start_ind, end_ind+1)}  

In [3]:
split = train_val_split(df)
train_date_list = split.split_by_date()

In [4]:
train_date_list[:5]
train_index = train_date_list[0]
train_index

['2018-01-02', '2018-02-14']

## Get Model

In [5]:
from google.cloud import bigquery as bq
from tqdm import tqdm

In [7]:


client = bq.Client.from_service_account_json("service-account-file.json")
coeff_dict = {}

for n, date in enumerate(tqdm(train_date_list)):

    start, end = train_index[0], train_index[1]
    #create model
    query1 = f"""
            CREATE OR REPLACE MODEL ioracle.main.arima_test
            OPTIONS
              (model_type = 'ARIMA_PLUS',
               time_series_timestamp_col = 'Date',
               time_series_data_col = 'Adj_Close',
               auto_arima = TRUE,
               data_frequency = 'DAILY',
               decompose_time_series = TRUE,
               holiday_region='US',
               CLEAN_SPIKES_AND_DIPS=FALSE
              ) AS
            SELECT *
            FROM `ioracle.main.aapl_data`
            WHERE Date BETWEEN '{start}' AND '{end}'
            ORDER BY Date ASC"""

    client.query(query1)
    
    #cannot scale cos will be new scaler with evey training
    
    #iterate through training periods
    query2 = """
            SELECT
             *
            FROM
             ML.EVALUATE(MODEL ioracle.main.arima_test)
            """
    
    results = client.query(query2).to_dataframe()
    val_series = results.sort_values('log_likelihood', ascending=False).iloc[0]
   
    coeff_dict[n] = val_series[:4].to_dict()


100%|█████████████████████████████████████████████████████████████████████████████████| 531/531 [16:10<00:00,  1.83s/it]


In [8]:
from statistics import mode
def get_mode(dic):
    count_list = []
    for v in dic.values():
        count_list.append(tuple([v[k] for k in v]))
    results = mode(count_list) 
    return results[:3], results[-1]
    

non_seasonal_order, include_drift = get_mode(coeff_dict)  
non_seasonal_order, include_drift    

((0, 1, 5), False)

In [None]:
#hard code (if model not run)
non_seasonal_order = (0, 1, 5)
include_drift = True

In [9]:
val_split = train_val_split(df, end='2020-12-31')
val_map = val_split.get_val_map()
val_map

{'2020-05-22': '2020-06-01',
 '2020-05-26': '2020-06-02',
 '2020-05-27': '2020-06-03',
 '2020-05-28': '2020-06-04',
 '2020-05-29': '2020-06-05',
 '2020-06-01': '2020-06-08',
 '2020-06-02': '2020-06-09',
 '2020-06-03': '2020-06-10',
 '2020-06-04': '2020-06-11',
 '2020-06-05': '2020-06-12',
 '2020-06-08': '2020-06-15',
 '2020-06-09': '2020-06-16',
 '2020-06-10': '2020-06-17',
 '2020-06-11': '2020-06-18',
 '2020-06-12': '2020-06-19',
 '2020-06-15': '2020-06-22',
 '2020-06-16': '2020-06-23',
 '2020-06-17': '2020-06-24',
 '2020-06-18': '2020-06-25',
 '2020-06-19': '2020-06-26',
 '2020-06-22': '2020-06-29',
 '2020-06-23': '2020-06-30',
 '2020-06-24': '2020-07-01',
 '2020-06-25': '2020-07-02',
 '2020-06-26': '2020-07-06',
 '2020-06-29': '2020-07-07',
 '2020-06-30': '2020-07-08',
 '2020-07-01': '2020-07-09',
 '2020-07-02': '2020-07-10',
 '2020-07-06': '2020-07-13',
 '2020-07-07': '2020-07-14',
 '2020-07-08': '2020-07-15',
 '2020-07-09': '2020-07-16',
 '2020-07-10': '2020-07-17',
 '2020-07-13':

In [16]:
train_start = '2018-01-01'
val_start = '2020-06-01'
val_end = '2020-12-31'


In [37]:
from time import sleep

pred_list=[]

for k, v in tqdm(val_map.items()):

    query3 = "DROP MODEL IF EXISTS ioracle.main.arima_model"
    client.query(query3)
    
    query4 = f"""
    CREATE OR REPLACE MODEL ioracle.main.arima_model
    OPTIONS
      (model_type = 'ARIMA_PLUS',
       time_series_timestamp_col = 'Date',
       time_series_data_col = 'Adj_Close',
       auto_arima = FALSE,
       data_frequency = 'DAILY',
       decompose_time_series = TRUE,
       holiday_region='US',
       CLEAN_SPIKES_AND_DIPS=FALSE,
       NON_SEASONAL_ORDER={non_seasonal_order},
       INCLUDE_DRIFT={include_drift}
      ) AS
    SELECT Date, Adj_Close
    FROM `ioracle.main.aapl_data`
    WHERE Date BETWEEN '{train_start}' AND '{k}'
    ORDER BY Date ASC"""

    client.query(query4)
    
    sleep(10) # give time for model to be built

    query5 = """
    SELECT
     *
    FROM
     ML.FORECAST(MODEL ioracle.main.arima_model,
     STRUCT(20 AS horizon)
     )
    """

    forecast = client.query(query5).to_dataframe()
    forecast['Date'] = forecast.forecast_timestamp.apply(lambda x: dt.strftime(x, '%Y-%m-%d'))
    pred_list.append(forecast.set_index('Date').loc[v,'forecast_value'])

100%|█████████████████████████████████████████████████████████████████████████████████| 150/150 [32:17<00:00, 12.92s/it]


In [39]:
def get_y(df, start, end):
    return df.loc[(df.index>=start) & (df.index<=end)]['Adj Close']

y = get_y(df, val_start, val_end)
    

In [41]:
# save prediction dataframe to csv
pd.DataFrame(pred_list, index=y.index, columns=['y_pred']).to_csv('aapl_arima_pred.csv')

In [43]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y, pred_list)

4.588839355207132