In [2]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [5]:
def calculate_wap(df: pd.DataFrame) -> pd.DataFrame:
    df_copy = df.copy()
    df_copy['WAP'] = (df_copy['bid_price1']*df_copy['ask_size1'] + df_copy['ask_price1']*df_copy['bid_size1']) /\
    (df_copy['bid_size1'] + df_copy['ask_size1'])
    return df_copy

def calculate_log_returns(df: pd.DataFrame) -> pd.DataFrame:
    df_copy = df.copy()
    if 'WAP' not in df_copy.columns:
        df_copy = calculate_wap(df_copy)
    
    df_copy['log_return'] = np.log(df_copy['WAP'] / df_copy['WAP'].shift(1))
    
    return df_copy

def calculate_volatility(data: np.array):
    volatility = np.sqrt(np.sum(data**2))
    return volatility

def interpolute_log_returns(df: pd.DataFrame) -> pd.DataFrame:
    full_seconds = pd.DataFrame({'seconds_in_bucket': range(600)})
    
    df_interpolated = pd.merge(full_seconds, df, on='seconds_in_bucket', how='left')
    df_interpolated['log_return'] = df_interpolated['log_return'].fillna(0)
    return df_interpolated

def predict_future(training_data: pd.DataFrame, time_length=6):
    predictions = []
    for seconds in range(time_length):
        model = ARIMA(training_data, order=(8,1,2))
        model_fit = model.fit()
        forecast = model_fit.forecast(1)
        predictions.append(forecast)
        training_data = np.append(training_data[1:], forecast)

    return np.array(predictions)

In [None]:
df = pd.read_csv("./individual_book_train/stock_1.csv")
df['WAP'] = (df['bid_price1']*df['ask_size1'] + df['ask_price1']*df['bid_size1']) /\
    (df['bid_size1'] + df['ask_size1'])
df_filtered = df[df['time_id'] == 5]

df_filtered = calculate_log_returns(df_filtered)
df_filtered = interpolute_log_returns(df_filtered)

df_train = df_filtered[(df_filtered['seconds_in_bucket'] < 500)]
df_test = df_filtered[df_filtered['seconds_in_bucket'] >= 500]

In [None]:
def process_stock(stock_id):
    try:
        df = pd.read_csv(f'individual_book_train/stock_{stock_id}.csv')
        df['WAP'] = (df['bid_price1']*df['ask_size1'] + df['ask_price1']*df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
        df = calculate_log_returns(df)
        df = interpolute_log_returns(df)
        df_grouped = df.groupby('time_id')
        
        results = []
        keys = []
        
        for group_name, group_df in tqdm(df_grouped):
            key = f'{stock_id}-{group_name}'
            volatility = []
            for i in range(60):
                v = calculate_volatility(np.array(df_filtered['log_return'][(df_filtered['seconds_in_bucket']<(i+1)*10)&(df_filtered['seconds_in_bucket']>=i*10)]))
                volatility.append(v)
            
            X_train = np.array(volatility[:54])
            y_test = np.array(volatility[-6:])
            predictions = predict_future(X_train)

            mape = np.abs(y_test-predictions)/y_test
            
            results.append(mape)
            keys.append(key)
        
        return results, keys
    except FileNotFoundError:
        print(f"File not found: stock_{stock_id}.csv")
        return [], []

In [264]:
stock_ids = range(0, 127)
with Pool(processes=8) as pool:
    all_results = pool.map(process_stock, stock_ids)

final_results = pd.concat(all_results, ignore_index=True)
final_results.to_csv('arima_mape.csv', index=False)
print("All features generated and saved.")

Process SpawnPoolWorker-43:
Process SpawnPoolWorker-46:
Process SpawnPoolWorker-44:
Process SpawnPoolWorker-42:
Process SpawnPoolWorker-41:
Process SpawnPoolWorker-47:
Process SpawnPoolWorker-48:
Process SpawnPoolWorker-45:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/mia/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/mia/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/mia/anaconda3/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/Users/mia/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Use

KeyboardInterrupt: 