In [1]:
import sys
import os

# put the parent directory on the import path (once) so that the project-local
# modules imported further down in this cell can be found
module_path = os.path.abspath('..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Dropout, LayerNormalization, MultiHeadAttention, Input
from tensorflow.keras.layers import Attention, Reshape
from tensorflow.keras.models import Model

import models
import utils
import data_processing

2024-02-20 23:48:12.015413: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# defining the necessary constants
SPIKES_THRESHOLD = 2  # a point is a spike when it is more than this many rolling std-devs from the rolling mean
TARGET_COLUMN = 'Value'
PETROL_FILE_PATH = '../../volza/petroleum/petrol_crude_oil_spot_price.csv'
RANDOM_STATE = 42  # passed to train_test_split
START_DATE = "2020-01-01"  # first date kept (inclusive)
END_DATE = "2022-12-31"  # last date kept (inclusive)
COMMODITY = "oil"
WTI_RESULTS_FOLDER = "WTI"
BRENT_RESULTS_FOLDER = "Brent"

# make sure the results subfolders exist. os.makedirs also creates the parent
# commodity folder when it is missing (os.mkdir would raise FileNotFoundError)
# and exist_ok=True turns the call into a no-op when the folder is already there
for results_folder in (WTI_RESULTS_FOLDER, BRENT_RESULTS_FOLDER):
    os.makedirs(os.path.join(COMMODITY, results_folder), exist_ok=True)

### Filtering data available by date

In [3]:
# load the raw price table; the fields in this file are separated by ';'
petrol_df = pd.read_csv(filepath_or_buffer=PETROL_FILE_PATH, sep=";")
petrol_df.head(n=2)

Unnamed: 0,Date,Spot Prices,Value,duoarea,area-name,product,product-name,process,process-name,series
0,1986-01-03,WTI Spot Price (U.S. Dollars per Barrel),26.0,YCUOK,,EPCWTI,WTI Crude Oil,PF4,Spot Price FOB,RWTC
1,1986-01-08,WTI Spot Price (U.S. Dollars per Barrel),25.87,YCUOK,,EPCWTI,WTI Crude Oil,PF4,Spot Price FOB,RWTC


In [4]:
# keep only the rows whose Date lies between START_DATE and END_DATE
# (both bounds included), then order the result by Date
is_in_date_range = petrol_df["Date"].between(START_DATE, END_DATE)
petrol_df_filtered = petrol_df.loc[is_in_date_range].sort_values(by='Date')
petrol_df_filtered.shape

(1534, 10)

In [5]:
# discard rows that have no value in the columns of interest and report how many went away
columns_of_interest = ["Value"]
initial_row_count = len(petrol_df_filtered)

petrol_df_filtered = petrol_df_filtered.dropna(subset=columns_of_interest)
rows_dropped = initial_row_count - len(petrol_df_filtered)
print(f"Rows dropped due to NaN values: {rows_dropped}")

Rows dropped due to NaN values: 0


### Detecting spikes using moving average

In [6]:
def detect_spikes(df, column, window_size, threshold=None):
    """Flag the rows of ``df[column]`` that deviate strongly from their rolling mean.

    A row is flagged when the absolute difference between its value and the
    rolling mean is strictly greater than ``threshold`` times the rolling
    standard deviation. Both statistics come from
    ``df[column].rolling(window=window_size)``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the series to analyse.
    column : str
        Name of the column to analyse.
    window_size : int
        Number of rows in the rolling window.
    threshold : float, optional
        Multiplier applied to the rolling standard deviation. When omitted,
        the module-level ``SPIKES_THRESHOLD`` is used, which is the value the
        function always used before this parameter existed.

    Returns
    -------
    pd.Series
        Integer series aligned with ``df``: 1 for a spike, 0 otherwise. Rows
        for which the rolling statistics are NaN (window not yet full) are 0,
        because a comparison against NaN is False.
    """
    if threshold is None:
        # resolved at call time so later changes to the constant are honoured
        threshold = SPIKES_THRESHOLD

    series = df[column]
    rolling_window = series.rolling(window=window_size)
    deviation = (series - rolling_window.mean()).abs()

    # strict inequality: a deviation exactly on the threshold is not a spike
    return (deviation > threshold * rolling_window.std()).astype(int)

### Function to run ARIMA over the time series and return the residuals

In [7]:
import pmdarima as pm

def train_arima(dataframe: pd.DataFrame, target_column: str):
    """Fit an auto-ARIMA model on one column and return its in-sample residuals.

    The model is selected by ``pm.auto_arima`` with a seasonal component of
    period ``m=12`` and the stepwise search; warnings are suppressed and
    failing candidate orders are ignored.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing the series to model.
    target_column : str
        Name of the column holding the series.

    Returns
    -------
    The observed values of ``target_column`` minus the model's in-sample
    predictions.
    """
    observed = dataframe[target_column]

    fitted_model = pm.auto_arima(
        observed,
        seasonal=True,
        m=12,
        stepwise=True,
        suppress_warnings=True,
        error_action='ignore',
    )

    # residual = what was observed minus what the fitted model predicts for the same rows
    return observed - fitted_model.predict_in_sample()

### Function to run spike prediction for given window sizes

In [10]:
from sklearn.model_selection import train_test_split

def analyze_spike_prediction(
        dataframe: pd.DataFrame,
        window_sizes: list,
        target_column: str,
        output_folder_path: str
    ):
    """Run the spike-prediction evaluation once for every window size.

    The ARIMA residuals of ``target_column`` are computed once up front, since
    they do not depend on the window size. Then, for each window size:

    1. spikes are labelled with ``detect_spikes``;
    2. features (``target_column`` and the ARIMA residuals) and the spike
       target are built with ``data_processing.prepare_features_and_target``;
    3. the data is split 80/20 without shuffling, so row order is preserved;
    4. the features are scaled with ``data_processing.scale_features``;
    5. both splits go through ``data_processing.create_sequences`` with the
       window size;
    6. ``models.evaluate_all`` is called with the path
       ``<output_folder_path>/results_<window_size>.csv``.

    The caller's ``dataframe`` is not modified: all helper columns are added to
    a private copy.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing ``target_column``.
    window_sizes : list
        Window sizes to evaluate, in order.
    target_column : str
        Name of the column holding the series.
    output_folder_path : str
        Folder for the per-window result files; created when missing.

    Returns
    -------
    None
    """
    window_sizes = list(window_sizes)
    if not window_sizes:
        # nothing to evaluate: skip the expensive ARIMA fit altogether
        return

    # presumably models.evaluate_all writes to output_file_path, so the folder
    # has to exist; an empty path means "current directory" and needs no creation
    if output_folder_path:
        os.makedirs(output_folder_path, exist_ok=True)

    # private copy: keeps the caller's frame untouched and avoids assigning
    # columns into what may be a slice of another DataFrame
    working_df = dataframe.copy()

    # train ARIMA and extract residuals (hoisted out of the loop: the fit only
    # looks at target_column, never at the window size or the Spikes column)
    working_df['ARIMA_Residuals'] = train_arima(working_df, target_column)
    feature_columns = [target_column, 'ARIMA_Residuals']

    for window_size in window_sizes:
        print(f"Evaluating window size: {window_size}")
        working_df['Spikes'] = detect_spikes(working_df, target_column, window_size)

        # preparing features and target
        X, y = data_processing.prepare_features_and_target(working_df, feature_columns, 'Spikes')

        # split the data; shuffle=False keeps the rows in their original order
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=RANDOM_STATE,
            shuffle=False
        )

        # scale features
        X_train_scaled, X_test_scaled = data_processing.scale_features(X_train_raw, X_test_raw)

        # create sequences
        X_train, y_train = data_processing.create_sequences(X_train_scaled, y_train, window_size)
        X_test, y_test = data_processing.create_sequences(X_test_scaled, y_test, window_size)

        output_file_path = os.path.join(output_folder_path, f"results_{window_size}.csv")
        models.evaluate_all(X_train, y_train, X_test, y_test, output_file_path)

### Spike detection for WTI

In [9]:
# select the WTI rows as an independent copy, so the frame can be modified later
# without writing into a slice of petrol_df_filtered (SettingWithCopyWarning)
is_wti = petrol_df_filtered["product-name"] == "WTI Crude Oil"
wti_df = petrol_df_filtered.loc[is_wti].copy()
print(wti_df.shape)

(766, 10)


In [None]:
# window sizes (in rows) for which the spike prediction is evaluated
SPIKE_WINDOW_SIZES = [10, 20, 30, 40, 60, 80, 100]

# TARGET_COLUMN is the constant from the configuration cell ('Value'); using it
# here keeps the target column defined in a single place
analyze_spike_prediction(
    dataframe=wti_df,
    window_sizes=SPIKE_WINDOW_SIZES,
    target_column=TARGET_COLUMN,
    output_folder_path=os.path.join(COMMODITY, WTI_RESULTS_FOLDER)
)

### Spike detection for Brent

In [18]:
# select the Brent rows as an independent copy, so the frame can be modified later
# without writing into a slice of petrol_df_filtered (SettingWithCopyWarning)
is_brent = petrol_df_filtered["product-name"] == "UK Brent Crude Oil"
brent_df = petrol_df_filtered.loc[is_brent].copy()
print(brent_df.shape)

(768, 10)


In [None]:
# same evaluation as for WTI, on the Brent series; TARGET_COLUMN is the constant
# from the configuration cell ('Value'), so the target column is defined in one place
analyze_spike_prediction(
    dataframe=brent_df,
    window_sizes=SPIKE_WINDOW_SIZES,
    target_column=TARGET_COLUMN,
    output_folder_path=os.path.join(COMMODITY, BRENT_RESULTS_FOLDER)
)