In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install scikit-learn
# %pip install tensorflow
# %pip install -U imbalanced-learn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Dropout, LayerNormalization, MultiHeadAttention, Input
from tensorflow.keras.layers import Attention, Reshape
from tensorflow.keras.models import Model




In [3]:
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import models
import utils
import data_processing


In [4]:
COMMODITY = 'cobalt'

DATE_COLUMN = 'Date'
VALUE_COLUMN = 'Value'  
QUANTITY_COLUMN = 'Std. Quantity (KG)'
UNIT_RATE_COLUMN = 'Std. Unit Rate ($/KG)'
BRENT_OIL_COLUMN = 'Brent Oil Value'
WTI_OIL_COLUMN = 'WTI Oil Value'
SHIP_COUNT_COLUMN = 'ship_count'
PORT_COUNT_COLUMN = 'popular_port_count'

VALUE_SPIKES_COLUMN = 'Value Spikes'  
QUANTITY_SPIKES_COLUMN = 'Std. Quantity (KG) Spikes'
UNIT_RATE_SPIKES_COLUMN = 'Std. Unit Rate ($/KG) Spikes'
BRENT_OIL_SPIKES_COLUMN = 'Brent Oil Value Spikes'
WTI_OIL_SPIKES_COLUMN = 'WTI Oil Value Spikes'
SHIP_COUNT_SPIKES_COLUMN = 'Ship Count Spikes'
PORT_COUNT_SPIKES_COLUMN = 'Port Count Spikes'

FEATURE_COLUMNS = [VALUE_COLUMN, QUANTITY_COLUMN, UNIT_RATE_COLUMN,  WTI_OIL_COLUMN, BRENT_OIL_COLUMN, SHIP_COUNT_COLUMN, PORT_COUNT_COLUMN]
# FEATURE_COLUMNS = [VALUE_COLUMN, QUANTITY_COLUMN, UNIT_RATE_COLUMN,  WTI_OIL_SPIKES_COLUMN, BRENT_OIL_SPIKES_COLUMN]
# FEATURE_COLUMNS = [VALUE_COLUMN, QUANTITY_COLUMN, UNIT_RATE_COLUMN,  WTI_OIL_COLUMN, BRENT_OIL_COLUMN, SHIP_COUNT_SPIKES_COLUMN, PORT_COUNT_SPIKES_COLUMN]
TARGET_COLUMN = 'Price'

ORIGIN_COUNTRY_COLUMN = 'Country of Origin'
DEST_COUNTRY_COLUMN = 'Country of Destination'

PETROL_FILE_PATH = '../../volza/petroleum/petrol_crude_oil_spot_price.csv'
VOLZA_FILE_PATH = f'../../volza/{COMMODITY}/{COMMODITY}.csv'
PRICE_FILE_PATH = f"../../volza/{COMMODITY}/{COMMODITY}_prices.csv"
AIS_POPULAR_FILE_PATH = f'../../ais/ais_ml_features.csv' 

NB_OUTPUT_PATH = f"{COMMODITY}/{COMMODITY}_model_performance (No Balancing).csv"
RUS_OUTPUT_PATH = f"{COMMODITY}/{COMMODITY}_model_performance (Random Under Sampling).csv"
ROS_OUTPUT_PATH = f"{COMMODITY}/{COMMODITY}_model_performance (Random Over Sampling).csv"


SPIKES_THRESHOLD = 2
SPIKES_WINDOW_SIZE = 20
BIN_COUNT = 10
FILL_METHOD = 'ffill'

RANDOM_STATE = 42

## Dataframe Prep

In [5]:
from datetime import datetime

#Formatting the date and price for Volza data
volza_pd = pd.read_csv(VOLZA_FILE_PATH)
volza_pd = volza_pd[(volza_pd["Country of Origin"].notnull()) & (volza_pd["Country of Destination"].notnull())]
volza_pd = volza_pd.rename(columns={'Unnamed: 0': 'ID'})
volza_pd['Date'] = volza_pd['Date'].apply(lambda x: x.split(' ')[0])
volza_pd['Date'] = pd.to_datetime(volza_pd['Date'], errors='raise', format='%Y-%m-%d')
volza_pd = utils.convert_to_kg(volza_pd)
volza_pd.head(3)

Unnamed: 0,ID,Date,HS Code,Product Description,Consignee,Notify Party Name,Shipper,Std. Quantity,Std. Unit,Standard Unit Rate INR,...,Freight Term,Marks Number,HS Product Description,Gross Weight,Consignee Address,Shipper Address,Notify Party Address,Country Name,Std. Quantity (KG),Std. Unit Rate ($/KG)
0,0,2020-03-03,81052029000K,BRAND: HANRUI COBALT|EFS 1 GRANULATE COBA,TANTAL ARGENTINA SRL,,,200.0,KGS,-,...,-,-,OTHERS,0.0,,,,Argentina T2 Import,200.0,42.805
1,1,2020-11-25,8105200000,DO 3202000987-001 DECLARATION (1-1) INVOICE: 2...,INVERSIONES RINCON MEDINA LTDA,,GE ADDITIVE,11.0,KGS,-,...,-,-,COBALT MATTES AND OTHER INTERMEDIATE PRODUCTS ...,11.0,CRA 47 79 234,101 NORTH CAMPUS DRIVR IMPERIAL PA15126,,Columbia T3+ Import,11.0,251.360909
3,3,2020-12-14,81052000,"COBALT DUST: ""BEGO WIROBOND C+""-10PCS*5K",BELADENT SRL MOLDOVA OR NISPORENI,,OOO SIMPLAND 121353 OR MOSCOVA SK,50.0,KGS,-,...,-,-,,0.0,MOLDOVA OR NISPORENI,,,Moldova T3 Import,50.0,384.0744


In [6]:
#Preprocessing the AIS data
ais_popular_pd = pd.read_csv(AIS_POPULAR_FILE_PATH)
ais_popular_pd['Date'] = pd.to_datetime(ais_popular_pd['Date'])
ais_popular_pd.head(3)


Unnamed: 0,Date,ship_count,popular_port,popular_port_count
0,2020-11-10,8,LTKLJ,18
1,2020-11-12,20,IDSKP,8
2,2020-11-29,9,CNSHA,2


In [7]:
#Preprocessing the price data
prices_pd = pd.read_csv(PRICE_FILE_PATH)
# prices_pd['Date'] = prices_pd['Date'].apply(lambda x: datetime.strptime(x, "%b %d, %Y").strftime("%Y-%m-%d"))
prices_pd['Date'] = pd.to_datetime(prices_pd['Date'], errors='raise', format='%m/%d/%Y')
prices_pd['Price'] = prices_pd['Price'].str.replace(',', '').astype(float)
prices_pd = prices_pd[['Date', 'Price']]
prices_pd

Unnamed: 0,Date,Price
0,2024-02-23,28288.0
1,2024-02-22,28288.0
2,2024-02-21,28288.0
3,2024-02-19,28137.0
4,2024-02-16,28137.0
...,...,...
1034,2020-01-08,31323.0
1035,2020-01-07,31578.5
1036,2020-01-06,31834.0
1037,2020-01-03,31850.0


In [8]:
#Aggregate volza data by day
date_wise_volza = volza_pd.groupby("Date")[[VALUE_COLUMN,QUANTITY_COLUMN,'Gross Weight']].sum()

In [9]:
# Avg of Commodity Price in Volza
avg_price_volza = volza_pd.groupby('Date')[UNIT_RATE_COLUMN].mean()
date_wise_volza = date_wise_volza.join(avg_price_volza, how='left')
date_wise_volza

Unnamed: 0_level_0,Value,Std. Quantity (KG),Gross Weight,Std. Unit Rate ($/KG)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01,1.217433e+08,4.147777e+06,42055.0,92.545195
2020-01-02,9.020977e+04,2.300000e+03,0.0,39.903707
2020-01-03,4.110707e+02,1.000000e+01,0.0,41.107069
2020-01-04,5.637466e+04,1.000000e+03,0.0,56.374664
2020-01-05,4.051522e+04,1.140000e+03,0.0,35.644705
...,...,...,...,...
2023-12-25,8.913675e+04,3.000000e+03,0.0,30.255888
2023-12-26,1.160515e+05,3.000000e+03,0.0,38.338660
2023-12-27,5.611000e+04,1.841780e+03,0.0,46.549676
2023-12-28,2.256061e+05,4.446500e+03,0.0,76.015813


In [10]:
# Petroleum data prep
petrol_df = pd.read_csv(PETROL_FILE_PATH, delimiter=';', on_bad_lines='warn')
petrol_df['Date'] = pd.to_datetime(petrol_df['Date'])

# Split based on types of oil
brent_df = petrol_df[petrol_df['product-name']=='UK Brent Crude Oil']
wti_df = petrol_df[petrol_df['product-name']=='WTI Crude Oil']

brent_df.rename(columns={'Value':'Brent Oil Value'}, inplace=True)
wti_df.rename(columns={'Value':'WTI Oil Value'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brent_df.rename(columns={'Value':'Brent Oil Value'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wti_df.rename(columns={'Value':'WTI Oil Value'}, inplace=True)


In [11]:
# Combining dataframes
prices_pd = prices_pd.set_index('Date')
ais_popular_pd = ais_popular_pd.set_index('Date')
date_wise_volza = date_wise_volza.join(ais_popular_pd, how="left").fillna(method=FILL_METHOD)
aggregated_df = date_wise_volza.join(prices_pd, how="left").fillna(method=FILL_METHOD)
aggregated_df = aggregated_df.merge(brent_df[[DATE_COLUMN, BRENT_OIL_COLUMN]], on='Date', how='left').fillna(method=FILL_METHOD)
aggregated_df = aggregated_df.merge(wti_df[[DATE_COLUMN, WTI_OIL_COLUMN]], on='Date', how='left').fillna(method=FILL_METHOD)
aggregated_df

  date_wise_volza = date_wise_volza.join(ais_popular_pd, how="left").fillna(method=FILL_METHOD)
  aggregated_df = date_wise_volza.join(prices_pd, how="left").fillna(method=FILL_METHOD)
  aggregated_df = aggregated_df.merge(brent_df[[DATE_COLUMN, BRENT_OIL_COLUMN]], on='Date', how='left').fillna(method=FILL_METHOD)
  aggregated_df = aggregated_df.merge(wti_df[[DATE_COLUMN, WTI_OIL_COLUMN]], on='Date', how='left').fillna(method=FILL_METHOD)


Unnamed: 0,Date,Value,Std. Quantity (KG),Gross Weight,Std. Unit Rate ($/KG),ship_count,popular_port,popular_port_count,Price,Brent Oil Value,WTI Oil Value
0,2020-01-01,1.217433e+08,4.147777e+06,42055.0,92.545195,,,,,67.77,61.14
1,2020-01-02,9.020977e+04,2.300000e+03,0.0,39.903707,,,,32355.0,67.05,61.17
2,2020-01-03,4.110707e+02,1.000000e+01,0.0,41.107069,,,,31850.0,69.08,63.00
3,2020-01-04,5.637466e+04,1.000000e+03,0.0,56.374664,,,,31850.0,69.08,63.00
4,2020-01-05,4.051522e+04,1.140000e+03,0.0,35.644705,,,,31850.0,69.08,63.00
...,...,...,...,...,...,...,...,...,...,...,...
1302,2023-12-25,8.913675e+04,3.000000e+03,0.0,30.255888,8925.0,USMSY,2673.0,28819.0,78.89,72.16
1303,2023-12-26,1.160515e+05,3.000000e+03,0.0,38.338660,8925.0,USMSY,2673.0,28819.0,78.89,72.16
1304,2023-12-27,5.611000e+04,1.841780e+03,0.0,46.549676,8925.0,USMSY,2673.0,28792.5,78.89,72.16
1305,2023-12-28,2.256061e+05,4.446500e+03,0.0,76.015813,8925.0,USMSY,2673.0,28787.5,78.89,72.16


In [12]:
def detect_spikes(df, column, window_size):
    ## Detecting spikes
    moving_avg = df[column].rolling(window=window_size).mean()
    std_dev = df[column].rolling(window=window_size).std()

    # Set a threshold to identify spikes
    return (abs(aggregated_df[column] - moving_avg) > SPIKES_THRESHOLD * std_dev).astype(int)

# aggregated_df['spikes'] = detect_spikes(aggregated_df, 'Price')
# print("SPIKES : NON SPIKES = ")
# print(aggregated_df['spikes'].value_counts())
# print("PERCENT OF SPIKES", aggregated_df['spikes'].value_counts()[1]/len(aggregated_df))

# **Detect spikes**

In [13]:
# aggregated_df[VALUE_SPIKES_COLUMN] = detect_spikes(aggregated_df, VALUE_COLUMN)
# aggregated_df[QUANTITY_SPIKES_COLUMN] = detect_spikes(aggregated_df, QUANTITY_COLUMN)
# aggregated_df[UNIT_RATE_SPIKES_COLUMN] = detect_spikes(aggregated_df, UNIT_RATE_COLUMN)
# aggregated_df[WTI_OIL_SPIKES_COLUMN] = detect_spikes(aggregated_df, WTI_OIL_COLUMN)
# aggregated_df[BRENT_OIL_SPIKES_COLUMN] = detect_spikes(aggregated_df, BRENT_OIL_COLUMN)
# aggregated_df[SHIP_COUNT_SPIKES_COLUMN] = detect_spikes(aggregated_df, SHIP_COUNT_COLUMN)
# aggregated_df[PORT_COUNT_SPIKES_COLUMN] = detect_spikes(aggregated_df, PORT_COUNT_COLUMN)

# #Visualise Dataset

# # Plotting the graph
# fig, ax1 = plt.subplots(figsize=(12, 6))

# # Plotting 'Value', 'Quantity', and 'Gross Weight' on the left y-axis
# ax1.plot(aggregated_df.index, aggregated_df[VALUE_SPIKES_COLUMN], label='Value Spikes', color='b')
# ax1.plot(aggregated_df.index, aggregated_df[QUANTITY_SPIKES_COLUMN], label='Quantity Spikes', color='g')
# ax1.plot(aggregated_df.index, aggregated_df[UNIT_RATE_SPIKES_COLUMN], label='Unit Rate Spikes', color='k')
# ax1.plot(aggregated_df.index, aggregated_df[BRENT_OIL_SPIKES_COLUMN], label='Brent Oil Value Spikes', color='m')
# ax1.plot(aggregated_df.index, aggregated_df[WTI_OIL_SPIKES_COLUMN], label='WTI Oil Value Spikes', color='c')
# ax1.plot(aggregated_df.index, aggregated_df[SHIP_COUNT_COLUMN], label='Ship Count Spikes', color='darkorange')
# ax1.plot(aggregated_df.index, aggregated_df[PORT_COUNT_COLUMN], label='Port Count Value Spikes', color='violet')

# ax1.set_xlabel('Date')
# ax1.set_ylabel('Value / Quantity / Gross Weight / Brent Oil Value / WTI Oil Value / Ship Count / Port Count ', color='b')
# ax1.tick_params('y', colors='b')

# # Creating a second y-axis for 'Price'
# ax2 = ax1.twinx()
# ax2.plot(aggregated_df.index, aggregated_df['Price'], label='Price', color='orange')
# ax2.set_ylabel('Price', color='orange')
# ax2.tick_params('y', colors='orange')

# # Display legend
# fig.tight_layout()
# fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

# # Display the graph
# # plt.show()

In [14]:
#remove date 2020-01-01
aggregated_df = aggregated_df[aggregated_df.index != '2020-01-01']
aggregated_df

Unnamed: 0,Date,Value,Std. Quantity (KG),Gross Weight,Std. Unit Rate ($/KG),ship_count,popular_port,popular_port_count,Price,Brent Oil Value,WTI Oil Value
0,2020-01-01,1.217433e+08,4.147777e+06,42055.0,92.545195,,,,,67.77,61.14
1,2020-01-02,9.020977e+04,2.300000e+03,0.0,39.903707,,,,32355.0,67.05,61.17
2,2020-01-03,4.110707e+02,1.000000e+01,0.0,41.107069,,,,31850.0,69.08,63.00
3,2020-01-04,5.637466e+04,1.000000e+03,0.0,56.374664,,,,31850.0,69.08,63.00
4,2020-01-05,4.051522e+04,1.140000e+03,0.0,35.644705,,,,31850.0,69.08,63.00
...,...,...,...,...,...,...,...,...,...,...,...
1302,2023-12-25,8.913675e+04,3.000000e+03,0.0,30.255888,8925.0,USMSY,2673.0,28819.0,78.89,72.16
1303,2023-12-26,1.160515e+05,3.000000e+03,0.0,38.338660,8925.0,USMSY,2673.0,28819.0,78.89,72.16
1304,2023-12-27,5.611000e+04,1.841780e+03,0.0,46.549676,8925.0,USMSY,2673.0,28792.5,78.89,72.16
1305,2023-12-28,2.256061e+05,4.446500e+03,0.0,76.015813,8925.0,USMSY,2673.0,28787.5,78.89,72.16


In [15]:
# #Visualise Dataset
# # Plotting the graph
# fig, ax1 = plt.subplots(figsize=(12, 6))

# # Plotting 'Value', 'Quantity', and 'Gross Weight' on the left y-axis
# ax1.plot(aggregated_df.index, aggregated_df[VALUE_COLUMN], label='Value', color='b')
# ax1.plot(aggregated_df.index, aggregated_df[QUANTITY_COLUMN], label='Quantity', color='g')
# ax1.plot(aggregated_df.index, aggregated_df[UNIT_RATE_COLUMN], label='Unit Rate', color='k')
# ax1.plot(aggregated_df.index, aggregated_df[BRENT_OIL_COLUMN], label='Brent Oil Value', color='m')
# ax1.plot(aggregated_df.index, aggregated_df[WTI_OIL_COLUMN], label='WTI Oil Value', color='c')
# ax1.plot(aggregated_df.index, aggregated_df[SHIP_COUNT_COLUMN], label='Ship Count Value', color='darkorange')
# ax1.plot(aggregated_df.index, aggregated_df[PORT_COUNT_COLUMN], label='Port Count Value', color='violet')

# ax1.set_xlabel('Date')
# ax1.set_ylabel('Value / Quantity / Gross Weight', color='b')
# ax1.tick_params('y', colors='b')

# # Creating a second y-axis for 'Price'
# ax2 = ax1.twinx()
# ax2.plot(aggregated_df.index, aggregated_df['Price'], label='Price', color='orange')
# ax2.set_ylabel('Price', color='orange')
# ax2.tick_params('y', colors='orange')

# # Display legend
# fig.tight_layout()
# fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

# # Display the graph
# # plt.show()

In [16]:
# # Plotting the price data
# plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
# plt.plot(aggregated_df.index, aggregated_df['Price'], label='Price', color='blue')

# # Highlighting spikes
# spike_indices = aggregated_df[aggregated_df['spikes'] == 1].index
# spike_prices = aggregated_df.loc[spike_indices, 'Price']
# plt.scatter(spike_indices, spike_prices, color='red', marker='^', label='Spikes')

# # Adding labels and title
# plt.xlabel('Date')
# plt.ylabel('Price')
# plt.title('Price Data with Spikes')
# plt.legend()

# # Display the plot
# # plt.show()

## Baseline

In [17]:
# # Count % of spikes 
# total_spikes = aggregated_df['spikes'].sum()
# total_data_points = len(aggregated_df)
# percentage_of_spikes = (total_spikes / total_data_points) * 100

# print(f"Percentage of Spikes: {percentage_of_spikes:.2f}%")

In [18]:
# from sklearn.metrics import precision_score, recall_score

# # Probability of spike
# spike_prob = aggregated_df['spikes'].mean()

# # Random baseline predictions
# random_predictions = np.random.choice([0, 1], size=len(aggregated_df), p=[1-spike_prob, spike_prob])

# # Calculate precision and recall for the random baseline
# random_precision = precision_score(aggregated_df['spikes'], random_predictions)
# random_recall = recall_score(aggregated_df['spikes'], random_predictions)

# print(f"Random Guessing Precision: {random_precision}")
# print(f"Random Guessing Recall: {random_recall}")


## Data Prep for Classification

In [19]:
COLUMNS = FEATURE_COLUMNS + [TARGET_COLUMN]
print(COLUMNS)

['Value', 'Std. Quantity (KG)', 'Std. Unit Rate ($/KG)', 'WTI Oil Value', 'Brent Oil Value', 'ship_count', 'popular_port_count', 'Price']


In [20]:
# # Discretize
# from sklearn.preprocessing import KBinsDiscretizer

# def discretize(df, columns, bins):
#     est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='kmeans')
#     df[columns] = est.fit_transform(df[columns])
#     return df

# # FEATURES_1 = [VALUE_COLUMN, QUANTITY_COLUMN, UNIT_RATE_COLUMN]
# # FEATURES_2 = [WTI_OIL_COLUMN, BRENT_OIL_COLUMN]

# test_df = aggregated_df.copy()
# test_df[FEATURE_COLUMNS] = test_df[FEATURE_COLUMNS].fillna(0)

# # test_df = discretize(test_df, FEATURES_1, 2)
# # test_df = discretize(test_df, FEATURES_2, BIN_COUNT)
# discretized_df = discretize(test_df, FEATURE_COLUMNS, BIN_COUNT)
# # discretized_df = test_df.copy()
# test_df.head(2)


In [21]:
# fig, ax1 = plt.subplots(figsize=(12, 6))

# # Plotting 'Value', 'Quantity', and 'Gross Weight' on the left y-axis
# ax1.plot(test_df.index, test_df[VALUE_COLUMN], label='Value', color='b')
# ax1.plot(test_df.index, test_df[QUANTITY_COLUMN], label='Quantity', color='g')
# ax1.plot(test_df.index, test_df[UNIT_RATE_COLUMN], label='Unit Rate', color='k')
# ax1.plot(test_df.index, test_df[BRENT_OIL_COLUMN], label='Brent Oil Value', color='m')
# ax1.plot(test_df.index, test_df[WTI_OIL_COLUMN], label='WTI Oil Value', color='c')
# ax1.plot(test_df.index, test_df[SHIP_COUNT_COLUMN], label='Ship Count Value', color='darkorange')
# ax1.plot(test_df.index, test_df[PORT_COUNT_COLUMN], label='Port Count Value', color='violet')

# ax1.set_xlabel('Date')
# ax1.set_ylabel('Value / Quantity / Gross Weight', color='b')
# ax1.tick_params('y', colors='b')

# # Creating a second y-axis for 'Price'
# ax2 = ax1.twinx()
# ax2.plot(test_df.index, test_df['Price'], label='Price', color='orange')
# ax2.set_ylabel('Price', color='orange')
# ax2.tick_params('y', colors='orange')

# # Display legend
# fig.tight_layout()
# fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

# # Display the graph
# plt.show()

In [22]:
# # Convert the discretized data into a DataFrame
# discretized_df = pd.DataFrame(discretized_df, columns=FEATURE_COLUMNS)

# unique_values = discretized_df[VALUE_COLUMN].fillna(method=FILL_METHOD).unique()
# print(unique_values)

# bin_counts = {col: discretized_df[col].value_counts() for col in FEATURE_COLUMNS}

# # Plotting
# plt.figure(figsize=(15, len(FEATURE_COLUMNS) * 5))

# for i, column in enumerate(FEATURE_COLUMNS):
#     plt.subplot(len(FEATURE_COLUMNS), 1, i + 1)
#     bin_counts[column].sort_index().plot(kind='bar', ax=plt.gca())

#     plt.title(f'{column} Distribution')
#     plt.ylabel('Frequency')
#     plt.xlabel('Bins')

# plt.tight_layout()
# plt.show()


In [23]:
# Clean up before passing to Arima
initial_row_count = aggregated_df.shape[0]

columns_of_interest = ['Price']  # Add other columns as necessary

aggregated_df = aggregated_df.dropna(subset=columns_of_interest)

rows_dropped = initial_row_count - aggregated_df.shape[0]

print(f"Rows dropped due to NaN values: {rows_dropped}")

Rows dropped due to NaN values: 1


## Train / Test Set Up

In [24]:
from sklearn.model_selection import train_test_split
import pmdarima as pm

# Fit an Auto ARIMA model to the 'Price' series
model = pm.auto_arima(aggregated_df['Price'], seasonal=True, m=12, suppress_warnings=True, stepwise=True, error_action='ignore')

# Forecast the series using the model (in-sample prediction)
forecast = model.predict_in_sample()

# Calculate residuals (difference between actual and forecasted values)
residuals = aggregated_df['Price'] - forecast

# Append residuals to DataFrame as a new feature (using residuals as a way to detect spike / anomaly)
aggregated_df = aggregated_df.copy()
aggregated_df['ARIMA_Residuals'] = residuals

In [25]:
# Trying out different window sizes
SPIKE_WINDOW_SIZES = [10, 20, 30, 40, 60, 80, 100]
results_dfs = []

for window_size in SPIKE_WINDOW_SIZES:
    print(f"Evaluating window size: {window_size}")

    aggregated_df['spikes'] = detect_spikes(aggregated_df, 'Price', window_size)

    # Prepare features and target
    FEATURE_COLUMNS = [TARGET_COLUMN, 'ARIMA_Residuals']  # Adjust as needed
    X, y = data_processing.prepare_features_and_target(aggregated_df, FEATURE_COLUMNS, 'spikes')

    # Split data 
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=False)

    # Scale features
    X_train_scaled, X_test_scaled = data_processing.scale_features(X_train_raw, X_test_raw)

    # Create sequences
    X_train, y_train = data_processing.create_sequences(X_train_scaled, y_train, window_size)
    X_test, y_test = data_processing.create_sequences(X_test_scaled, y_test, window_size)

    print(f'X train shape: {X_train.shape}')

    output_file_path = f'{COMMODITY} + arima/results_{window_size}.csv'
    pred_file_path = f'{COMMODITY} + arima/predictions/{window_size}'
    print(pred_file_path)
    results_df = models.evaluate_all(X_train, y_train, X_test, y_test, output_file_path, pred_file_path)
    results_df['Window Size'] = window_size
    results_dfs.append(results_df)

Evaluating window size: 10
X train shape: (1035, 10, 2)
cobalt + arima/predictions/10



Predictions saved to CSV file: cobalt + arima/predictions/10/LSTM_250_layers_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/10/LSTM_250_layers_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/10/LSTM_200_layers_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/10/LSTM_200_layers_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/10/LSTM_100_layers_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/10/LSTM_100_layers_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/10/LSTM_50_layers_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/10/LSTM_50_layers_predictions.npy


Predictions saved to CSV file: cobalt + arima/predictions/10/CNN_Attention_32_filters_7_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Predictions saved to CSV file: cobalt + arima/predictions/100/LSTM_100_layers_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/LSTM_100_layers_predictions.npy


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Predictions saved to CSV file: cobalt + arima/predictions/100/LSTM_50_layers_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/LSTM_50_layers_predictions.npy


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_Attention_32_filters_7_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_Attention_32_filters_7_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_Attention_32_filters_5_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_Attention_32_filters_5_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_Attention_32_filters_3_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_Attention_32_filters_3_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_Attention_64_filters_7_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_Attention_64_filters_7_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_Attention_64_filters_5_kernels

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_32_filters_7_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_32_filters_7_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_32_filters_5_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_32_filters_5_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_32_filters_3_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_32_filters_3_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_64_filters_7_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_64_filters_7_kernels_predictions.npy
Predictions saved to CSV file: cobalt + arima/predictions/100/CNN_64_filters_5_kernels_predictions.csv
Predictions saved to NPY file: cobalt + arima/predictions/100/CNN_64_filt

## Evaluation

In [26]:
# Display all results df
for idx, df in enumerate(results_dfs):
    print(f"Results for window size: {SPIKE_WINDOW_SIZES[idx]}")
    display(df) 

Results for window size: 10


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.940711,0.939759,1.0,0.968944,1.0,0.210526,0.347826,10
1,LSTM,200 layers,0.940711,0.939759,1.0,0.968944,1.0,0.210526,0.347826,10
2,LSTM,100 layers,0.940711,0.939759,1.0,0.968944,1.0,0.210526,0.347826,10
3,LSTM,50 layers,0.940711,0.939759,1.0,0.968944,1.0,0.210526,0.347826,10
4,CNN with Attention,"32 filters, kernel size 7",0.952569,0.95122,1.0,0.975,1.0,0.368421,0.538462,10
5,CNN with Attention,"32 filters, kernel size 5",0.964427,0.962963,1.0,0.981132,1.0,0.526316,0.689655,10
6,CNN with Attention,"32 filters, kernel size 3",0.976285,0.983051,0.991453,0.987234,0.882353,0.789474,0.833333,10
7,CNN with Attention,"64 filters, kernel size 7",0.952569,0.95122,1.0,0.975,1.0,0.368421,0.538462,10
8,CNN with Attention,"64 filters, kernel size 5",0.972332,0.970954,1.0,0.985263,1.0,0.631579,0.774194,10
9,CNN with Attention,"64 filters, kernel size 3",0.972332,0.978903,0.991453,0.985138,0.875,0.736842,0.8,10


Results for window size: 20


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.934156,0.950673,0.976959,0.963636,0.75,0.576923,0.652174,20
1,LSTM,200 layers,0.950617,0.947598,1.0,0.973094,1.0,0.538462,0.7,20
2,LSTM,100 layers,0.954733,0.951754,1.0,0.975281,1.0,0.576923,0.731707,20
3,LSTM,50 layers,0.950617,0.947598,1.0,0.973094,1.0,0.538462,0.7,20
4,CNN with Attention,"32 filters, kernel size 7",0.958848,0.955947,1.0,0.977477,1.0,0.615385,0.761905,20
5,CNN with Attention,"32 filters, kernel size 5",0.950617,0.951542,0.995392,0.972973,0.9375,0.576923,0.714286,20
6,CNN with Attention,"32 filters, kernel size 3",0.958848,0.955947,1.0,0.977477,1.0,0.615385,0.761905,20
7,CNN with Attention,"64 filters, kernel size 7",0.954733,0.951754,1.0,0.975281,1.0,0.576923,0.731707,20
8,CNN with Attention,"64 filters, kernel size 5",0.958848,0.955947,1.0,0.977477,1.0,0.615385,0.761905,20
9,CNN with Attention,"64 filters, kernel size 3",0.958848,0.955947,1.0,0.977477,1.0,0.615385,0.761905,20


Results for window size: 30


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.974249,0.981221,0.990521,0.985849,0.9,0.818182,0.857143,30
1,LSTM,200 layers,0.982833,0.985915,0.995261,0.990566,0.95,0.863636,0.904762,30
2,LSTM,100 layers,0.982833,0.981395,1.0,0.99061,1.0,0.818182,0.9,30
3,LSTM,50 layers,0.991416,0.99061,1.0,0.995283,1.0,0.909091,0.952381,30
4,CNN with Attention,"32 filters, kernel size 7",0.991416,0.995261,0.995261,0.995261,0.954545,0.954545,0.954545,30
5,CNN with Attention,"32 filters, kernel size 5",0.982833,0.990521,0.990521,0.990521,0.909091,0.909091,0.909091,30
6,CNN with Attention,"32 filters, kernel size 3",0.987124,0.990566,0.995261,0.992908,0.952381,0.909091,0.930233,30
7,CNN with Attention,"64 filters, kernel size 7",0.991416,0.995261,0.995261,0.995261,0.954545,0.954545,0.954545,30
8,CNN with Attention,"64 filters, kernel size 5",0.982833,0.990521,0.990521,0.990521,0.909091,0.909091,0.909091,30
9,CNN with Attention,"64 filters, kernel size 3",0.987124,0.990566,0.995261,0.992908,0.952381,0.909091,0.930233,30


Results for window size: 40


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.950673,0.979899,0.965347,0.972569,0.708333,0.809524,0.755556,40
1,LSTM,200 layers,0.986547,0.985366,1.0,0.992629,1.0,0.857143,0.923077,40
2,LSTM,100 layers,0.96861,0.980296,0.985149,0.982716,0.85,0.809524,0.829268,40
3,LSTM,50 layers,0.973094,0.975728,0.99505,0.985294,0.941176,0.761905,0.842105,40
4,CNN with Attention,"32 filters, kernel size 7",0.955157,0.984848,0.965347,0.975,0.72,0.857143,0.782609,40
5,CNN with Attention,"32 filters, kernel size 5",0.964126,0.985,0.975248,0.9801,0.782609,0.857143,0.818182,40
6,CNN with Attention,"32 filters, kernel size 3",0.973094,0.985149,0.985149,0.985149,0.857143,0.857143,0.857143,40
7,CNN with Attention,"64 filters, kernel size 7",0.964126,0.985,0.975248,0.9801,0.782609,0.857143,0.818182,40
8,CNN with Attention,"64 filters, kernel size 5",0.964126,0.985,0.975248,0.9801,0.782609,0.857143,0.818182,40
9,CNN with Attention,"64 filters, kernel size 3",0.964126,0.985,0.975248,0.9801,0.782609,0.857143,0.818182,40


Results for window size: 60


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.965517,0.97191,0.988571,0.98017,0.92,0.821429,0.867925,60
1,LSTM,200 layers,0.965517,0.97191,0.988571,0.98017,0.92,0.821429,0.867925,60
2,LSTM,100 layers,0.970443,0.977401,0.988571,0.982955,0.923077,0.857143,0.888889,60
3,LSTM,50 layers,0.960591,0.982659,0.971429,0.977011,0.833333,0.892857,0.862069,60
4,CNN with Attention,"32 filters, kernel size 7",0.960591,0.988304,0.965714,0.976879,0.8125,0.928571,0.866667,60
5,CNN with Attention,"32 filters, kernel size 5",0.965517,0.988372,0.971429,0.979827,0.83871,0.928571,0.881356,60
6,CNN with Attention,"32 filters, kernel size 3",0.955665,0.988235,0.96,0.973913,0.787879,0.928571,0.852459,60
7,CNN with Attention,"64 filters, kernel size 7",0.970443,0.988439,0.977143,0.982759,0.866667,0.928571,0.896552,60
8,CNN with Attention,"64 filters, kernel size 5",0.965517,0.988372,0.971429,0.979827,0.83871,0.928571,0.881356,60
9,CNN with Attention,"64 filters, kernel size 3",0.960591,0.988304,0.965714,0.976879,0.8125,0.928571,0.866667,60


Results for window size: 80


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.928962,0.949045,0.967532,0.958199,0.807692,0.724138,0.763636,80
1,LSTM,200 layers,0.89071,0.889535,0.993506,0.93865,0.909091,0.344828,0.5,80
2,LSTM,100 layers,0.770492,0.85443,0.876623,0.865385,0.24,0.206897,0.222222,80
3,LSTM,50 layers,0.754098,0.830303,0.88961,0.858934,0.055556,0.034483,0.042553,80
4,CNN with Attention,"32 filters, kernel size 7",0.918033,0.92638,0.980519,0.952681,0.85,0.586207,0.693878,80
5,CNN with Attention,"32 filters, kernel size 5",0.923497,0.954545,0.954545,0.954545,0.758621,0.758621,0.758621,80
6,CNN with Attention,"32 filters, kernel size 3",0.907104,0.930818,0.961039,0.945687,0.75,0.62069,0.679245,80
7,CNN with Attention,"64 filters, kernel size 7",0.923497,0.943038,0.967532,0.955128,0.8,0.689655,0.740741,80
8,CNN with Attention,"64 filters, kernel size 5",0.912568,0.948052,0.948052,0.948052,0.724138,0.724138,0.724138,80
9,CNN with Attention,"64 filters, kernel size 3",0.885246,0.913043,0.954545,0.933333,0.681818,0.517241,0.588235,80


Results for window size: 100


Unnamed: 0,Name,Params,Accuracy,Precision (0),Recall (0),F1 (0),Precision (1),Recall (1),F1 (1),Window Size
0,LSTM,250 layers,0.932515,0.926667,1.0,0.961938,1.0,0.541667,0.702703,100
1,LSTM,200 layers,0.852761,0.852761,1.0,0.92053,0.0,0.0,0.0,100
2,LSTM,100 layers,0.852761,0.852761,1.0,0.92053,0.0,0.0,0.0,100
3,LSTM,50 layers,0.852761,0.852761,1.0,0.92053,0.0,0.0,0.0,100
4,CNN with Attention,"32 filters, kernel size 7",0.90184,0.948905,0.935252,0.942029,0.653846,0.708333,0.68,100
5,CNN with Attention,"32 filters, kernel size 5",0.846626,0.87013,0.964029,0.914676,0.444444,0.166667,0.242424,100
6,CNN with Attention,"32 filters, kernel size 3",0.883436,0.9,0.971223,0.934256,0.692308,0.375,0.486486,100
7,CNN with Attention,"64 filters, kernel size 7",0.91411,0.956204,0.942446,0.949275,0.692308,0.75,0.72,100
8,CNN with Attention,"64 filters, kernel size 5",0.895706,0.917808,0.964029,0.940351,0.705882,0.5,0.585366,100
9,CNN with Attention,"64 filters, kernel size 3",0.920245,0.963235,0.942446,0.952727,0.703704,0.791667,0.745098,100


### **Random Under Sampling**

In [27]:
# from imblearn.under_sampling import RandomUnderSampler
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler


# time_series_df = aggregated_df.copy()

# # Drop rows with NaN in the 'spikes' column
# time_series_df = time_series_df.dropna(subset=['spikes'])
# discretized_df = discretize(time_series_df[FEATURE_COLUMNS], FEATURE_COLUMNS, BIN_COUNT)
# time_series_df[FEATURE_COLUMNS] = discretized_df

# # Extract features and target variable BEFORE creating sequences
# X = time_series_df[FEATURE_COLUMNS].values
# y = time_series_df['spikes'].values

# # Feature scaling using StandardScaler
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Apply RandomOverSampler BEFORE creating sequences
# random_under_sampler = RandomUnderSampler(random_state=RANDOM_STATE)
# X_scaled_resampled, y_resampled = random_under_sampler.fit_resample(X_scaled, y)

# # Recreate sequences with resampled data
# X_sequences_resampled, y_sequences_resampled = [], []
# for i in range(len(X_scaled_resampled) - SPIKES_WINDOW_SIZE + 1):
#     X_sequences_resampled.append(X_scaled_resampled[i:i + SPIKES_WINDOW_SIZE, :])
#     y_sequences_resampled.append(y_resampled[i + SPIKES_WINDOW_SIZE - 1])

# X_sequences_resampled, y_sequences_resampled = np.array(X_sequences_resampled), np.array(y_sequences_resampled)

# # Split the resampled sequences into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_sequences_resampled, y_sequences_resampled, test_size=0.2, random_state=50)

# # Proceed with model training and evaluation
# evaluate_all_models(RUS_OUTPUT_PATH)


### **Random Over Sampling**

In [28]:
# from imblearn.over_sampling import RandomOverSampler

# time_series_df = aggregated_df.copy()

# # Drop rows with NaN in the 'spikes' column
# time_series_df = time_series_df.dropna(subset=['spikes'])
# discretized_df = discretize(time_series_df[FEATURE_COLUMNS], FEATURE_COLUMNS, BIN_COUNT)
# time_series_df[FEATURE_COLUMNS] = discretized_df

# # Extract features and target variable BEFORE creating sequences
# X = time_series_df[FEATURE_COLUMNS].values
# y = time_series_df['spikes'].values

# # Feature scaling using StandardScaler
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Apply RandomUnderSampler BEFORE creating sequences
# random_over_sampler = RandomOverSampler(random_state=RANDOM_STATE)
# X_scaled_resampled, y_resampled = random_over_sampler.fit_resample(X_scaled, y)

# # Recreate sequences with resampled data
# X_sequences_resampled, y_sequences_resampled = [], []
# for i in range(len(X_scaled_resampled) - SPIKES_WINDOW_SIZE + 1):
#     X_sequences_resampled.append(X_scaled_resampled[i:i + SPIKES_WINDOW_SIZE, :])
#     y_sequences_resampled.append(y_resampled[i + SPIKES_WINDOW_SIZE - 1])

# X_sequences_resampled, y_sequences_resampled = np.array(X_sequences_resampled), np.array(y_sequences_resampled)

# # Split the resampled sequences into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_sequences_resampled, y_sequences_resampled, test_size=0.2, random_state=50)

# evaluate_all_models(ROS_OUTPUT_PATH)