In [None]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install scikit-learn
# %pip install tensorflow
# %pip install -U imbalanced-learn

In [None]:
import numpy as np
import pandas as pd

In [None]:
COMMODITY = 'magnesium'

DATE_COLUMN = 'Date'
VALUE_COLUMN = 'Value'  
QUANTITY_COLUMN = 'Std. Quantity (KG)'
UNIT_RATE_COLUMN = 'Std. Unit Rate ($/KG)'
BRENT_OIL_COLUMN = 'Brent Oil Value'
WTI_OIL_COLUMN = 'WTI Oil Value'

VALUE_SPIKES_COLUMN = 'Value Spikes'  
QUANTITY_SPIKES_COLUMN = 'Std. Quantity (KG) Spikes'
UNIT_RATE_SPIKES_COLUMN = 'Std. Unit Rate ($/KG) Spikes'
BRENT_OIL_SPIKES_COLUMN = 'Brent Oil Value'
WTI_OIL_SPIKES_COLUMN = 'WTI Oil Value'

ORIGIN_COUNTRY_COLUMN = 'Country of Origin'
DEST_COUNTRY_COLUMN = 'Country of Destination'

PETROL_FILE_PATH = '../volza/petroleum/petrol_crude_oil_spot_price.csv'
VOLZA_FILE_PATH = '../volza/magnesium/magnesium.csv'
PRICE_FILE_PATH = "../volza/magnesium/magnesium_price_2.csv"

SPIKES_THRESHOLD = 2
SPIKES_WINDOW_SIZE = 20

RANDOM_STATE = 42

In [None]:
# Only keep rows where we have usable quantity units (kg, ton) and standardizing it.
def convert_to_kg(df, quantity_col='Std. Quantity', unit_col='Std. Unit'):
    converstion_factors = {
        'TON': 907.185,
        'TNE': 1000,
        'KGS': 1,
        'Kgs': 1,
    }

    df_filtered = df[df[unit_col].isin(converstion_factors.keys())]

    def convert(row):
        unit = row[unit_col]
        quantity = row[quantity_col]
        return quantity * converstion_factors.get(unit,1)
    
    df_filtered = df_filtered[df_filtered[VALUE_COLUMN] != 0]
    df_filtered[QUANTITY_COLUMN] = df_filtered.apply(convert, axis=1)
    df_filtered = df_filtered[df_filtered[QUANTITY_COLUMN] != 0]

    df_filtered[UNIT_RATE_COLUMN] = df_filtered[VALUE_COLUMN] / df_filtered[QUANTITY_COLUMN]

    return df_filtered

## Spike detection

In [None]:
from datetime import datetime

#Formatting the date and price for Volza data
volza_pd = pd.read_csv(VOLZA_FILE_PATH)
volza_pd = volza_pd[(volza_pd["Country of Origin"].notnull()) & (volza_pd["Country of Destination"].notnull())]
volza_pd = volza_pd.rename(columns={'Unnamed: 0': 'ID'})
volza_pd['Date'] = volza_pd['Date'].apply(lambda x: x.split(' ')[0])
volza_pd['Date'] = pd.to_datetime(volza_pd['Date'], errors='raise', format='%Y-%m-%d')
volza_pd = convert_to_kg(volza_pd)
volza_pd.head(3)

In [None]:
#Preprocessing the price data
prices_pd = pd.read_csv(PRICE_FILE_PATH)
prices_pd['Date'] = prices_pd['Date'].apply(lambda x: datetime.strptime(x, "%b %d, %Y").strftime("%Y-%m-%d"))
prices_pd['Date'] = pd.to_datetime(prices_pd['Date'], errors='raise', format='%Y-%m-%d')
prices_pd['Price'] = prices_pd['Price'].str.replace(',', '').astype(float)
prices_pd = prices_pd[['Date','Price']]
prices_pd.head(3)

In [None]:
#Aggregate volza data by day
date_wise_volza = volza_pd.groupby("Date")[[VALUE_COLUMN,QUANTITY_COLUMN,'Gross Weight']].sum()

In [None]:
# Avg of Commodity Price in Volza
avg_price_volza = volza_pd.groupby('Date')[UNIT_RATE_COLUMN].mean()
date_wise_volza = date_wise_volza.join(avg_price_volza, how='left')
date_wise_volza

In [None]:
# Petroleum data prep
petrol_df = pd.read_csv(PETROL_FILE_PATH, delimiter=';', on_bad_lines='warn')
petrol_df['Date'] = pd.to_datetime(petrol_df['Date'])

# Split based on types of oil
brent_df = petrol_df[petrol_df['product-name']=='UK Brent Crude Oil']
wti_df = petrol_df[petrol_df['product-name']=='WTI Crude Oil']

brent_df.rename(columns={'Value':'Brent Oil Value'}, inplace=True)
wti_df.rename(columns={'Value':'WTI Oil Value'}, inplace=True)


In [None]:
# Combining dataframes
prices_pd = prices_pd.set_index('Date')
aggregated_df = date_wise_volza.join(prices_pd, how="left").fillna(method='ffill')
aggregated_df = aggregated_df.merge(brent_df[[DATE_COLUMN, BRENT_OIL_COLUMN]], on='Date', how='left').fillna(method='ffill')
aggregated_df = aggregated_df.merge(wti_df[[DATE_COLUMN, WTI_OIL_COLUMN]], on='Date', how='left').fillna(method='ffill')

In [None]:
def detect_spikes(df, column):
    ## Detecting spikes
    moving_avg = df[column].rolling(window=SPIKES_WINDOW_SIZE).mean()
    std_dev = df[column].rolling(window=SPIKES_WINDOW_SIZE).std()

    # Set a threshold to identify spikes
    return (abs(aggregated_df[column] - moving_avg) > SPIKES_THRESHOLD * std_dev).astype(int)

aggregated_df['spikes'] = detect_spikes(aggregated_df, 'Price')
print("SPIKES : NON SPIKES = ")
print(aggregated_df['spikes'].value_counts())
print("PERCENT OF SPIKES", aggregated_df['spikes'].value_counts()[1]/len(aggregated_df))

# **Detect spikes**

In [None]:
aggregated_df[VALUE_SPIKES_COLUMN] = detect_spikes(aggregated_df, VALUE_COLUMN)
aggregated_df[QUANTITY_SPIKES_COLUMN] = detect_spikes(aggregated_df, QUANTITY_COLUMN)
aggregated_df[UNIT_RATE_SPIKES_COLUMN] = detect_spikes(aggregated_df, UNIT_RATE_COLUMN)
aggregated_df[WTI_OIL_SPIKES_COLUMN] = detect_spikes(aggregated_df, WTI_OIL_COLUMN)
aggregated_df[BRENT_OIL_SPIKES_COLUMN] = detect_spikes(aggregated_df, BRENT_OIL_COLUMN)

#Visualise Dataset

import matplotlib.pyplot as plt
# Plotting the graph
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plotting 'Value', 'Quantity', and 'Gross Weight' on the left y-axis
ax1.plot(aggregated_df.index, aggregated_df[VALUE_SPIKES_COLUMN], label='Value Spikes', color='b')
ax1.plot(aggregated_df.index, aggregated_df[QUANTITY_SPIKES_COLUMN], label='Quantity Spikes', color='g')
ax1.plot(aggregated_df.index, aggregated_df[UNIT_RATE_SPIKES_COLUMN], label='Unit Rate Spikes', color='k')
ax1.plot(aggregated_df.index, aggregated_df[BRENT_OIL_SPIKES_COLUMN], label='Brent Oil Value Spikes', color='m')
ax1.plot(aggregated_df.index, aggregated_df[WTI_OIL_SPIKES_COLUMN], label='WTI Oil Value Spikes', color='c')

ax1.set_xlabel('Date')
ax1.set_ylabel('Value / Quantity / Gross Weight', color='b')
ax1.tick_params('y', colors='b')

# Creating a second y-axis for 'Price'
ax2 = ax1.twinx()
ax2.plot(aggregated_df.index, aggregated_df['Price'], label='Price', color='orange')
ax2.set_ylabel('Price', color='orange')
ax2.tick_params('y', colors='orange')

# Display legend
fig.tight_layout()
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

# Display the graph
plt.show()

In [None]:
#remove date 2020-01-01
aggregated_df = aggregated_df[aggregated_df.index != '2020-01-01']
aggregated_df

In [None]:
#Visualise Dataset

import matplotlib.pyplot as plt
# Plotting the graph
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plotting 'Value', 'Quantity', and 'Gross Weight' on the left y-axis
ax1.plot(aggregated_df.index, aggregated_df[VALUE_COLUMN], label='Value', color='b')
ax1.plot(aggregated_df.index, aggregated_df[QUANTITY_COLUMN], label='Quantity', color='g')
ax1.plot(aggregated_df.index, aggregated_df[UNIT_RATE_COLUMN], label='Unit Rate', color='k')
ax1.plot(aggregated_df.index, aggregated_df[BRENT_OIL_COLUMN], label='Brent Oil Value', color='m')
ax1.plot(aggregated_df.index, aggregated_df[WTI_OIL_COLUMN], label='WTI Oil Value', color='c')
ax1.plot(aggregated_df.index, aggregated_df['Gross Weight'], label='Gross Weight', color='r')

ax1.set_xlabel('Date')
ax1.set_ylabel('Value / Quantity / Gross Weight', color='b')
ax1.tick_params('y', colors='b')

# Creating a second y-axis for 'Price'
ax2 = ax1.twinx()
ax2.plot(aggregated_df.index, aggregated_df['Price'], label='Price', color='orange')
ax2.set_ylabel('Price', color='orange')
ax2.tick_params('y', colors='orange')

# Display legend
fig.tight_layout()
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))

# Display the graph
plt.show()

In [None]:
# Plotting the price data
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
plt.plot(aggregated_df.index, aggregated_df['Price'], label='Price', color='blue')

# Highlighting spikes
spike_indices = aggregated_df[aggregated_df['spikes'] == 1].index
spike_prices = aggregated_df.loc[spike_indices, 'Price']
plt.scatter(spike_indices, spike_prices, color='red', marker='^', label='Spikes')

# Adding labels and title
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Price Data with Spikes')
plt.legend()

# Display the plot
plt.show()

## Baseline

In [None]:
# Count % of spikes 
total_spikes = aggregated_df['spikes'].sum()
total_data_points = len(aggregated_df)
percentage_of_spikes = (total_spikes / total_data_points) * 100

print(f"Percentage of Spikes: {percentage_of_spikes:.2f}%")

In [None]:
from sklearn.metrics import precision_score, recall_score

# Probability of spike
spike_prob = aggregated_df['spikes'].mean()

# Random baseline predictions
random_predictions = np.random.choice([0, 1], size=len(aggregated_df), p=[1-spike_prob, spike_prob])

# Calculate precision and recall for the random baseline
random_precision = precision_score(aggregated_df['spikes'], random_predictions)
random_recall = recall_score(aggregated_df['spikes'], random_predictions)

print(f"Random Guessing Precision: {random_precision}")
print(f"Random Guessing Recall: {random_recall}")




In [18]:

# Discretize
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, MaxPooling1D, Flatten, SimpleRNN
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.layers import Attention, Reshape
from tensorflow.keras.models import Model

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.layers import Attention, Reshape
from tensorflow.keras.models import Model

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [19]:
FEATURES = [VALUE_COLUMN, QUANTITY_COLUMN, UNIT_RATE_COLUMN, WTI_OIL_COLUMN, BRENT_OIL_COLUMN]

## Data Prep for Classification

In [20]:

for feature in FEATURES:
    feature_name = feature.replace('/', '')
    FEATURE_COLUMNS = FEATURES.copy()
    FEATURE_COLUMNS.remove(feature)

    def discretize(df, columns, bins):
        est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='kmeans')
        return est.fit_transform(df[columns])

    test_df = aggregated_df.copy()  # Assuming aggregated_df is your DataFrame
    discretized_df = discretize(test_df[FEATURE_COLUMNS], FEATURE_COLUMNS, 5)
    test_df[FEATURE_COLUMNS] = discretized_df
    test_df.head(2)
    # Convert the discretized data into a DataFrame
    discretized_df = pd.DataFrame(discretized_df, columns=FEATURE_COLUMNS)

    # Count the frequencies of each bin for each feature
    bin_counts = discretized_df.apply(pd.Series.value_counts).fillna(0).T

    time_series_df = aggregated_df.copy()

    # Drop rows with NaN in the 'spikes' column
    time_series_df = time_series_df.dropna(subset=['spikes'])
    discretized_df = discretize(time_series_df[FEATURE_COLUMNS], FEATURE_COLUMNS, 5)
    time_series_df[FEATURE_COLUMNS] = discretized_df

    # Extract features and target variable
    X = time_series_df[FEATURE_COLUMNS].values
    y = time_series_df['spikes'].values

    # Feature scaling using StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Create sequences for each sample with a window size of 20
    SPIKES_WINDOW_SIZE = 20
    X_sequences, y_sequences = [], []

    for i in range(len(X_scaled) - SPIKES_WINDOW_SIZE + 1):
        X_sequences.append(X_scaled[i:i + SPIKES_WINDOW_SIZE, :])
        y_sequences.append(y[i + SPIKES_WINDOW_SIZE - 1])

    X_sequences, y_sequences = np.array(X_sequences), np.array(y_sequences)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=50)
    def make_output_dict(name, params, classification_report):
        return {
            "Name": name,
            "Params": params,
            "Accuracy": classification_report["accuracy"],
            "Precision (0)": classification_report["0"]["precision"],
            "Recall (0)": classification_report["0"]["recall"],
            "F1 (0)": classification_report["0"]["f1-score"],
            "Precision (1)": classification_report["1"]["precision"],
            "Recall (1)": classification_report["1"]["recall"],
            "F1 (1)": classification_report["1"]["f1-score"],
        }

    output_dicts = []
    #LSTM Model
    def evaluate_lstm(num_layers: int):
        # Build the LSTM model
        model = Sequential()
        model.add(LSTM(num_layers, input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

        # Train the model
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=False)


        y_pred = (model.predict(X_test) > 0.5).astype(int)
        return make_output_dict(f"LSTM", f"{num_layers} layers", classification_report(y_test, y_pred, output_dict=True))


    def evaluate_rnn(num_units: int):
        # Build the RNN model
        model = Sequential()
        model.add(SimpleRNN(num_units, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

        # Train the model
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=False)

        # Predictions
        y_pred = (model.predict(X_test) > 0.5).astype(int)

        # Generate classification report
        return make_output_dict("RNN", f"{num_units} units", classification_report(y_test, y_pred, output_dict=True))
    def evaluate_cnn(num_filters: int, kernel_size: int):
        # Build the CNN model
        model = Sequential()
        model.add(Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(50, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

        # Train the model
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=False)

        # Predictions
        y_pred = (model.predict(X_test) > 0.5).astype(int)

        # Generate classification report
        return make_output_dict("CNN", f"{num_filters} filters, kernel size {kernel_size}", classification_report(y_test, y_pred, output_dict=True))



    def evaluate_random_forest(n_estimators):
        # Create a Random Forest Classifier
        random_forest = RandomForestClassifier(n_estimators=10, random_state=213)

        X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
        X_test_reshaped = X_test.reshape(X_test.shape[0], -1)

        # Train the classifier
        random_forest.fit(X_train_reshaped, y_train)

        # Make predictions on the test set
        y_pred = random_forest.predict(X_test_reshaped)

        return make_output_dict("Random Forest", f"{n_estimators} Estimators", classification_report(y_test, y_pred, output_dict=True))

    def create_attention_cnn_model(input_shape, num_classes, filters, kernel_size):
        inputs = tf.keras.Input(shape=input_shape)

        # CNN layers
        conv1 = Conv1D(filters, kernel_size=kernel_size, activation='relu')(inputs)
        pool1 = MaxPooling1D(pool_size=2)(conv1)

        # Reshape for attention
        reshape = Reshape((-1, 64))(pool1)

        # Attention mechanism
        attention = Attention()([reshape, reshape])

        # Flatten for fully connected layers
        flatten = Flatten()(attention)

        # Fully connected layers
        dense1 = Dense(128, activation='relu')(flatten)
        dropout = Dropout(0.5)(dense1)
        outputs = Dense(num_classes, activation='softmax')(dropout)

        # Create model
        model = Model(inputs=inputs, outputs=outputs)

        return model

    def evaluate_attention_cnn(filters, kernel_size):
        model = create_attention_cnn_model(X_train.shape[1:], 2, filters, kernel_size)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=100, batch_size=filters, verbose=False)
        y_pred = model.predict(X_test)
        return make_output_dict("CNN with Attention", f"{filters} filters, kernel size {kernel_size}", classification_report(y_test, y_pred.argmax(axis=1), output_dict=True))


    def create_cnn_model(input_shape, num_classes):
        inputs = tf.keras.Input(shape=input_shape)

        # CNN layers
        conv1 = Conv1D(32, kernel_size=3, activation='relu')(inputs)
        pool1 = MaxPooling1D(pool_size=2)(conv1)
        conv2 = Conv1D(64, kernel_size=3, activation='relu')(pool1)
        pool2 = MaxPooling1D(pool_size=2)(conv2)

        # Reshape for attention
        reshape = Reshape((-1, 64))(pool2)

        # Attention mechanism
        attention = Attention()([reshape, reshape])

        # Flatten for fully connected layers
        flatten = Flatten()(attention)

        # Fully connected layers
        dense1 = Dense(128, activation='relu')(flatten)
        dropout = Dropout(0.5)(dense1)
        outputs = Dense(num_classes, activation='softmax')(dropout)

        # Create model
        model = Model(inputs=inputs, outputs=outputs)

        return model

    def evaluate_all_models(output_file_name):
        output_dicts = []

        output_dicts.append(evaluate_lstm(100))

        # output_dicts.append(evaluate_random_forest(50))
        # output_dicts.append(evaluate_random_forest(25))
        # output_dicts.append(evaluate_random_forest(20))
        # output_dicts.append(evaluate_random_forest(10))
        # output_dicts.append(evaluate_random_forest(5))

        # output_dicts.append(evaluate_rnn(200))
        # output_dicts.append(evaluate_rnn(150))
        output_dicts.append(evaluate_rnn(100))
        output_dicts.append(evaluate_rnn(50))

        # output_dicts.append(evaluate_cnn(64, 3))
        output_dicts.append(evaluate_cnn(128, 3))
        # output_dicts.append(evaluate_cnn(256, 3))
        # output_dicts.append(evaluate_cnn(64, 5))
        output_dicts.append(evaluate_cnn(128, 5))
        # output_dicts.append(evaluate_cnn(256, 5))

        
        output_dicts.append(evaluate_attention_cnn(128, 5))

        output_dicts = pd.DataFrame(output_dicts)
        output_dicts.to_csv(output_file_name)
        output_dicts

    evaluate_all_models(f"ablation/{COMMODITY}_model_performance (No Balancing, Ablation {feature_name}).csv")
    # X_train_previous = X_train.reshape(X_train.shape[0], -1)
    # y_train_previous = y_train
    # X_test_previous = X_test.reshape(X_test.shape[0], -1)
    # y_test_previous = y_test

    # random_under_sampler = RandomUnderSampler(random_state=RANDOM_STATE)
    # X_train, y_train = random_under_sampler.fit_resample(X_train_previous, y_train_previous)

    # evaluate_all_models(f"ablation/{COMMODITY}_model_performance (Random Under Sampling, Ablate {feature_name}).csv")


    # random_under_sampler = RandomOverSampler(random_state=RANDOM_STATE)
    # X_train, y_train = random_under_sampler.fit_resample(X_train_previous, y_train_previous)

    # evaluate_all_models(f"ablation/{COMMODITY}_model_performance (Random Over Sampling, Ablate {feature_name}).csv")

  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj




  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj
  centers = km.fit(column[:, None]).cluster_centers_[:, 0]
  "decreasing the number of bins." % jj


