In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load datasets
# Each CSV is read from the current working directory into its own DataFrame;
# the *_df names are reused by the cleaning and merging cells that follow.
exchange_rate_df = pd.read_csv('Exchange Rate.csv')
export_value_df = pd.read_csv('Export Value.csv')
fdi_inflow_df = pd.read_csv('FDI Inflow.csv')
fpi_df = pd.read_csv('FPI.csv')


In [None]:
# Drop unnecessary columns
# 'Unnamed: 0' is presumably a saved row index from an earlier export (TODO confirm).
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it once the column is gone no longer raises KeyError.
exchange_rate_df = exchange_rate_df.drop(columns=['Unnamed: 0'], errors='ignore')
export_value_df = export_value_df.drop(columns=['Unnamed: 0'], errors='ignore')
fdi_inflow_df = fdi_inflow_df.drop(columns=['Unnamed: 0'], errors='ignore')
fpi_df = fpi_df.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Converting exchange rate to yearly data using the average:
# one row per (Area, Year) holding the mean of every numeric column.
# numeric_only=True is explicit because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them as older versions did.
exchange_rate = (
    exchange_rate_df
    .groupby(['Area', 'Year'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Display the yearly-averaged exchange-rate frame
exchange_rate

In [None]:
# Converting FPI to yearly data using the average:
# one row per (Area, Year) holding the mean of every numeric column.
# numeric_only=True is explicit because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them as older versions did.
fpi = (
    fpi_df
    .groupby(['Area', 'Year'])
    .mean(numeric_only=True)
    .reset_index()
)
fpi.head(205)

In [None]:
# Write the yearly frames to CSV files (to_csv produces CSV, not an Excel workbook).
# index=False is not passed, so the DataFrame index is written as the first column.
exchange_rate.to_csv("Exchange Rate yearly.csv")
fpi.to_csv("FPI yearly.csv")

In [None]:
# Merging files
# Export-value rows are the left-hand base; the yearly exchange rate, FDI and
# FPI frames are left-joined onto them one after another by (Area, Year).
data = (
    export_value_df
    .merge(exchange_rate, how='left', on=['Area', 'Year'])
    .merge(fdi_inflow_df, how='left', on=['Area', 'Year'])
    .merge(fpi, how='left', on=['Area', 'Year'])
)

In [None]:
# Display the merged frame
data

In [None]:
# Save the merged frame to CSV (the index is written too, as index=False is not passed)
data.to_csv("merged data.csv")

In [None]:
# Column dtypes and non-null counts of the merged frame
data.info()

In [None]:
# Renaming column
# 'FDI_USDm' becomes 'FDI Inflow'; the later plotting and feature cells refer
# to the column by this new name.
data = data.rename(columns={'FDI_USDm': 'FDI Inflow'})

In [None]:
# Checking for entire duplicate rows
# duplicated() yields a boolean Series; summing it counts the rows flagged True.
duplicate_rows = data.duplicated()
print("Number of duplicate rows:", duplicate_rows.sum())

In [None]:
# checking for missing values
# missing_counts holds the number of null entries per column of the merged frame.
missing_counts = data.isnull().sum()
print(missing_counts)

In [None]:
# Filtering out columns that have no missing values
# (missing_counts is overwritten here, so from this point on it only lists
# the columns that contain at least one missing value)
missing_counts = missing_counts[missing_counts > 0]

# Creating a bar plot: one bar per remaining column, bar height = missing count
plt.figure(figsize=(6, 4))
missing_counts.plot(kind='bar')
plt.title('Figure . Missing Values Count by Column')
plt.xlabel('Columns')
plt.ylabel('Number of Missing Values')
plt.show()

In [None]:
# Get the initial shape of the dataset
# (kept so the effect of the row drop below can be reported)
initial_shape = data.shape
print(f'Initial dataset shape: {initial_shape}')

# Dropping rows with any missing values
# The result is stored under a new name (df); `data` itself is left untouched.
df = data.dropna()

# Getting the shape of the dataset after dropping missing values
final_shape = df.shape
print(f'Final dataset shape after dropping missing values: {final_shape}')

# Calculating the number of rows dropped
rows_dropped = initial_shape[0] - final_shape[0]
print(f'Number of rows dropped: {rows_dropped}')

# Calculating the percentage of data retained
# NOTE(review): divides by the initial row count, so an empty `data` would raise ZeroDivisionError.
percentage_retained = (final_shape[0] / initial_shape[0]) * 100
print(f'Percentage of data retained: {percentage_retained:.2f}%')

In [None]:
# Renumber the rows from 0 and discard the old index (drop=True)
df = df.reset_index(drop=True)

In [None]:
# Preview the first rows of the cleaned frame
df.head()

In [None]:
# Column dtypes and non-null counts of the cleaned frame
df.info()

In [None]:
# Show floats with 2 decimal places. This is a global pandas display option,
# so it also affects every DataFrame rendered after this cell.
pd.set_option('display.float_format', '{:.2f}'.format)
df.describe()

In [None]:
# Non-null count of every column, broken down by Item
df.groupby('Item').count()

In [None]:
# Checking highest export value from highest to lowest by region
# Export Value is summed per Area and only the 10 largest totals are kept.
ranking_export_value=df.groupby(['Area'],sort=True)['Export Value'].sum().nlargest(10)

# Creating a bar plot
# Horizontal bars: total export value on the x axis, Area names on the y axis.
sns.set(style="whitegrid")
plt.figure(figsize=(8, 4))
sns.barplot(x=ranking_export_value.values, y=ranking_export_value.index, palette="viridis")
plt.title('Top 10 Areas by Export Value', fontsize=16)
plt.xlabel('Export Value', fontsize=14)
plt.ylabel('Area', fontsize=14)
plt.show()

The United States has the highest total export value in the dataset.
Fruits and vegetables is the dominant crop category in the dataset: it is the highest-value export item in 4 countries.

In [None]:
# Highest export value by Item and Area
# Export Value is summed per (Item, Area) pair and the 10 largest totals are kept.
top_item_export_value = df.groupby(['Item','Area'],sort=True)['Export Value'].sum().nlargest(10)
top_item_export_value

In [None]:
# Boxplot of variables
plt.figure(figsize=(10, 10))

# One panel per column on a 3x2 grid, filled in the order listed here;
# each panel is titled with the column it shows.
boxplot_columns = ['Year', 'Export Value', 'Exchange Rate', 'FDI Inflow', 'FPI']
for panel_number, column in enumerate(boxplot_columns, start=1):
    plt.subplot(3, 2, panel_number)
    sns.boxplot(data=df[column])
    plt.title(column)

plt.show()

In [None]:
# Count missing values per column in df
df.isnull().sum()

In [None]:
# correlation matrix between variables
# Only int64/float64 columns are included; the pairwise correlations are drawn
# as a heatmap with each coefficient printed in its cell (annot=True).
num_cor = df.select_dtypes(['int64','float64']).corr()
sns.heatmap(num_cor,cmap = 'YlGnBu',annot = True)
plt.title('Heatmap')

In [None]:
# Column dtypes and non-null counts of df
df.info()

It can be seen that the correlations between the variables are mostly negative.

In [None]:
# Function to create labels (e.g. export value at t+3)
def create_labels(df, column_name, lead, groupby_columns=['Area']):
    """Add a lead column holding the value of `column_name` `lead` rows ahead.

    The shift is done separately inside each group defined by
    `groupby_columns`, so values never cross a group boundary; rows with no
    value `lead` rows ahead in their group get NaN.

    Args:
        df: DataFrame to extend. It is modified in place.
        column_name: Name of the column to look ahead in.
        lead: Number of rows to look ahead within each group.
        groupby_columns: Column(s) that define the groups.

    Returns:
        The same `df` object, with the new column `<column_name>_t+<lead>`.
    """
    label_column = f'{column_name}_t+{lead}'
    values_per_group = df.groupby(groupby_columns)[column_name]
    df[label_column] = values_per_group.shift(-lead)
    return df

# Creating label for export value three years ahead (t+3).
# The frame holds a separate series for every (Area, Item) pair, so the rows
# are first ordered by Year inside each pair and the lead is taken per pair.
# Grouping by Area alone would shift across the different items of a country
# and pair a row with another item's value instead of its own future value.
# NOTE: the shift counts rows, so it equals a 3-year lead only for series
# without missing years.
df = create_labels(
    df.sort_values(['Area', 'Item', 'Year']).reset_index(drop=True),
    'Export Value',
    lead=3,
    groupby_columns=['Area', 'Item'],
)

# Dropping rows with missing values in the label column
# (the last three rows of every series have no value three rows ahead)
df = df.dropna(subset=['Export Value_t+3'])
df

In [None]:
# Creating lagged features for the target and independent variables.
# Rows are ordered by Year inside each (Area, Item) series and the shift is
# taken per series. Grouping by Area alone would shift across the different
# items of a country, so "lag 1" would not be the previous year's value of
# the same series.
# NOTE: shift() counts rows, so a lag equals a year offset only for series
# without missing years.
panel_keys = ['Area', 'Item']
df = df.sort_values(panel_keys + ['Year'])

for feature in ['Export Value', 'Exchange Rate', 'FDI Inflow', 'FPI']:
    for lag in range(1, 4):
        df[f'{feature}_Lag_{lag}'] = df.groupby(panel_keys)[feature].shift(lag)


In [None]:
# Preview the frame with the lag columns added
df.head()

In [None]:
# dropping rows with missing values
# The lag columns are NaN wherever a group has no earlier row to shift from;
# those rows are removed and the index is renumbered from 0.
df = df.dropna()
df = df.reset_index(drop=True)
df

In [None]:
# Applying one-hot encoding to categorical variables
# get_dummies replaces 'Item' and 'Area' with float indicator columns;
# drop_first=True omits the first level of each variable.
# NOTE(review): OneHotEncoder is imported here but not used in this cell.
from sklearn.preprocessing import OneHotEncoder
df = pd.get_dummies(df,columns=['Item','Area'], dtype=float, drop_first=True)
df


In [None]:
# Recoding labels into two classes
# The median of the t+3 export value is the cut-off between the two classes.
threshold = df['Export Value_t+3'].median()


def recode_labels(value):
    """Return 0 when `value` is below the module-level `threshold`, otherwise 1."""
    return 0 if value < threshold else 1


df['Export Value_t+3_label'] = df['Export Value_t+3'].apply(recode_labels)
df

In [None]:
# features and target variable
# The binary label must be removed from X as well: leaving it among the
# features hands the model the answer (target leakage) and makes every
# reported score meaningless.
X = df.drop(columns=[
    'Export Value_t+3_label',  # the target itself
    'Export Value_t+3',        # raw value the label is derived from
    'Export Value', 'Year', 'Exchange Rate', 'FDI Inflow', 'FPI',
])  # features
y = df['Export Value_t+3_label']  # target

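The target `y` is the binary label derived from the export value three years ahead (`Export Value_t+3_label`). The raw `Export Value_t+3` column and the current-year `Export Value`, `Year`, `Exchange Rate`, `FDI Inflow` and `FPI` columns are dropped from `X`.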

In [None]:
# Shape of the feature matrix as (rows, columns).
# An earlier run reported 44180 rows and 195 columns; the numbers depend on the input data.
X.shape

In [None]:
# Scaling/Normalization
# StandardScaler standardises every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/validation/test split, so held-out rows contribute to the scaling
# statistics — confirm this is intended.
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
X_normalized=scaler.fit_transform(X)

In [None]:
# Randomly splitting data into train/test using 80:20 split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The unscaled features are split first so that the scaler can be fitted on
# the training rows only. Fitting it on every row before splitting would let
# validation/test statistics leak into the training features.
# With the same random_state and row count, the partition is the same one a
# split of the pre-scaled array would give.
X_rem, X_test, y_rem, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Further splitting the remaining data 80:20 into training (64%) and validation sets (16%)
X_train, X_val, y_train, y_val = train_test_split(X_rem, y_rem, test_size=0.2, random_state=42, shuffle=True)

# Scaling statistics come from the training set only and are then applied
# unchanged to every split; each X_* ends up as a NumPy array.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_rem = scaler.transform(X_rem)
X_test = scaler.transform(X_test)


In [None]:
# Report how many rows ended up in each of the three splits
print(f"Number of instances in training set: {X_train.shape[0]}")
print(f"Number of instances in test set: {X_test.shape[0]}")
print(f"Number of instances in validation set:{X_val.shape[0]}")

In [None]:
# Define the function to plot label distribution
def plot_label_distr(labels, plot_title):
    """Draw a histogram of `labels` in a new figure and show it.

    The histogram uses as many equal-width bins as there are distinct label
    values, spanning from 0.5 below the smallest to 0.5 above the largest
    value; the x ticks are placed at the distinct values.

    Args:
        labels: Array-like of class labels.
        plot_title: Title shown above the histogram.
    """
    plt.figure()
    class_values = np.unique(labels)
    histogram_range = (class_values[0] - 0.5, class_values[-1] + 0.5)
    plt.hist(labels, bins=len(class_values), range=histogram_range)
    plt.xticks(class_values)
    plt.title(plot_title)
    plt.show()
    print('\n')

# Plotting the label distributions for training, validation, and test sets
# (one histogram per split, in that order)
for split_name, split_labels in (('Training', y_train), ('Validation', y_val), ('Test', y_test)):
    plot_label_distr(split_labels, f'Class frequencies - {split_name} set')

In [None]:
##

# Multilayer Perceptron

Creating methods

In [None]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import random
import numpy

# Ensuring reproducibility
# Python's random module, PyTorch and NumPy are all seeded with the same value.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
numpy.random.seed(random_seed)

## Creating the network structure
class three_layer_MLP(nn.Module):
    """Fully connected network with two hidden layers and a softmax output.

    A ReLU follows each hidden layer. Without a non-linearity between them,
    the three stacked nn.Linear layers are equivalent to a single linear map
    and the hidden layers add no modelling capacity.

    Args:
        input_size: Number of input features per sample.
        hidden_layer_sizes: Sequence whose first two entries are the widths
            of the first and second hidden layer.
        output_size: Number of classes.
    """

    def __init__(self,
                 input_size,
                 hidden_layer_sizes,
                 output_size):
        super().__init__()
        self.hidden_l1 = nn.Linear(input_size, hidden_layer_sizes[0])
        self.hidden_l2 = nn.Linear(hidden_layer_sizes[0], hidden_layer_sizes[1])
        self.output_l3 = nn.Linear(hidden_layer_sizes[1], output_size)

    def forward(self, inputs):
        """Return class probabilities for a batch of samples.

        Args:
            inputs: Tensor with samples along dim 0 and features along dim 1.

        Returns:
            Tensor with one row per sample and one column per class; every
            row sums to 1 because softmax is applied over dim 1.
        """
        # torch.relu is used functionally, so the parameter names (and any
        # saved state_dict) stay exactly as before.
        out = torch.relu(self.hidden_l1(inputs))
        out = torch.relu(self.hidden_l2(out))
        out = self.output_l3(out)
        return torch.softmax(out, 1)

# A method for computing performance metrics of interest
def my_metrics(labels, predictions, show_confusion_matrix=False):
    """Compute per-class F1 scores and accuracy for a batch of predictions.

    Args:
        labels: True class ids, one per sample.
        predictions: Tensor of model outputs with one row per sample and one
            column per class; the predicted class is the column with the
            largest value.
        show_confusion_matrix: When True, also print and plot the confusion
            matrix.

    Returns:
        Tuple `(f1_scores, acc)`: the array of per-class F1 scores
        (average=None) and the overall accuracy.
    """
    # Highest-scoring column per row is the predicted class
    predicted_classes = numpy.argmax(predictions.detach().numpy(), axis=1)

    acc = accuracy_score(labels, predicted_classes)
    f1_scores = f1_score(labels, predicted_classes, average=None)

    if show_confusion_matrix:
        print("\n Confusion matrix:")
        matrix_display = ConfusionMatrixDisplay(confusion_matrix(labels, predicted_classes))
        matrix_display.plot()
        plt.show()

    return f1_scores, acc


# A class for managing the data for training the model
class MetDataset(Dataset):
    """Dataset pairing a float32 feature tensor with a long label tensor."""

    def __init__(self, feats, labels):
        """Convert `feats` and `labels` to tensors.

        Args:
            feats: Array-like of features, one row per sample.
            labels: Class ids, one per sample; a pandas Series is unwrapped
                to its underlying values first.
        """
        if isinstance(labels, pd.Series):
            labels = labels.values
        self.feats = torch.tensor(feats, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at position `idx`."""
        sample_feats = self.feats[idx, :]
        sample_label = self.labels[idx]
        return sample_feats, sample_label




Running model

In [None]:
from copy import deepcopy
# Creating an instance of the MLP network
# Input width = number of feature columns in the training set;
# two hidden layers of 10 units each; one output per class.
feature_count = X_train.shape[1]
hidden_layer_sizes = [10, 10]
class_count = 2
model = three_layer_MLP(feature_count, hidden_layer_sizes, class_count)


# Setting hyperparameters
num_epochs = 30
learning_rate = 0.05
batch_size = 50


# Setting up the data loading by batch
# With the test and validation sets having only one batch
# (their batch_size is the full dataset length)
# No shuffle argument is passed to any loader, so batches follow the stored row order.
train_set = MetDataset(X_train, y_train)
train_dataloader = DataLoader(train_set, batch_size=batch_size)

val_set = MetDataset(X_val, y_val)
val_dataloader = DataLoader(val_set, batch_size=len(val_set))

test_set = MetDataset(X_test, y_test)
test_dataloader = DataLoader(test_set, batch_size=len(test_set))



# Setting up the SGD optimizer for updating the model weights
optimizer = optim.SGD(model.parameters(), lr=learning_rate)


# The network's forward pass ends in softmax, so it returns class
# probabilities. nn.CrossEntropyLoss expects raw logits and would apply a
# second log-softmax on top of them, which distorts the loss and flattens the
# gradients. The cross entropy is therefore computed as NLLLoss on the log of
# the predicted probabilities.
loss_function = nn.NLLLoss()
LOG_EPS = 1e-12  # lower bound that keeps log() finite if a probability underflows to 0


best_model_acc = 0
losses = []

# Fallback copy, so best_model exists even if no epoch beats best_model_acc
# (or if num_epochs is 0).
best_model = deepcopy(model)

# Iterating over the dataset at two different stages:
# 1. Iterating over the batches in the dataset (inner for loop below)
# One complete set of iteration through the dataset (i.e. having gone over
# all batches in the dataset at least once) = One epoch
# 2. Iterating over the specified number of epochs (outer for loop below)
for epoch in range(0, num_epochs):

    # Setting the model to training mode
    model.train()

    for batch, (X_train_batch, y_train_batch) in enumerate(train_dataloader):

        # Zeroing out the `.grad` buffers,
        # otherwise on the backward pass we'll add the
        # new gradients to the old ones.
        optimizer.zero_grad()

        # Computing the forward pass and then the loss.
        # model(...) is used instead of model.forward(...) so that any
        # registered hooks run as well.
        train_pred = model(X_train_batch)
        train_loss = loss_function(torch.log(train_pred.clamp_min(LOG_EPS)), y_train_batch)
        train_avg_f1_score, train_acc = my_metrics(y_train_batch, train_pred)

        # Computing the model parameters' gradients
        # and propagating the loss backwards through the network.
        train_loss.backward()

        # Updating the model parameters using those gradients
        optimizer.step()

    # Evaluating on the validation set.
    # No gradients are needed here, so autograd tracking is switched off.
    model.eval()
    with torch.no_grad():
        for batch, (X_val_batch, y_val_batch) in enumerate(val_dataloader):
            val_pred = model(X_val_batch)
            val_loss = loss_function(torch.log(val_pred.clamp_min(LOG_EPS)), y_val_batch)
            val_avg_f1_score, val_acc = my_metrics(y_val_batch, val_pred)

    # Keep a snapshot of the weights with the best validation accuracy so far
    if val_acc > best_model_acc:
        best_model_acc = val_acc
        best_model = deepcopy(model)
        print('Found improvement in performance. New model saved.')

    # How well the network does on the batches
    # is an indication of how well training is progressing.
    # The train loss/accuracy shown are those of the last batch of the epoch.
    print("epoch: {} - train loss: {:.4f} train acc: {:.2f} val loss: {:.4f} val acc: {:.2f}".format(
        epoch,
        train_loss.item(),
        train_acc,
        val_loss.item(),
        val_acc ))

    losses.append([train_loss.item(), val_loss.item()])

# Continue with the snapshot that scored best on the validation set
model = best_model

# Testing model on the test set to get an estimate of its performance.
# First set the model to evaluation mode
model.eval()
# Accumulators for the per-sample results collected in the loop below
data_instance_ids = []
true_labels = []
model_predictions = []

# The test loader was built with batch_size=len(test_set), so this loop body runs once.
for batch, (X_test_batch, y_test_batch) in enumerate(test_dataloader):
  test_pred = model.forward(X_test_batch)
  test_f1_scores, test_accuracy = my_metrics(y_test_batch, test_pred, show_confusion_matrix=True)
  print("\n test accuracy: {:2.2f}".format(test_accuracy))
  test_pred_numpy = test_pred.detach().numpy()
  print('\n The F1 scores for each of the classes are: '+str(test_f1_scores))

  # Plot the per-epoch [train loss, validation loss] pairs recorded during training.
  # NOTE(review): `losses` is rebound from a list to a NumPy array here.
  print("\n Loss graph:")
  fig, ax = plt.subplots()
  losses = numpy.array(losses)
  ax.plot(losses[:, 0], 'b-', label='training loss')
  ax.plot(losses[:, 1], 'k-', label='validation loss')
  plt.legend(loc='upper right')

  # Ids are positions inside the batch; model_predictions receives one row of
  # model output per sample.
  instance_ids = range(len(y_test_batch))  # Adjust this if you have specific instance IDs
  data_instance_ids.extend(instance_ids)
  true_labels.extend(y_test_batch.numpy())
  model_predictions.extend(test_pred_numpy)


# Saving results to CSV
# NOTE(review): every 'Predicted value' entry is a whole row of model output
# (one score per class), not a single class id — confirm this is the intended format.
output_df = pd.DataFrame({
    'Data Instance ID': data_instance_ids,
    'True Label': true_labels,
    'Predicted value': model_predictions
})

# The file is written to the current working directory, without the DataFrame index.
output_csv_path = 'model_predictions.csv'
output_df.to_csv(output_csv_path, index=False, header=True)

print(f"Predictions saved to {output_csv_path}")

In [None]:
# @title
# Size of the label tensor from the last test batch (relies on the loop variable still being in scope)
y_test_batch.size()