### **Importing the required libraries**

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader, random_split,  TensorDataset
import torch.nn as nn

# **Train Data**

# **(a) Data Preprocessing**

### **CGM Data Preprocessing**



In [None]:
data_train_cgm = pd.read_csv('cgm_train.csv')

In [None]:
print("Shape of Train data:", data_train_cgm.shape)

In [None]:
data_train_cgm.head()

In [None]:
# Converting into DateTime objects
data_train_cgm['Breakfast Time'] = pd.to_datetime(data_train_cgm['Breakfast Time'], errors = 'coerce')
data_train_cgm['Lunch Time'] = pd.to_datetime(data_train_cgm['Lunch Time'], errors = 'coerce')

#### **Handling Missing Values**

In [None]:
data_train_cgm.isnull().sum()

In [None]:
# Impute missing meal times by forward fill (each gap takes the value of the
# nearest preceding row). The result is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection is deprecated chained
# mutation and leaves the frame unchanged when copy-on-write is enabled.
# NOTE(review): the fill is not grouped by subject, so a gap in a subject's
# first row inherits the previous subject's meal time -- confirm this is intended.
data_train_cgm['Breakfast Time'] = data_train_cgm['Breakfast Time'].ffill()
data_train_cgm['Lunch Time'] = data_train_cgm['Lunch Time'].ffill()

In [None]:
data_train_cgm['CGM Data'] = data_train_cgm['CGM Data'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

print(type(data_train_cgm['CGM Data'][0]))
print(data_train_cgm['CGM Data'][0])

In [None]:
data_cgm_num_tuples_before = data_train_cgm['CGM Data'].apply(len).sum()
data_cgm_num_tuples_before

#### **Resampling and Interpolation of CGM Data**

In [None]:
# Resample every CGM trace onto a regular 5-minute grid; the glucose value at
# each grid point is linearly interpolated (in time) from the recorded readings.
subj_ids = data_train_cgm['Subject ID'].unique()
day_nums = data_train_cgm['Day'].unique()

# One record per (subject, day); the frame is built once after the loops
# because concatenating inside the loop is quadratic.
resampled_records = []

for subj_id in subj_ids:
  for day_num in day_nums:
    data_subj_id_day_num = data_train_cgm[(data_train_cgm['Subject ID'] == subj_id) & (data_train_cgm['Day'] == day_num)]

    if data_subj_id_day_num.empty:
      continue

    data_all_cgm_row = []

    for index, row in data_subj_id_day_num.iterrows():
      data_train_cgm_row = row['CGM Data']

      # Skip rows that carry no readings or contain a missing timestamp.
      if not isinstance(data_train_cgm_row, (list, tuple)) or not data_train_cgm_row:
        continue
      if any(pd.isna(item[0]) for item in data_train_cgm_row):
        continue

      data_train_cgm_row_df = pd.DataFrame(data_train_cgm_row, columns=['timestamp', 'glucose_level'])
      data_train_cgm_row_df['timestamp'] = pd.to_datetime(data_train_cgm_row_df['timestamp'], errors='coerce')
      data_train_cgm_row_df['glucose_level'] = pd.to_numeric(data_train_cgm_row_df['glucose_level'], errors='coerce')
      data_train_cgm_row_df = data_train_cgm_row_df.dropna(subset=['timestamp'])

      if data_train_cgm_row_df.empty:
        continue

      # Average readings that share a timestamp: reindexing needs a unique
      # index, and groupby also returns the series sorted by time.
      glucose_by_time = data_train_cgm_row_df.groupby('timestamp')['glucose_level'].mean()

      resampled_time_index = pd.date_range(start=glucose_by_time.index.min(), end=glucose_by_time.index.max(), freq='5min')

      if resampled_time_index.empty:
        continue

      # Interpolate over the union of recorded and grid timestamps so readings
      # that fall between grid points still shape the result, then keep only
      # the grid. (Reindexing straight onto the grid would drop them.)
      glucose_resampled = (
          glucose_by_time
          .reindex(glucose_by_time.index.union(resampled_time_index))
          .interpolate(method='time')
          .reindex(resampled_time_index))

      data_all_cgm_row.extend(zip(glucose_resampled.index, glucose_resampled))

    # Emit exactly one record per subject/day, after all of its rows have been
    # processed; subject/days without any usable readings are left out.
    if not data_all_cgm_row:
      continue

    resampled_records.append({
        'Subject ID': subj_id,
        'Day': day_num,
        'Breakfast Time': data_subj_id_day_num['Breakfast Time'].iloc[0],
        'Lunch Time': data_subj_id_day_num['Lunch Time'].iloc[0],
        'CGM Data': data_all_cgm_row})

data_train_cgm_resampled = pd.DataFrame(
    resampled_records,
    columns=['Subject ID', 'Day', 'Breakfast Time', 'Lunch Time', 'CGM Data'])

In [None]:
data_train_cgm = data_train_cgm_resampled

In [None]:
data_train_cgm

#### **Visualizing the change in glucose level values with time**

In [None]:
# Select the record for subject 1 on day 2 and split its trace into parallel
# timestamp / glucose sequences.
data_subj_id_day_num = data_train_cgm[(data_train_cgm['Subject ID'] == 1) & (data_train_cgm['Day'] == 2)]
data_cgm = data_subj_id_day_num['CGM Data'].iloc[0]
timestamps, glucose_vals = zip(*data_cgm)
timestamps = pd.to_datetime(timestamps, format='%Y-%m-%d %H:%M:%S')

time_breakfast = pd.to_datetime(data_subj_id_day_num['Breakfast Time'].iloc[0])
time_lunch = pd.to_datetime(data_subj_id_day_num['Lunch Time'].iloc[0])

# Glucose curve with one dotted vertical marker per meal.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(timestamps, glucose_vals, linestyle='-', color='b', label='Glucose Level')

meal_markers = (
    (time_breakfast, 'green', 'Breakfast Time'),
    (time_lunch, 'red', 'Lunch Time'),
)
for meal_time, meal_color, meal_label in meal_markers:
  ax.axvline(x = meal_time, color=meal_color, linestyle=':', label=meal_label)

# Show only the time of day on the x axis.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
plt.xticks(rotation=45)

ax.set_xlabel('Time')
ax.set_ylabel('Glucose Level')
ax.set_title('CGM Data for Subject ID 1, Day 2')
ax.grid(True)
fig.tight_layout()
ax.legend()

plt.show()

In [None]:
data_train_cgm_df = pd.DataFrame(data_train_cgm)

#### **Applying Standardization to the Glucose values**

In [None]:
# Pool every glucose reading of the training set to get one global mean / std.
all_glucose_vals = [reading[1]
                    for trace in data_train_cgm_df['CGM Data']
                    for reading in trace]

data_mean_glucose_vals = np.mean(all_glucose_vals)
data_std_dev_glucose_vals = np.std(all_glucose_vals)

def _standardize_trace(cgm_data):
  """Return the trace with each glucose value z-scored; timestamps are untouched."""
  standardized = []
  for timestamp, glucose_level in cgm_data:
    z_score = (glucose_level - data_mean_glucose_vals) / data_std_dev_glucose_vals
    standardized.append((timestamp, z_score))
  return standardized

data_train_cgm_df['CGM Data'] = data_train_cgm_df['CGM Data'].apply(_standardize_trace)

In [None]:
data_train_cgm_df

In [None]:
data_cgm_num_tuples_after = data_train_cgm_df['CGM Data'].apply(len).sum()
data_cgm_num_tuples_after

In [None]:
data_train_cgm_df = pd.DataFrame(data_train_cgm_df)

In [None]:
data_train_cgm_df.columns

#### **Converting the Breakfast time, Lunch time and Timestamps into Seconds after midnight**

In [None]:
def time_to_seconds(time):
  """Return the whole number of seconds between midnight and `time`.

  Only the hour, minute and second attributes of `time` are read.
  """
  since_midnight = timedelta(hours=time.hour, minutes=time.minute, seconds=time.second)
  # Floor-dividing by a one-second timedelta yields a plain int.
  return since_midnight // timedelta(seconds=1)

data_train_cgm_df['Breakfast Time'] = pd.to_datetime(data_train_cgm_df['Breakfast Time'])
data_train_cgm_df['Breakfast Time(Seconds after midnight)'] = data_train_cgm_df['Breakfast Time'].dt.time.apply(time_to_seconds)

data_train_cgm_df['Lunch Time'] = pd.to_datetime(data_train_cgm_df['Lunch Time'])
data_train_cgm_df['Lunch Time(Seconds after midnight)'] = data_train_cgm_df['Lunch Time'].dt.time.apply(time_to_seconds)

def process_tuple_list(tuple_list):
  """Replace the leading timestamp of every tuple with seconds after midnight.

  The remaining elements of each tuple are carried over unchanged.
  """
  converted = []
  for entry in tuple_list:
    seconds = time_to_seconds(pd.to_datetime(entry[0]).time())
    converted.append((seconds, *entry[1:]))
  return converted

data_train_cgm_df['CGM Data(seconds after midnight)'] = data_train_cgm_df['CGM Data'].apply(process_tuple_list)

print(data_train_cgm_df[['Breakfast Time(Seconds after midnight)', 'Lunch Time(Seconds after midnight)', 'CGM Data(seconds after midnight)']])

In [None]:
data_train_cgm_df

In [None]:
data_train_cgm_df.columns

In [None]:
data_train_cgm_df = data_train_cgm_df.drop(['Breakfast Time', 'Lunch Time', 'CGM Data'], axis=1)

In [None]:
data_train_cgm_df.columns

In [None]:
data_train_cgm_df.rename(columns={'Breakfast Time(Seconds after midnight)':'Breakfast Time', 'Lunch Time(Seconds after midnight)':'Lunch Time', 'CGM Data(seconds after midnight)': 'CGM Data'}, inplace=True)

In [None]:
data_train_cgm_df.columns

In [None]:
data_train_cgm_df['CGM Data'] = data_train_cgm_df['CGM Data'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
data_train_cgm_df['CGM Data'].apply(len)

#### **Applying Padding towards the end of CGM Data list (with zeros) to ensure uniform length for further processing**

In [None]:
data_train_cgm_max_len = (data_train_cgm_df['CGM Data'].apply(len)).max()

In [None]:
def data_train_cgm_list_pad(tuple_list, target_length, default_tuple=(0,0)):
  """Return a new list of exactly `target_length` tuples.

  Shorter inputs are right-padded with `default_tuple`; longer inputs are
  truncated to their first `target_length` entries so the result always has a
  uniform length. The input list is not modified.

  Args:
    tuple_list: sequence of tuples to pad.
    target_length: required output length (negative values are treated as 0).
    default_tuple: filler appended to short inputs.
  """
  target_length = max(int(target_length), 0)
  clipped = list(tuple_list[:target_length])
  return clipped + [default_tuple] * (target_length - len(clipped))

data_train_cgm_df['CGM_Data_padded'] = data_train_cgm_df['CGM Data'].apply(lambda x: data_train_cgm_list_pad(x, data_train_cgm_max_len))

In [None]:
data_train_cgm_df['CGM_Data_padded'].apply(len)

In [None]:
data_train_cgm_df

In [None]:
data_train_cgm_df = data_train_cgm_df.drop('CGM Data', axis=1)

In [None]:
data_train_cgm_df.rename(columns={'CGM_Data_padded': 'CGM Data'}, inplace = True)

In [None]:
data_train_cgm_df

In [None]:
data_train_cgm_preprocessed = data_train_cgm_df

### **DemoViome data preprocessing**

In [None]:
data_train_demo_viome = pd.read_csv("demo_viome_train.csv")
data_train_demo_viome.head()

In [None]:
data_train_demo_viome_df = pd.DataFrame(data_train_demo_viome)
print("Shape of Training DataFrame:", data_train_demo_viome_df.shape)

#### **Checking for Missing Values**

In [None]:
print("Number of Missing values in all features :")
data_train_demo_viome_df.isnull().sum()

#### **Performing One hot Encoding**

In [None]:
data_train_demo_viome_df_encoded = pd.get_dummies(data_train_demo_viome_df, columns=['Race','Diabetes Status'], drop_first=False)
data_train_demo_viome_df_encoded_filtered_columns = data_train_demo_viome_df_encoded.columns[data_train_demo_viome_df_encoded.columns.str.contains('Race|Diabetes Status')].tolist()

print("The following columns have been encoded:",data_train_demo_viome_df_encoded_filtered_columns)

In [None]:
data_train_demo_viome_df_encoded[data_train_demo_viome_df_encoded_filtered_columns] = data_train_demo_viome_df_encoded[data_train_demo_viome_df_encoded_filtered_columns].astype(int)

print("Data after One Hot Encoding:")
print(data_train_demo_viome_df_encoded)

#### **Separating all the PCA encoded Viome Data into different columns**

In [None]:
data_train_demo_viome_df_split = data_train_demo_viome_df_encoded['Viome'].str.split(',', expand=True)
data_train_demo_viome_df_split.columns = [f'Viome{i+1}' for i in range(data_train_demo_viome_df_split.shape[1])]
data_train_demo_viome_df = data_train_demo_viome_df_encoded.drop('Viome', axis=1).join(data_train_demo_viome_df_split)

print("Resulting DataFrame:\n", data_train_demo_viome_df)

#### **Performing Standardization**

In [None]:
scaler = StandardScaler()

# Identifier, gender and one-hot columns are left on their original scale.
one_hot_column_names = data_train_demo_viome_df.columns[data_train_demo_viome_df.columns.str.contains('Race|Diabetes Status')].tolist()
columns_not_to_standardize = ('Subject ID', 'Gender', *one_hot_column_names)

columns_to_standardize = list(filter(lambda col: col not in columns_not_to_standardize, data_train_demo_viome_df.columns))
print("The following columns are Standardized:", columns_to_standardize)

# Fit on the training features and write the scaled values into a copy, so the
# unscaled frame remains available.
standardized_values = scaler.fit_transform(data_train_demo_viome_df[columns_to_standardize])
data_train_demo_viome_df_standardized = data_train_demo_viome_df.copy()
data_train_demo_viome_df_standardized[columns_to_standardize] = standardized_values

print("DemoViome Data after Standardization:\n", data_train_demo_viome_df_standardized)

#### **Dropping unnecessary columns**

In [None]:
data_train_demo_viome_df_standardized = data_train_demo_viome_df_standardized.drop(['Weight','Height','Race_African American',
       'Race_Hispanic/Latino', 'Race_White','Viome11', 'Viome12', 'Viome13', 'Viome14', 'Viome15', 'Viome16',
       'Viome17', 'Viome18', 'Viome19', 'Viome20', 'Viome21', 'Viome22',
       'Viome23', 'Viome24', 'Viome25', 'Viome26', 'Viome27'], axis=1)

In [None]:
data_train_demo_viome_df_standardized.columns

In [None]:
data_train_demo_viome_preprocessed = data_train_demo_viome_df_standardized

### **Image data preprocessing**

In [None]:
data_train_image = pd.read_csv('img_train.csv')

In [None]:
data_train_image.columns

#### **Resizing and Normalizing**

In [None]:
def preprocess_image(image_path):
  """Decode a serialized pixel array, resize it to 128x128 and scale it by 1/255.

  Args:
    image_path: despite the name, this is not a file path but the text of a
      Python literal (nested lists of pixel values) taken from an image column.

  Returns:
    A float numpy array holding the resized image divided by 255.0.
  """
  # literal_eval accepts only Python literals, so a malformed or hostile cell
  # cannot execute code the way eval() would.
  image_array = np.array(ast.literal_eval(image_path), dtype=np.uint8)
  image = Image.fromarray(image_array)
  image = image.resize((128, 128))
  image = np.array(image) / 255.0
  return image

data_train_image['Image Before Breakfast'] = data_train_image['Image Before Breakfast'].apply(preprocess_image)
data_train_image['Image Before Lunch'] = data_train_image['Image Before Lunch'].apply(preprocess_image)

#### **Checking for Missing values**

In [None]:
print("Number of Missing values in all features:\n")
print(data_train_image.isnull().sum())

In [None]:
data_train_image_preprocessed = data_train_image

#  **(b) Data Preparation**

## **Merging data**

In [None]:
# Load the training labels; they are the left-hand side of every merge below.
data_labels = pd.read_csv('label_train.csv')

# Short aliases for the three preprocessed modality frames.
train_cgm = data_train_cgm_preprocessed
train_demoviome = data_train_demo_viome_preprocessed
train_image = data_train_image_preprocessed

# Inner joins: a row survives only if its key is present in every frame.
# CGM data is keyed by (Subject ID, Day); DemoViome only by Subject ID, so its
# values are repeated for each of the subject's days.
data_train_merged = pd.merge(data_labels, train_cgm, on=['Subject ID', 'Day'], how='inner')
data_train_merged = pd.merge(data_train_merged, train_demoviome, on=['Subject ID'], how='inner')

# Images are keyed by (Subject ID, Day), like the CGM data.
data_train_merged = pd.merge(data_train_merged, train_image, on=['Subject ID', 'Day'], how='inner')

In [None]:
data_train_merged.columns

## **Creating a Multi Modal Dataset and saving to a DataLoader**

In [None]:
class MultiModalDataset(Dataset):
  """Row-wise dataset over a merged frame that yields one tensor per modality.

  Each item is the tuple
  (cgm, demo_viome, image1, image2, meal_times, labels) built from a single
  row of `train_data`.
  """

  def __init__(self, train_data, train_data_cgm, train_data_demo_viome, train_data_image, train_data_meal_times, data_labels):
    """Store the frame and the column names that make up each modality.

    Args:
      train_data: DataFrame holding every modality, one sample per row.
      train_data_cgm: name of the column holding the list of CGM tuples.
      train_data_demo_viome: list of DemoViome feature column names.
      train_data_image: list with the two image column names.
      train_data_meal_times: list of meal-time column names.
      data_labels: list of label column names.
    """
    self.train_data = train_data
    self.train_data_cgm = train_data_cgm
    self.train_data_demo_viome = train_data_demo_viome
    self.train_data_image = train_data_image
    self.train_data_meal_times = train_data_meal_times
    self.data_labels = data_labels

  def __len__(self):
    """Number of samples, i.e. rows of the underlying frame."""
    return len(self.train_data)

  def __getitem__(self, idx):
    """Build the six tensors for the row at position `idx`."""
    # The CGM cell is a list of tuples; cgm_single_tensor flattens it to 1-D.
    train_data_cgm_item = self.train_data[self.train_data_cgm].iloc[idx]
    train_cgm_tensor = cgm_single_tensor(train_data_cgm_item)

    train_demo_viome_tensor = torch.tensor(self.train_data[self.train_data_demo_viome].iloc[idx].values, dtype=torch.float32)

    train_image1_tensor = self.image_processing(self.train_data[self.train_data_image[0]].iloc[idx])
    train_image2_tensor = self.image_processing(self.train_data[self.train_data_image[1]].iloc[idx])

    train_meal_times_tensor = torch.tensor(self.train_data[self.train_data_meal_times].iloc[idx].values, dtype=torch.float32)

    # Hand torch the underlying array (as for the other modalities) rather
    # than the Series: torch would otherwise walk the Series with integer
    # keys, which pandas deprecates for label-indexed Series.
    data_labels_tensor = torch.tensor(self.train_data[self.data_labels].iloc[idx].values, dtype=torch.float32)

    return train_cgm_tensor, train_demo_viome_tensor, train_image1_tensor, train_image2_tensor, train_meal_times_tensor,  data_labels_tensor

  def image_processing(self, image):
    """Convert a numpy image to a float tensor, channel-first where recognised.

    A 2-D (grayscale) image gets a channel axis and is repeated to 3 channels;
    an (H, W, 3) image is permuted to (3, H, W). Any other layout is returned
    as converted, without reordering.
    """
    image_tensor = torch.from_numpy(image).float()

    if image_tensor.dim() == 2:
      image_tensor = image_tensor.unsqueeze(0)

    if image_tensor.shape[0] == 1:
      image_tensor = image_tensor.repeat(3, 1, 1)

    if image_tensor.shape[0] != 3 and image_tensor.shape[2] == 3:
      image_tensor = image_tensor.permute(2, 0, 1)

    return image_tensor

def collate_fn(batch):
  """Stack a list of six-tensor samples into six batched tensors.

  Args:
    batch: list of samples, each a 6-tuple
      (cgm, demo_viome, image1, image2, meal_times, labels).

  Returns:
    A 6-tuple in the same order, each field stacked along a new leading axis.
  """
  # Transpose sample-major to field-major; unpacking enforces exactly six fields.
  cgm_items, demo_viome_items, image1_items, image2_items, meal_time_items, label_items = zip(*batch)
  per_field = (cgm_items, demo_viome_items, image1_items, image2_items, meal_time_items, label_items)
  return tuple(torch.stack(field_items) for field_items in per_field)

def cgm_single_tensor(row):
  """Flatten an iterable of tuples into one 1-D float32 tensor.

  Elements keep their order: all values of the first tuple, then the second,
  and so on.
  """
  flat_values = []
  for pair in row:
    flat_values.extend(pair)
  return torch.tensor(flat_values, dtype=torch.float32)

# Build the training dataset / dataloader and print the shapes of one batch.
# The names assigned here (dataset, dataloader, the column lists) are used by
# later cells.
if __name__ == "__main__":

  # Column names that make up each modality in the merged training frame.
  train_data_cgm = 'CGM Data'
  train_data_demo_viome = ['Age', 'Gender', 'A1C', 'Baseline Fasting Glucose', 'Insulin', 'Triglycerides', 'Cholesterol', 'HDL', 'Non-HDL', 'LDL', 'VLDL', 'CHO/HDL Ratio',
                         'HOMA-IR', 'BMI', 'Diabetes Status_1', 'Diabetes Status_2', 'Diabetes Status_3', 'Viome1', 'Viome2', 'Viome3', 'Viome4', 'Viome5', 'Viome6', 'Viome7', 'Viome8', 'Viome9', 'Viome10']
  train_data_image = ['Image Before Breakfast', 'Image Before Lunch']
  train_data_meal_times = ['Breakfast Time', 'Lunch Time']
  data_labels = ['Lunch Calories']

  dataset = MultiModalDataset(data_train_merged, train_data_cgm, train_data_demo_viome, train_data_image, train_data_meal_times, data_labels)
  # Batches are shuffled here; no random seed is set in this cell.
  dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

  # Sanity check: inspect the tensor shapes of the first batch only.
  for batch in dataloader:
    cgm_train, demo_viome_train, image1_train, image2_train,  meal_times_train, labels = batch

    print("Train CGM Data Shape:", cgm_train.shape)
    print("Train Demo_Viome Data Shape:", demo_viome_train.shape)
    print("Train Image 1 Data Shape:", image1_train.shape)
    print("Train Image 2 Data Shape:", image2_train.shape)
    print("Train Meal_Times Data Shape:", meal_times_train.shape)
    print("Data Labels Shape:", labels.shape)

    break

## **Extracting individual modality data from dataloader**


### **CGM Data**

In [None]:
# Re-create the loader without shuffling (and with the default collate) so the
# concatenated tensor follows the row order of the dataset.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Collect field 0 (CGM) of every batch. Each batch still builds all six
# modality tensors, so this pass also converts the images.
train_cgm_all = []
for batch in dataloader:
  cgm_train = batch[0]
  train_cgm_all.append(cgm_train)

# Join the per-batch tensors along the sample axis.
full_train_data_cgm = torch.cat(train_cgm_all, dim=0)

print("Shape of Complete CGM Data:", full_train_data_cgm.shape)
print(full_train_data_cgm)

### **DemoViome Data**

In [None]:
# Unshuffled loader so the rows stay in dataset order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Collect field 1 (DemoViome features) of every batch.
train_demo_viome_all = []
for batch in dataloader:
  demo_viome_train = batch[1]
  train_demo_viome_all.append(demo_viome_train)

# Join the per-batch tensors along the sample axis.
full_train_data_demo_viome = torch.cat(train_demo_viome_all, dim=0)

print("Shape of Complete DemoViome Data:", full_train_data_demo_viome.shape)
print(full_train_data_demo_viome)

### **Image 1**

In [None]:
# Unshuffled loader so the rows stay in dataset order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Collect field 2 (the first image column) of every batch.
train_image1_all = []
for batch in dataloader:
  image1_train = batch[2]
  train_image1_all.append(image1_train)

# Join the per-batch tensors along the sample axis.
full_train_data_image1 = torch.cat(train_image1_all, dim=0)

print("Shape of Complete Image1 Data:", full_train_data_image1.shape)
#print(full_train_data_image1)

### **Image 2**

In [None]:
# Unshuffled loader so the rows stay in dataset order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Collect field 3 (the second image column) of every batch.
train_image2_all = []
for batch in dataloader:
  image2_train = batch[3]
  train_image2_all.append(image2_train)

# Join the per-batch tensors along the sample axis.
full_train_data_image2 = torch.cat(train_image2_all, dim=0)

print("Shape of Complete Image2 Data:", full_train_data_image2.shape)
#print(full_train_data_image2)

### **Meal Times**

In [None]:
# Unshuffled loader so the rows stay in dataset order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Collect field 4 (meal times) of every batch.
train_meal_times_all = []
for batch in dataloader:
  meal_times_train = batch[4]
  train_meal_times_all.append(meal_times_train)

# Join the per-batch tensors along the sample axis.
full_train_meal_times = torch.cat(train_meal_times_all, dim=0)

print("Shape of Complete Meal_Times Data:", full_train_meal_times.shape)
#print(full_train_meal_times)

#  **(c) Multimodal Model Implementation**

## **Encoder Transformer - CGM Data Embeddings**

In [None]:
class EncoderTransformer(nn.Module):
  """Transformer encoder that turns a (batch, seq_len, input_dim) sequence into
  one embedding vector per sample by mean-pooling over the sequence axis.

  Args:
    input_dim: number of features per time step.
    embedding_dim: model width; also the size of the returned embedding.
    num_heads: attention heads per encoder layer.
    num_layers: number of stacked encoder layers.
    ff_dim: hidden size of each layer's feed-forward block.
    max_len: longest sequence the learned positional encoding supports.
  """

  def __init__(self, input_dim, embedding_dim, num_heads, num_layers, ff_dim, max_len=150):
    super(EncoderTransformer, self).__init__()
    self.embedding = nn.Linear(input_dim, embedding_dim)
    # Learned positional encoding, one row per time step, initialised to zero.
    self.positional_encoding = nn.Parameter(torch.zeros(max_len, embedding_dim))
    self.encoder_layers = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(
            d_model = embedding_dim,
            nhead = num_heads,
            dim_feedforward = ff_dim,
            dropout = 0.1,
            # forward() receives (batch, seq, emb). Without batch_first the
            # layer reads axis 0 as the sequence and would attend across
            # samples instead of across time steps.
            batch_first = True),
            num_layers = num_layers
        )
    self.pooling = nn.AdaptiveAvgPool1d(1)

  def forward(self, x):
    """Embed `x` of shape (batch, seq_len, input_dim) to (batch, embedding_dim).

    Raises:
      ValueError: if seq_len exceeds the positional encoding length.
    """
    seq_len = x.size(1)
    if seq_len > self.positional_encoding.size(0):
      raise ValueError(
          f"sequence length {seq_len} exceeds positional encoding length {self.positional_encoding.size(0)}")

    x = self.embedding(x) + self.positional_encoding[:seq_len]
    x = self.encoder_layers(x)
    # (batch, seq, emb) -> (batch, emb, seq) so the pooling averages over time.
    x = x.permute(0, 2, 1)
    x = self.pooling(x).squeeze(-1)
    return x

# Reshape the flattened CGM tensor to (samples, 150 steps, 2 values per step).
# NOTE(review): 150 must equal the padded sequence length produced during
# preprocessing, otherwise view() raises -- confirm against the data.
full_train_data_cgm = full_train_data_cgm.view(full_train_data_cgm.size(0), 150, 2)

# The whole training set goes through the encoder in one forward pass. The
# module is left in its default training mode and gradients are not disabled.
Transformer_CGM_Model = EncoderTransformer(input_dim=2, embedding_dim=64, num_heads=4, num_layers=2, ff_dim=128)
train_data_cgm_embeddings = Transformer_CGM_Model(full_train_data_cgm)
print("Shape of Train Data CGM Embeddings:", train_data_cgm_embeddings.shape)
#print(train_data_cgm_embeddings)

## **Fully Connected Neural Network - DemoViome Data Embeddings**

In [None]:
class FCNN(nn.Module):
  """Two-layer perceptron: input -> 64 hidden units (ReLU) -> embedding."""

  def __init__(self, input_size, embedding_size):
    """Create the layers.

    Args:
      input_size: number of input features.
      embedding_size: size of the output embedding.
    """
    super().__init__()
    self.fc1 = nn.Linear(input_size, 64)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(64, embedding_size)

  def forward(self, x):
    """Map (batch, input_size) features to (batch, embedding_size)."""
    hidden = self.relu(self.fc1(x))
    return self.fc2(hidden)

FCNN_DemoViome_Model = FCNN(input_size=27, embedding_size=128)
train_data_demo_viome_embeddings = FCNN_DemoViome_Model(full_train_data_demo_viome)

print("Shape of Train Data DemoViome Embeddings:", train_data_demo_viome_embeddings.shape)

## **Convolutional Neural Network - Image Data Embeddings**

In [None]:
class CNNModel(nn.Module):
  """Three conv/ReLU/max-pool stages followed by two ReLU-activated linear layers.

  fc1 expects 64 * 16 * 16 flattened features, which is what three 2x poolings
  leave of a 128x128 input.
  """

  def __init__(self):
    super().__init__()
    # Feature extractor: 3 -> 16 -> 32 -> 64 channels, spatial size preserved
    # by padding=1 and halved by each pooling step.
    self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
    self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
    self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)

    # Head: flatten, then 512 hidden units, then a 128-d embedding.
    self.flatten = nn.Flatten()
    self.fc1 = nn.Linear(64 * 16 * 16, 512)
    self.fc2 = nn.Linear(512, 128)

    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

  def forward(self, x):
    """Map a (batch, 3, H, W) image batch to (batch, 128) non-negative embeddings."""
    for conv in (self.conv1, self.conv2, self.conv3):
      x = self.pool(self.relu(conv(x)))

    x = self.flatten(x)
    x = self.relu(self.fc1(x))
    return self.relu(self.fc2(x))

CNN_Image_Model = CNNModel()

# Both image columns go through the same CNN instance, one full-dataset
# forward pass each.
train_data_image1_embeddings = CNN_Image_Model(full_train_data_image1)
train_data_image2_embeddings = CNN_Image_Model(full_train_data_image2)

# The second line previously repeated the "Image 1" label while reporting the
# image 2 shape.
print("Shape of Train Data Image 1 Embeddings:", train_data_image1_embeddings.shape)
print("Shape of Train Data Image 2 Embeddings:", train_data_image2_embeddings.shape)
#print(train_data_image1_embeddings)
#print(train_data_image2_embeddings)

### **Combining Image Embeddings**

In [None]:
concat_train_data_image_embeddings = torch.cat([train_data_image1_embeddings, train_data_image2_embeddings], dim=1)
print("Concatenated Image Embeddings Shape:", concat_train_data_image_embeddings.shape)

#### We originally concatenated all three embeddings, but the image embeddings did not prove useful, so we later used only the CGM and DemoViome embeddings

In [None]:
concat_train_data_embeddings = torch.cat([train_data_cgm_embeddings, train_data_demo_viome_embeddings], dim=1)
print("Shape of Embeddings(2 Modalities):", concat_train_data_embeddings.shape)

## **Joint Embedding**

In [None]:
# Final input to the dense layers
full_train_data_embeddings = torch.cat([concat_train_data_embeddings, full_train_meal_times], dim=1)
print("Shape of Entire Train Data Embeddings:", full_train_data_embeddings.shape)

### **Labels**

In [None]:
# Unshuffled loader so the labels line up row-for-row with the modality tensors
# extracted by the earlier unshuffled passes.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Collect field 5 (labels) of every batch.
data_labels_all = []
for batch in dataloader:
  label = batch[5]
  data_labels_all.append(label)

# Join the per-batch tensors along the sample axis.
full_labels_data = torch.cat(data_labels_all, dim=0)

print("Shape of Data Labels:", full_labels_data.shape)
#print(full_labels_data)

# **Test Data**

# **(a) Data preprocessing**

## **CGM Data Preprocessing**



In [None]:
data_test_cgm = pd.read_csv('cgm_test.csv')

In [None]:
print("Shape of Test Data:", data_test_cgm.shape)

In [None]:
data_test_cgm.head()

In [None]:
# Converting into DateTime Objects
data_test_cgm['Breakfast Time'] = pd.to_datetime(data_test_cgm['Breakfast Time'], errors = 'coerce')
data_test_cgm['Lunch Time'] = pd.to_datetime(data_test_cgm['Lunch Time'], errors = 'coerce')

#### **Handling Missing Values**

In [None]:
data_test_cgm.isnull().sum()

In [None]:
# Impute missing meal times by forward fill (each gap takes the value of the
# nearest preceding row). The result is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection is deprecated chained
# mutation and leaves the frame unchanged when copy-on-write is enabled.
# NOTE(review): the fill is not grouped by subject, so a gap in a subject's
# first row inherits the previous subject's meal time -- confirm this is intended.
data_test_cgm['Breakfast Time'] = data_test_cgm['Breakfast Time'].ffill()
data_test_cgm['Lunch Time'] = data_test_cgm['Lunch Time'].ffill()

In [None]:
data_test_cgm['CGM Data'] = data_test_cgm['CGM Data'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

print(type(data_test_cgm['CGM Data'][0]))
print(data_test_cgm['CGM Data'][0])

In [None]:
data_test_cgm_num_tuples_before = data_test_cgm['CGM Data'].apply(len).sum()
data_test_cgm_num_tuples_before

In [None]:
data_test_cgm

#### **Resampling and Interpolation of CGM Data**

In [None]:
# Resample every CGM trace onto a regular 5-minute grid; the glucose value at
# each grid point is linearly interpolated (in time) from the recorded readings.
subj_ids = data_test_cgm['Subject ID'].unique()
day_nums = data_test_cgm['Day'].unique()

# One record per (subject, day); the frame is built once after the loops
# because concatenating inside the loop is quadratic.
resampled_records = []

for subj_id in subj_ids:
  for day_num in day_nums:
    data_subj_id_day_num = data_test_cgm[(data_test_cgm['Subject ID'] == subj_id) & (data_test_cgm['Day'] == day_num)]

    if data_subj_id_day_num.empty:
      continue

    data_all_cgm_row = []

    for index, row in data_subj_id_day_num.iterrows():
      data_test_cgm_row = row['CGM Data']

      # Skip rows that carry no readings or contain a missing timestamp.
      if not isinstance(data_test_cgm_row, (list, tuple)) or not data_test_cgm_row:
        continue
      if any(pd.isna(item[0]) for item in data_test_cgm_row):
        continue

      data_test_cgm_row_df = pd.DataFrame(data_test_cgm_row, columns=['timestamp', 'glucose_level'])
      data_test_cgm_row_df['timestamp'] = pd.to_datetime(data_test_cgm_row_df['timestamp'], errors='coerce')
      data_test_cgm_row_df['glucose_level'] = pd.to_numeric(data_test_cgm_row_df['glucose_level'], errors='coerce')
      data_test_cgm_row_df = data_test_cgm_row_df.dropna(subset=['timestamp'])

      if data_test_cgm_row_df.empty:
        continue

      # Average readings that share a timestamp: reindexing needs a unique
      # index, and groupby also returns the series sorted by time.
      glucose_by_time = data_test_cgm_row_df.groupby('timestamp')['glucose_level'].mean()

      resampled_time_index = pd.date_range(start=glucose_by_time.index.min(), end=glucose_by_time.index.max(), freq='5min')

      if resampled_time_index.empty:
        continue

      # Interpolate over the union of recorded and grid timestamps so readings
      # that fall between grid points still shape the result, then keep only
      # the grid. (Reindexing straight onto the grid would drop them.)
      glucose_resampled = (
          glucose_by_time
          .reindex(glucose_by_time.index.union(resampled_time_index))
          .interpolate(method='time')
          .reindex(resampled_time_index))

      data_all_cgm_row.extend(zip(glucose_resampled.index, glucose_resampled))

    # Emit exactly one record per subject/day, after all of its rows have been
    # processed; subject/days without any usable readings are left out.
    if not data_all_cgm_row:
      continue

    resampled_records.append({
        'Subject ID': subj_id,
        'Day': day_num,
        'Breakfast Time': data_subj_id_day_num['Breakfast Time'].iloc[0],
        'Lunch Time': data_subj_id_day_num['Lunch Time'].iloc[0],
        'CGM Data': data_all_cgm_row})

data_test_cgm_resampled = pd.DataFrame(
    resampled_records,
    columns=['Subject ID', 'Day', 'Breakfast Time', 'Lunch Time', 'CGM Data'])

In [None]:
data_test_cgm = data_test_cgm_resampled

In [None]:
data_test_cgm

In [None]:
data_test_cgm_df = pd.DataFrame(data_test_cgm)

#### **Applying Standardization to the Glucose Values**

In [None]:
# All test glucose readings, kept for inspection only.
all_glucose_vals = [glucose_val
                      for data_cgm in data_test_cgm_df['CGM Data']
                      for timestamp, glucose_val in data_cgm]

# Standardize with the mean / std computed on the TRAINING glucose values
# (data_mean_glucose_vals and data_std_dev_glucose_vals from the train
# section). Recomputing them here would put train and test on different scales
# and overwrite the training statistics.
data_test_cgm_df['CGM Data'] = data_test_cgm_df['CGM Data'].apply(
    lambda cgm_data: [(timestamp, (glucose_level - data_mean_glucose_vals) / data_std_dev_glucose_vals)
                      for timestamp, glucose_level in cgm_data])

In [None]:
data_test_cgm_df

In [None]:
data_cgm_num_tuples_test_after = data_test_cgm_df['CGM Data'].apply(len).sum()
data_cgm_num_tuples_test_after

In [None]:
data_test_cgm_df =pd.DataFrame(data_test_cgm_df)

In [None]:
data_test_cgm_df.columns

#### **Converting the Breakfast time, Lunch time and Timestamps into Seconds after midnight**

In [None]:
def time_to_seconds(time):
  """Return the whole number of seconds between midnight and `time`.

  Only the hour, minute and second attributes of `time` are read.
  """
  since_midnight = timedelta(hours=time.hour, minutes=time.minute, seconds=time.second)
  # Floor-dividing by a one-second timedelta yields a plain int.
  return since_midnight // timedelta(seconds=1)

data_test_cgm_df['Breakfast Time'] = pd.to_datetime(data_test_cgm_df['Breakfast Time'])
data_test_cgm_df['Breakfast Time(Seconds after midnight)'] = data_test_cgm_df['Breakfast Time'].dt.time.apply(time_to_seconds)

data_test_cgm_df['Lunch Time'] = pd.to_datetime(data_test_cgm_df['Lunch Time'])
data_test_cgm_df['Lunch Time(Seconds after midnight)'] = data_test_cgm_df['Lunch Time'].dt.time.apply(time_to_seconds)

def process_tuple_list(tuple_list):
  """Replace the leading timestamp of every tuple with seconds after midnight.

  The remaining elements of each tuple are carried over unchanged.
  """
  converted = []
  for entry in tuple_list:
    seconds = time_to_seconds(pd.to_datetime(entry[0]).time())
    converted.append((seconds, *entry[1:]))
  return converted

data_test_cgm_df['CGM Data(seconds after midnight)'] = data_test_cgm_df['CGM Data'].apply(process_tuple_list)

print(data_test_cgm_df[['Breakfast Time(Seconds after midnight)', 'Lunch Time(Seconds after midnight)', 'CGM Data(seconds after midnight)']])

In [None]:
data_test_cgm_df

In [None]:
data_test_cgm_df.columns

In [None]:
data_test_cgm_df = data_test_cgm_df.drop(['Breakfast Time', 'Lunch Time', 'CGM Data'], axis=1)

In [None]:
data_test_cgm_df.columns

In [None]:
data_test_cgm_df.rename(columns={'Breakfast Time(Seconds after midnight)':'Breakfast Time', 'Lunch Time(Seconds after midnight)':'Lunch Time', 'CGM Data(seconds after midnight)': 'CGM Data'}, inplace=True)

In [None]:
data_test_cgm_df.columns

In [None]:
data_test_cgm_df['CGM Data'] = data_test_cgm_df['CGM Data'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
data_test_cgm_df['CGM Data'].apply(len)

#### **Applying Padding towards the end of CGM Data list (with zeros) to ensure uniform length for further processing**

In [None]:
# Test sequences are padded to the training length so train and test inputs
# have the same shape. The training frame's 'CGM Data' column already holds
# the padded lists at this point, so this maximum is the padded training length.
data_train_cgm_max_len = (data_train_cgm_df['CGM Data'].apply(len)).max()

In [None]:
def data_test_cgm_list_pad(tuple_list, target_length, default_tuple=(0,0)):
  """Return a new list of exactly `target_length` tuples.

  Shorter inputs are right-padded with `default_tuple`. Longer inputs are
  truncated to their first `target_length` entries, which matters here because
  the target is the training length and a test sequence may exceed it. The
  input list is not modified.

  Args:
    tuple_list: sequence of tuples to pad.
    target_length: required output length (negative values are treated as 0).
    default_tuple: filler appended to short inputs.
  """
  target_length = max(int(target_length), 0)
  clipped = list(tuple_list[:target_length])
  return clipped + [default_tuple] * (target_length - len(clipped))

data_test_cgm_df['CGM_Data_padded'] = data_test_cgm_df['CGM Data'].apply(lambda x: data_test_cgm_list_pad(x, data_train_cgm_max_len))

In [None]:
data_test_cgm_df['CGM_Data_padded'].apply(len)

In [None]:
data_test_cgm_df

In [None]:
data_test_cgm_df = data_test_cgm_df.drop('CGM Data', axis=1)

In [None]:
data_test_cgm_df.rename(columns={'CGM_Data_padded': 'CGM Data'}, inplace = True)

In [None]:
data_test_cgm_df

In [None]:
data_test_cgm_preprocessed = data_test_cgm_df

## **DemoViome data preprocessing**

In [None]:
data_test_demo_viome = pd.read_csv("demo_viome_test.csv")
data_test_demo_viome.head()

In [None]:
data_test_demo_viome_df = pd.DataFrame(data_test_demo_viome)
print("Shape of Test Data:", data_test_demo_viome_df.shape)

In [None]:
print("Number of Missing values in all features :")
data_test_demo_viome_df.isnull().sum()

### **Performing One hot Encoding**

In [None]:
data_test_demo_viome_df_encoded = pd.get_dummies(data_test_demo_viome_df, columns=['Race','Diabetes Status'], drop_first=False)
data_test_demo_viome_df_encoded_filtered_columns = data_test_demo_viome_df_encoded.columns[data_test_demo_viome_df_encoded.columns.str.contains('Race|Diabetes Status')].tolist()

print("The following columns have been encoded:",data_test_demo_viome_df_encoded_filtered_columns)

In [None]:
data_test_demo_viome_df_encoded[data_test_demo_viome_df_encoded_filtered_columns] = data_test_demo_viome_df_encoded[data_test_demo_viome_df_encoded_filtered_columns].astype(int)

print("Data after One Hot Encoding:")
print(data_test_demo_viome_df_encoded)

#### **Separating all the PCA encoded Viome Data into different columns**

In [None]:
data_test_demo_viome_df_split = data_test_demo_viome_df_encoded['Viome'].str.split(',', expand=True)
data_test_demo_viome_df_split.columns = [f'Viome{i+1}' for i in range(data_test_demo_viome_df_split.shape[1])]
data_test_demo_viome_df = data_test_demo_viome_df_encoded.drop('Viome', axis=1).join(data_test_demo_viome_df_split)

print("Resulting DataFrame:\n", data_test_demo_viome_df)

#### **Performing Standardization**

In [None]:
# Reuse the scaler fitted on the training DemoViome data (`scaler` from the
# train section) so test features are scaled with training statistics.
# Fitting a fresh scaler on the test set would put train and test on
# different scales.

columns_not_to_standardize = ('Subject ID', 'Gender') + tuple(data_test_demo_viome_df.columns[data_test_demo_viome_df.columns.str.contains('Race|Diabetes Status')].tolist())

columns_to_standardize = [col for col in data_test_demo_viome_df.columns if col not in columns_not_to_standardize]
print("The following columns are Standardized:", columns_to_standardize)

# Transform the columns in the order recorded at fit time when the scaler
# exposes it; otherwise fall back to the order derived from the test frame.
train_feature_order = list(getattr(scaler, 'feature_names_in_', columns_to_standardize))

data_test_demo_viome_df_standardized = data_test_demo_viome_df.copy()
data_test_demo_viome_df_standardized[train_feature_order] = scaler.transform(data_test_demo_viome_df[train_feature_order])

print("DemoViome Data after Standardization:\n", data_test_demo_viome_df_standardized)

#### **Dropping unnecessary columns**

In [None]:
data_test_demo_viome_df_standardized = data_test_demo_viome_df_standardized.drop(['Weight','Height','Race_African American',
       'Race_Hispanic/Latino', 'Race_White','Viome11', 'Viome12', 'Viome13', 'Viome14', 'Viome15', 'Viome16',
       'Viome17', 'Viome18', 'Viome19', 'Viome20', 'Viome21', 'Viome22',
       'Viome23', 'Viome24', 'Viome25', 'Viome26', 'Viome27'], axis=1)

In [None]:
data_test_demo_viome_df_standardized.columns

In [None]:
data_test_demo_viome_preprocessed = data_test_demo_viome_df_standardized

## **Image data preprocessing**

In [None]:
data_test_image=pd.read_csv('img_test.csv')

In [None]:
data_test_image.columns

#### **Resizing and Normalizing**

In [None]:
def preprocess_image(image_path):
  """Decode one stringified image, resize it to 128x128 and scale it by 1/255.

  Args:
    image_path: despite the name, this is not a file path but the text stored in
      the CSV cell; it is parsed as a Python literal (presumably a nested list of
      pixel values — confirm against the CSV) and converted to a uint8 array.

  Returns:
    A numpy array holding the resized image with values divided by 255.0.
  """
  # ast.literal_eval only accepts Python literals, so the content of a CSV cell
  # can never be executed as code (which the previous eval() call allowed).
  image_array = np.array(ast.literal_eval(image_path), dtype=np.uint8)
  image = Image.fromarray(image_array)
  image = image.resize((128, 128))
  image = np.array(image) / 255.0
  return image

# Replace the stringified images with preprocessed arrays, one image column at a time.
# The columns are overwritten in place, so this cell is meant to run once per load.
for image_column in ('Image Before Breakfast', 'Image Before Lunch'):
  data_test_image[image_column] = data_test_image[image_column].apply(preprocess_image)

#### **Checking for Missing Values**

In [None]:
# Per-column count of null entries in the image table.
missing_per_column = data_test_image.isnull().sum()
print("Number of Missing values in all features:\n", missing_per_column, sep="\n")

In [None]:
# Bind the preprocessed image table to the name used by the merge step (same object, no copy).
data_test_image_preprocessed = data_test_image

# **(b) Data Preparation**

## **Merging data**

In [None]:
test_cgm = data_test_cgm_preprocessed
test_demoviome = data_test_demo_viome_preprocessed
test_image = data_test_image_preprocessed

# Inner joins: DemoViome is attached per subject, images per (subject, day).
data_test_merged = (
    test_cgm
    .merge(test_demoviome, on=['Subject ID'], how='inner')
    .merge(test_image, on=['Subject ID', 'Day'], how='inner')
)

In [None]:
# Display the column names of the merged frame.
data_test_merged.columns

## **Creating a Multi Modal Dataset and saving to a DataLoader**

In [None]:
class MultiModalDataset(Dataset):
  """Row-wise view of the merged test frame that yields one tensor per modality.

  Args:
    test_data: merged DataFrame, one row per sample.
    test_data_cgm: name of the column holding the CGM sequence.
    test_data_demo_viome: list of DemoViome column names.
    test_data_image: two image column names (first image, second image).
    test_data_meal_times: list of meal-time column names.
  """
  def __init__(self, test_data, test_data_cgm, test_data_demo_viome, test_data_image, test_data_meal_times):
    self.test_data = test_data
    self.test_data_cgm = test_data_cgm
    self.test_data_demo_viome = test_data_demo_viome
    self.test_data_image = test_data_image
    self.test_data_meal_times = test_data_meal_times

  def __len__(self):
    """Number of rows in the merged frame."""
    return len(self.test_data)

  def __getitem__(self, idx):
    """Return (cgm, demo_viome, image1, image2, meal_times) tensors for row `idx`."""
    frame = self.test_data

    # CGM: the row's sequence of tuples is flattened into a single 1-D tensor.
    test_cgm_tensor = cgm_single_tensor(frame[self.test_data_cgm].iloc[idx])

    demo_viome_values = frame[self.test_data_demo_viome].iloc[idx].values
    test_demo_viome_tensor = torch.tensor(demo_viome_values, dtype=torch.float32)

    # Both image columns go through the same channel-layout normalisation.
    test_image1_tensor, test_image2_tensor = [
        self.image_processing(frame[self.test_data_image[position]].iloc[idx])
        for position in (0, 1)
    ]

    meal_time_values = frame[self.test_data_meal_times].iloc[idx].values
    test_meal_times_tensor = torch.tensor(meal_time_values, dtype=torch.float32)

    return test_cgm_tensor, test_demo_viome_tensor, test_image1_tensor, test_image2_tensor, test_meal_times_tensor

  def image_processing(self, image):
    """Convert a numpy image to a float tensor with three leading channels.

    2-D inputs gain a channel axis, single-channel inputs are repeated three
    times, and inputs whose last axis (but not first) has size 3 are permuted
    from height-width-channel to channel-height-width.
    """
    tensor = torch.from_numpy(image).float()

    if tensor.dim() == 2:
      tensor = tensor.unsqueeze(0)

    if tensor.shape[0] == 1:
      tensor = tensor.repeat(3, 1, 1)

    channels_last = tensor.shape[0] != 3 and tensor.shape[2] == 3
    return tensor.permute(2, 0, 1) if channels_last else tensor

def collate_fn(batch):
  """Stack a list of 5-tuples of tensors into a 5-tuple of batched tensors.

  Each sample is (cgm, demo_viome, image1, image2, meal_times); the returned
  tuple keeps that order, with every field stacked along a new first axis.
  """
  cgm_items, demo_viome_items, image1_items, image2_items, meal_time_items = zip(*batch)

  return tuple(
      torch.stack(items)
      for items in (cgm_items, demo_viome_items, image1_items, image2_items, meal_time_items)
  )

def cgm_single_tensor(row):
  """Flatten an iterable of tuples into one 1-D float32 tensor, preserving order."""
  flat_values = []
  for reading in row:
    flat_values.extend(reading)
  return torch.tensor(flat_values, dtype=torch.float32)

if __name__ == "__main__":
  # Column names that select each modality from the merged test frame.
  test_data_cgm = 'CGM Data'
  test_data_demo_viome = [ 'Age', 'Gender', 'A1C', 'Baseline Fasting Glucose', 'Insulin', 'Triglycerides', 'Cholesterol', 'HDL', 'Non-HDL', 'LDL', 'VLDL', 'CHO/HDL Ratio',
                         'HOMA-IR', 'BMI', 'Diabetes Status_1', 'Diabetes Status_2', 'Diabetes Status_3', 'Viome1', 'Viome2', 'Viome3', 'Viome4', 'Viome5', 'Viome6', 'Viome7', 'Viome8', 'Viome9', 'Viome10']
  test_data_image = ['Image Before Breakfast', 'Image Before Lunch']
  test_data_meal_times = ['Breakfast Time', 'Lunch Time']

  dataset = MultiModalDataset(data_test_merged, test_data_cgm, test_data_demo_viome, test_data_image, test_data_meal_times)
  # Test rows are only read, never trained on, so the original row order is kept.
  dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

  # Peek at the first batch only, as a sanity check of the tensor shapes.
  for batch in dataloader:
    cgm_test, demo_viome_test, image1_test, image2_test,  meal_times_test = batch

    # These are test tensors; the labels previously said "Train".
    print("Test CGM Data Shape:", cgm_test.shape)
    print("Test Demo_Viome Data Shape:", demo_viome_test.shape)
    print("Test Image 1 Data Shape:", image1_test.shape)
    print("Test Image 2 Data Shape:", image2_test.shape)
    print("Test Meal_Times Data Shape:", meal_times_test.shape)

    break

## **Extracting individual modality data from dataloader**


### **CGM Data**

In [None]:
# Walk the dataset in its original order and keep only the CGM tensor (batch position 0).
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

test_cgm_all = [batch[0] for batch in dataloader]
full_test_data_cgm = torch.cat(test_cgm_all, dim=0)

print("Shape of Complete CGM Test Data:", full_test_data_cgm.shape)
print(full_test_data_cgm)

### **DemoViome Data**

In [None]:
# Walk the dataset in its original order and keep only the DemoViome tensor (batch position 1).
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

test_demo_viome_all = [batch[1] for batch in dataloader]
full_test_data_demo_viome = torch.cat(test_demo_viome_all, dim=0)

print("Shape of Complete DemoViome Test Data:", full_test_data_demo_viome.shape)
print(full_test_data_demo_viome)

### **Image 1**

In [None]:
# Walk the dataset in its original order and keep only the first image tensor (batch position 2).
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

test_image1_all = [batch[2] for batch in dataloader]
full_test_data_image1 = torch.cat(test_image1_all, dim=0)

print("Shape of Complete Image1 Test Data:", full_test_data_image1.shape)
#print(full_test_data_image1)

### **Image 2**

In [None]:
# Walk the dataset in its original order and keep only the second image tensor (batch position 3).
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

test_image2_all = [batch[3] for batch in dataloader]
full_test_data_image2 = torch.cat(test_image2_all, dim=0)

print("Shape of Complete Image2 Test Data:", full_test_data_image2.shape)
#print(full_test_data_image2)

### **Meal Times**

In [None]:
# Walk the dataset in its original order and keep only the meal-times tensor (batch position 4).
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

test_meal_times_all = [batch[4] for batch in dataloader]
full_test_meal_times = torch.cat(test_meal_times_all, dim=0)

print("Shape of Complete Meal_Times Test Data:", full_test_meal_times.shape)
#print(full_test_meal_times)

# **(c) Multimodal Model Implementation**

## **Encoder Transformer - CGM Data Embeddings**

In [None]:
class EncoderTransformer(nn.Module):
  """Transformer encoder that maps a (batch, seq_len, input_dim) series to one vector per sample.

  Args:
    input_dim: number of features per time step.
    embedding_dim: width of the per-step embedding and of the returned vector.
    num_heads: attention heads per encoder layer.
    num_layers: number of stacked encoder layers.
    ff_dim: hidden width of each layer's feed-forward sub-block.
    max_len: longest supported sequence (rows of the positional table); defaults
      to 150, the length that was previously hard-coded.

  Returns (forward):
    Tensor of shape (batch, embedding_dim): the encoder output averaged over time.
  """
  def __init__(self, input_dim, embedding_dim, num_heads, num_layers, ff_dim, max_len=150):
    super(EncoderTransformer, self).__init__()
    self.embedding = nn.Linear(input_dim, embedding_dim)
    # Learnable positional table, initialised to zeros.
    self.positional_encoding = nn.Parameter(torch.zeros(max_len, embedding_dim))
    self.encoder_layers = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(
            d_model = embedding_dim,
            nhead = num_heads,
            dim_feedforward = ff_dim,
            dropout = 0.1,
            # Inputs are (batch, seq, dim). Without batch_first=True the layer reads
            # them as (seq, batch, dim) and attends across samples, not time steps.
            batch_first = True),
            num_layers = num_layers
        )
    self.pooling = nn.AdaptiveAvgPool1d(1)

  def forward(self, x):
    # Slice the positional table so sequences shorter than max_len are accepted too.
    x = self.embedding(x) + self.positional_encoding[:x.size(1)]
    x = self.encoder_layers(x)
    # (batch, seq, dim) -> (batch, dim, seq) so the pooling averages over time.
    x = x.permute(0, 2, 1)
    x = self.pooling(x).squeeze(-1)
    return x

# Reshape each flattened CGM row into 150 steps of 2 values: (rows, 150, 2).
full_test_data_cgm = full_test_data_cgm.view(full_test_data_cgm.size(0), 150, 2)

# NOTE(review): this builds a freshly initialised encoder. If the training embeddings came from
# a different instance, train and test features are not produced by the same weights — confirm
# and reuse the training-time encoder if so.
Transformer_CGM_Model = EncoderTransformer(input_dim=2, embedding_dim=64, num_heads=4, num_layers=2, ff_dim=128)

# Inference only: eval() disables dropout so embeddings are deterministic, and no_grad()
# avoids building an autograd graph over the whole test set.
Transformer_CGM_Model.eval()
with torch.no_grad():
  test_data_cgm_embeddings = Transformer_CGM_Model(full_test_data_cgm)
print("Shape of Test Data CGM Embeddings:", test_data_cgm_embeddings.shape)
#print(test_data_cgm_embeddings)

## **Fully Connected Neural Network - DemoViome Data Embeddings**

In [None]:
class FCNN(nn.Module):
  """Two-layer perceptron: input_size -> 64 -> embedding_size with a ReLU in between."""
  def __init__(self, input_size, embedding_size):
    super().__init__()
    hidden_size = 64
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_size, embedding_size)

  def forward(self, x):
    """Return the embedding for a (batch, input_size) tensor."""
    hidden = self.relu(self.fc1(x))
    return self.fc2(hidden)

# NOTE(review): this builds a freshly initialised network. If the training embeddings came from
# a different instance, train and test features are not produced by the same weights — confirm.
# The input width is read from the data instead of being hard-coded.
FCNN_DemoViome_Model = FCNN(input_size=full_test_data_demo_viome.shape[1], embedding_size=128)

# Inference only: no autograd graph is needed for the test embeddings.
FCNN_DemoViome_Model.eval()
with torch.no_grad():
  test_data_demo_viome_embeddings = FCNN_DemoViome_Model(full_test_data_demo_viome)

print("Shape of Test Data DemoViome Embeddings:", test_data_demo_viome_embeddings.shape)

## **Convolutional Neural Network - Image Data Embeddings**

In [None]:
class CNNModel(nn.Module):
  """Three conv/ReLU/max-pool stages followed by two ReLU-activated linear layers.

  Every pooling stage halves the spatial size and fc1 expects 64 * 16 * 16
  flattened features, which corresponds to 3-channel 128x128 inputs. The
  output has 128 non-negative features per image.
  """
  def __init__(self):
    super().__init__()
    # Channel progression 3 -> 16 -> 32 -> 64; padding=1 keeps the spatial size per conv.
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)

    self.flatten = nn.Flatten()
    self.fc1 = nn.Linear(64 * 16 * 16, 512)
    self.fc2 = nn.Linear(512, 128)

    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

  def forward(self, x):
    """Return (batch, 128) features for a (batch, 3, 128, 128) image tensor."""
    for conv in (self.conv1, self.conv2, self.conv3):
      x = self.pool(self.relu(conv(x)))

    x = self.flatten(x)
    for dense in (self.fc1, self.fc2):
      x = self.relu(dense(x))
    return x

# NOTE(review): this builds a freshly initialised CNN. If the training embeddings came from
# a different instance, train and test features are not produced by the same weights — confirm.
CNN_Image_Model = CNNModel()

# Inference only: skipping autograd avoids keeping activations for every test image in memory.
CNN_Image_Model.eval()
with torch.no_grad():
  test_data_image1_embeddings = CNN_Image_Model(full_test_data_image1)
  test_data_image2_embeddings = CNN_Image_Model(full_test_data_image2)

print("Shape of Test Data Image 1 Embeddings:", test_data_image1_embeddings.shape)
# The second line previously repeated the "Image 1" label.
print("Shape of Test Data Image 2 Embeddings:", test_data_image2_embeddings.shape)
#print(test_data_image1_embeddings)
#print(test_data_image2_embeddings)

### **Combining Image Embeddings**

In [None]:
# Place the two image embeddings side by side along the feature axis.
image_embedding_pair = (test_data_image1_embeddings, test_data_image2_embeddings)
concat_test_data_image_embeddings = torch.cat(image_embedding_pair, dim=1)
print("Concatenated Image Embeddings Shape:", concat_test_data_image_embeddings.shape)

#### We originally used all three embeddings, but image embeddings were not useful, so later, we used only CGM and DemoViome embeddings

In [None]:
# Only the CGM and DemoViome embeddings are combined; the image embeddings are left out.
selected_embeddings = (test_data_cgm_embeddings, test_data_demo_viome_embeddings)
concat_test_data_embeddings = torch.cat(selected_embeddings, dim=1)
print("Shape of Embeddings(2 Modalities):", concat_test_data_embeddings.shape)

## **Joint Embedding**

In [None]:
# Append the meal-time features to the combined embeddings to form the model input.
joint_parts = (concat_test_data_embeddings, full_test_meal_times)
full_test_data_embeddings = torch.cat(joint_parts, dim=1)
print("Shape of Entire Test Data Embeddings:", full_test_data_embeddings.shape)

# **(c) Dense Layers of the Multimodal Model, (d) Model training, and (e) Result analysis**

### We performed hyperparameter tuning, and trained the model on the best set of hyperparameters and made predictions on test labels.

### The loss curve is also plotted below.

In [None]:
class FinalDenseLayers(nn.Module):
  """Prediction head: input_size -> 256 -> 128 -> output_size.

  ReLU follows the first two linear layers; dropout (p=0.3) is applied only
  after the first activation.
  """
  def __init__(self, input_size, output_size):
    super().__init__()
    self.fc1 = nn.Linear(input_size, 256)
    self.fc2 = nn.Linear(256, 128)
    self.fc3 = nn.Linear(128, output_size)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(0.3)

  def forward(self, x):
    """Return predictions of shape (batch, output_size)."""
    hidden = self.dropout(self.relu(self.fc1(x)))
    hidden = self.relu(self.fc2(hidden))
    return self.fc3(hidden)

class RMSRELoss(nn.Module):
  """Root mean squared relative error: sqrt(mean(((y_pred - y_true) / y_true) ** 2)).

  `y_true` must not contain zeros, since it is used as the divisor.
  """
  def __init__(self):
    super(RMSRELoss, self).__init__()

  def forward(self, y_pred, y_true):
    """Return the scalar RMSRE between predictions and targets.

    Args:
      y_pred: predicted values.
      y_true: target values with the same number of elements as `y_pred`.
    """
    # Predictions of shape (B, 1) against targets of shape (B,) would broadcast to a
    # (B, B) matrix and silently compare every prediction with every target.
    # Align the target to the prediction's shape when only the layout differs.
    if y_pred.shape != y_true.shape and y_pred.numel() == y_true.numel():
      y_true = y_true.reshape(y_pred.shape)

    relative_error = (y_pred - y_true) / (y_true)
    squared_error = torch.square(relative_error)
    mean_squared_error = torch.mean(squared_error)
    rmsre = torch.sqrt(mean_squared_error)
    return rmsre

# Detach the pre-computed training embeddings and labels from any autograd graph;
# only the dense prediction head below is optimised.
inputs = full_train_data_embeddings.detach()
targets = full_labels_data.detach()

# The same head is later applied to the test embeddings, so both must have the same width.
assert full_test_data_embeddings.shape[1] == inputs.shape[1], "train/test embedding widths differ"

# 80/20 random train/validation split.
dataset = TensorDataset(inputs, targets)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(full_test_data_embeddings, batch_size=batch_size, shuffle=False)

# The input width is read from the embeddings instead of being hard-coded.
prediction_model = FinalDenseLayers(input_size=inputs.shape[1], output_size=1)
criterion = RMSRELoss()
optimizer = torch.optim.Adam(prediction_model.parameters(), lr=0.005, weight_decay=1e-5)
# Halve the learning rate after 5 epochs without improvement in the monitored loss.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

epochs = 100
train_losses = []
val_losses = []

for epoch in range(epochs):
  # ---- training pass ----
  prediction_model.train()
  batch_train_losses = []
  for inputs, targets in train_loader:
    inputs = inputs.float()
    targets = targets.float()
    optimizer.zero_grad()
    predictions = prediction_model(inputs)
    loss = criterion(predictions, targets)
    loss.backward()

    # Clip the global gradient norm to 0.5 before the parameter update.
    torch.nn.utils.clip_grad_norm_(prediction_model.parameters(), max_norm=0.5)

    optimizer.step()
    batch_train_losses.append(loss.item())

  # ---- validation pass (no gradient tracking) ----
  prediction_model.eval()
  batch_val_losses = []
  with torch.no_grad():
    for inputs, targets in val_loader:
      inputs = inputs.float()
      targets = targets.float()
      predictions = prediction_model(inputs)
      loss = criterion(predictions, targets)
      batch_val_losses.append(loss.item())

  # Epoch losses are the mean of the per-batch losses.
  train_loss = sum(batch_train_losses) / len(train_loader)
  val_loss = sum(batch_val_losses) / len(val_loader)
  train_losses.append(train_loss)
  val_losses.append(val_loss)

  print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

  # The scheduler monitors the validation loss.
  scheduler.step(val_loss)

# Training and validation RMSRE per epoch on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
epoch_axis = range(1, epochs + 1)
ax.plot(epoch_axis, train_losses, label="Training Loss", color="blue")
ax.plot(epoch_axis, val_losses, label="Validation Loss", color="red")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss (RMSRE)")
ax.set_title("Training and Validation Loss Over Epochs")
ax.legend()
ax.grid(True)
plt.show()

## **Making Predictions on the Test data**

In [None]:
# Switch off dropout and run the head over the test embeddings batch by batch.
prediction_model.eval()
test_predictions = []

with torch.no_grad():
  for batch_inputs in test_loader:
    batch_inputs = batch_inputs.float()
    predictions = prediction_model(batch_inputs)
    test_predictions.append(predictions)

test_predictions = torch.cat(test_predictions, dim=0).detach().numpy()

# row_id is the position of each prediction in test-loader order.
row_id = np.arange(len(test_predictions))
prediction_columns = {"row_id": row_id, "prediction": test_predictions.reshape(-1)}
df_predictions = pd.DataFrame(prediction_columns)

df_predictions.to_csv("test_predictions.csv", index=False)
#df_predictions

## **We submitted these results to Kaggle and obtained an RMSRE Loss of 0.3374**