In [None]:
import os
# Suppress TensorFlow messages
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Input,Dense, Concatenate, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

from sklearn.cluster import KMeans

import shap

### **Versions used:**
* numpy==1.26.4
* pandas==2.2.2
* scikit-learn==1.4.2
* tensorflow==2.16.1
* shap==0.46.0

### **Filtering raw data given:** 

The data that was given was an '.xlsx' file containing incomplete samples. The code below was used to remove all of the incomplete samples and save the result as a '.csv' file.

In [5]:
def filter_data(excel_file, output_csv):
    """Drop incomplete rows from an Excel sheet and save the rest as a CSV.

    A row is kept only if ``is_row_complete`` returns True for it.

    Args:
        excel_file: Path of the Excel workbook to read.
        output_csv: Path of the CSV file to write the kept rows to.

    Returns:
        None. Progress and error messages are printed rather than raised;
        if reading or writing fails the function reports the error and
        returns early.
    """
    # Step 1: Load the raw spreadsheet
    try:
        df = pd.read_excel(excel_file)
    except FileNotFoundError:
        print(f"Error: The file '{excel_file}' was not found.")
        return
    except Exception as e:
        print(f"Error occurred while reading '{excel_file}': {str(e)}")
        return

    # Step 2: Filter rows based on completeness (non-empty cells)
    # Build a boolean mask and select from the original frame instead of
    # rebuilding a frame from the row Series: iterrows() does not preserve
    # column dtypes, so a rebuilt frame can turn integer columns into floats.
    keep = pd.Series(
        [is_row_complete(row) for _, row in df.iterrows()],
        index=df.index,
        dtype=bool,
    )
    cleaned_df = df.loc[keep]

    # Step 3: Save the cleaned data to a CSV file
    try:
        cleaned_df.to_csv(output_csv, index=False)
        print(f"Cleaned data saved to '{output_csv}' successfully.")
        print(f"There are {len(cleaned_df)} samples in the cleaned data.")
    except Exception as e:
        print(f"Error occurred while saving to '{output_csv}': {str(e)}")
        return

def is_row_complete(row):
    """Return True when no cell in ``row`` is missing or blank.

    A cell counts as missing if ``pd.isna`` flags it, and as blank if its
    string form is empty once surrounding whitespace is stripped.
    """
    return not any(
        pd.isna(value) or str(value).strip() == ''
        for value in row
    )

# Remove the incomplete rows from the raw spreadsheet and write the rest to CSV.
filter_data('Data/RawData.xlsx', 'Data/FilteredData.csv')

Cleaned data saved to 'Data/FilteredData.csv' successfully.
There are 881 samples in the cleaned data.


### **How to use the data:**
 There are 10 different 'scenarios' or 'decisions' made by each sample (each sample represents one person). When making a decision, they were able to choose a value from 0 to 10 based on how sure they were. This gives 11 possible choices per scenario per sample. Given that there are only 881 samples, attempting to predict 11 possible choices will likely not be accurate due to the limited data. To account for this, the decisions are split into three categories. Anyone who chose 0, 1, 2, or 3 is in category one; anyone who chose 4, 5, or 6 is in category two; and anyone who chose 7, 8, 9, or 10 is in category three. This gives a 4-3-4 categorical split. Below is the code that does this. 

In [9]:
df = pd.read_csv('Data/FilteredData.csv')
# Column titles for all the output data
output_columns = ['Scenario 1 ',
                  'Unnamed: 40',
                  'Scenario 2 ',
                  'Unnamed: 42',
                  'Scenario 3 ',
                  'Unnamed: 44',
                  'Scenario 4',
                  'Unnamed: 46',
                  'Scenario 5 ',
                  'Unnamed: 48']

multi_output_df = df[output_columns]
all_Y_values = multi_output_df.to_numpy()

# Bin each 0-10 answer into three classes: 0-3 -> 0, 4-6 -> 1, 7-10 -> 2.
# np.select uses the first matching condition, so the order mirrors an
# if/elif/else chain; anything matching neither condition falls to class 2.
catigorized_Y_values = np.select(
    [all_Y_values <= 3, all_Y_values <= 6],
    [0, 1],
    default=2,
)

# One-hot encode the class labels. num_classes is pinned to 3 so the last axis
# always has three entries, even if the highest class never occurs in the data
# (to_categorical would otherwise infer the class count from the largest label).
Y_values_catigorized = to_categorical(catigorized_Y_values, num_classes=3)

print(Y_values_catigorized)


[[[0. 0. 1.]
  [0. 1. 0.]
  [0. 0. 1.]
  ...
  [0. 1. 0.]
  [0. 1. 0.]
  [0. 1. 0.]]

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [1. 0. 0.]
  [0. 0. 1.]
  [0. 0. 1.]]

 [[0. 0. 1.]
  [1. 0. 0.]
  [0. 0. 1.]
  ...
  [0. 1. 0.]
  [0. 0. 1.]
  [0. 1. 0.]]

 ...

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 1. 0.]
  [0. 0. 1.]]

 [[0. 0. 1.]
  [1. 0. 0.]
  [0. 1. 0.]
  ...
  [0. 1. 0.]
  [0. 1. 0.]
  [0. 1. 0.]]

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]]


### **Splitting into training and validating:**
Before this data can be used to train a model, it first needs to be split into training and validation data. Below is the code that does that. The first 800 samples (people) are used to train the model, while the last 81 are used for validation.