# ***By Kyle Weldon*** 

In [39]:
import os
# Supress TensorFlow messages
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Input,Dense, Concatenate, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

from sklearn.cluster import KMeans

import shap

import matplotlib.pyplot as plt

### **Versions used:**
* numpy==1.26.4
* pandas==2.2.2
* scikit-learn==1.4.2
* tensorflow==2.16.1
* shap==0.46.0
* matplotlib==3.9.0

### **Filtering raw data given:** 

The data that was given had incomplete data in a '.xlsx' file. The code below is what was used to remove all of the uncomplete samples and save the result as a '.csv' file.

In [23]:
def filter_data(excel_file, output_csv):
    try:
        df = pd.read_excel(excel_file)
    except FileNotFoundError:
        print(f"Error: The file '{excel_file}' was not found.")
        return
    except Exception as e:
        print(f"Error occurred while reading '{excel_file}': {str(e)}")
        return

    # Step 2: Filter rows based on completeness (non-empty cells)
    complete_rows = []
    for index, row in df.iterrows():
        if is_row_complete(row):
            complete_rows.append(row)

    cleaned_df = pd.DataFrame(complete_rows, columns=df.columns)

    # Step 3: Save the cleaned data to a CSV file
    try:
        cleaned_df.to_csv(output_csv, index=False)
        print(f"Cleaned data saved to '{output_csv}' successfully.")
        print(f"There are {len(cleaned_df)} samples in the cleaned data.")
    except Exception as e:
        print(f"Error occurred while saving to '{output_csv}': {str(e)}")
        return

def is_row_complete(row):
    for cell in row:
        # Check if cell is NaN or empty (after stripping whitespace)
        if pd.isna(cell) or str(cell).strip() == '':
            return False
    return True

filter_data('Data/RawData.xlsx', 'Data/FilteredData.csv')

Cleaned data saved to 'Data/FilteredData.csv' successfully.
There are 881 samples in the cleaned data.


### **How to use the data:**
 There are 10 different 'senerios' or 'decisions' made by each sample (each sample represents one person). When making a decision they were able to choose between 0-10 based on how sure they are. This gives 11 possible choices per situation per sample. Given the fact there are only 881 samples attempting to accruetly predict 11 possible choices will likely not be accurate due to the limited data. To account for this The decisions are going to be split into three catagories. Anyone that chose a 0, 1, 2, or 3 will be a part of catagory one. Anyone that chose either 4, 5, or 6 will be a part of catagory two and anyone that chose 7, 8, 9, or 10 will be a part of catagory three. This gives a 4-3-4 catigorical split. Below is the code that completes this. 

In [24]:
df = pd.read_csv('Data/FilteredData.csv')
# Column tites for all the output data
output_columns = ['Scenario 1 ',
                  'Unnamed: 40',
                  'Scenario 2 ',
                  'Unnamed: 42',
                  'Scenario 3 ',
                  'Unnamed: 44',
                  'Scenario 4',
                  'Unnamed: 46',
                  'Scenario 5 ',
                  'Unnamed: 48']

multi_output_df = df[output_columns]
all_Y_values = multi_output_df.to_numpy()

catigorized_Y_values = []
for sample in all_Y_values:
    temp = []
    for cell in sample:
        if cell <= 3:
            temp.append(0)
        elif cell <= 6:
            temp.append(1)
        else:
            temp.append(2)
    catigorized_Y_values.append(temp)
    
catigorized_Y_values = np.array(catigorized_Y_values)
Y_values_catigorized = to_categorical(catigorized_Y_values)

print(Y_values_catigorized)


[[[0. 0. 1.]
  [0. 1. 0.]
  [0. 0. 1.]
  ...
  [0. 1. 0.]
  [0. 1. 0.]
  [0. 1. 0.]]

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [1. 0. 0.]
  [0. 0. 1.]
  [0. 0. 1.]]

 [[0. 0. 1.]
  [1. 0. 0.]
  [0. 0. 1.]
  ...
  [0. 1. 0.]
  [0. 0. 1.]
  [0. 1. 0.]]

 ...

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 1. 0.]
  [0. 0. 1.]]

 [[0. 0. 1.]
  [1. 0. 0.]
  [0. 1. 0.]
  ...
  [0. 1. 0.]
  [0. 1. 0.]
  [0. 1. 0.]]

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]]


### **Splitting into training and validating:**
Before this data can be used to train a model it first needs to be split into traning and validating data. Below is the code that does that. The first 800 samples (people) are going to be used train the model while the last 81 are going to be for validation.

In [25]:
multi_Y_train = Y_values_catigorized[:800]
multi_Y_val = Y_values_catigorized[800:]
print(f"Training set length: {len(multi_Y_train)}")
print(f"Validation set length: {len(multi_Y_val)}")

Training set length: 800
Validation set length: 81


### **Prepare input data:**
Now that the output data is fully prepared and ready for training it is time to prepare the coresponding input data. Below is the code to complete this.

In [26]:
# COlumn titles used for this input set
input_columns = ['MAx1', 'Max2', 'Max3']
input_df = df[input_columns]
all_X_values = input_df.to_numpy()

X_train = all_X_values[:800]
X_val = all_X_values[800:]

print(f"Training set length: {len(X_train)}")
print(f"Validation set length: {len(X_val)}")

Training set length: 800
Validation set length: 81


### **Building model archetecture:**
First instinct is build a deep neural network. Because this initial model will predict all 10 different decisions for each sample a multi-output model is need. There was some adjustment needed to the output layers so they are the correct shape. The functional API from Keras is used for this.

In [27]:
inputs = Input(shape=(3,))

# Fully connected dense layers
hidden1 = Dense(256, activation='relu')(inputs)
hidden2 = Dense(128, activation='relu')(hidden1)
hidden3 = Dense(64, activation='relu')(hidden2)
hidden4 = Dense(32, activation='relu')(hidden3)
hidden5 = Dense(16, activation='relu')(hidden4)

outputs = []
for i in range(10): # 10 different outputs
    output = Dense(3, activation='softmax', name=f'output_{i + 1}')(hidden5)
    outputs.append(output)

reshaped_outputs = [Reshape((1, 3))(output) for output in outputs]
concatenated_outputs = Concatenate(axis=1)(reshaped_outputs)

model = Model(inputs=inputs, outputs=concatenated_outputs)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

None


### **Training the model:***
Now that the model has been built and compiled it is ready for the training data that has been previously prepared. Below is the code for training the model.

In [28]:
model.fit(X_train, multi_Y_train, epochs=10,
          batch_size=32,
          validation_data=(X_val, multi_Y_val))

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.4208 - loss: 1.7464 - val_accuracy: 0.6691 - val_loss: 0.8656
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6608 - loss: 0.8503 - val_accuracy: 0.7210 - val_loss: 0.7977
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7067 - loss: 0.7691 - val_accuracy: 0.7790 - val_loss: 0.7206
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7454 - loss: 0.7423 - val_accuracy: 0.7877 - val_loss: 0.6708
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7491 - loss: 0.7071 - val_accuracy: 0.7889 - val_loss: 0.6631
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7184 - loss: 0.7594 - val_accuracy: 0.7889 - val_loss: 0.6561
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x154ed7087a0>

### **Making new predictions with trained model:**
Making new predictions with the trained model can be a great way to see exactly what the model is producing as a prediction. The data used for predictions is  going to be the vallidation data. Below is the code for this as well as the code to dysplay the output in a readable way.

In [29]:
predictions = model.predict(X_val)

for i in range(len(predictions)):
    print(f"Person {i + 1}:")
    for j in range(len(predictions[i])):
        print(f"\tScenario {j + 1}:")
        print(f"\t\t{predictions[i][j]} -> {multi_Y_val[i][j]}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Person 1:
	Scenario 1:
		[0.04797322 0.05258985 0.89943695] -> [0. 0. 1.]
	Scenario 2:
		[0.06191658 0.10141879 0.8366646 ] -> [0. 0. 1.]
	Scenario 3:
		[0.07169466 0.08722916 0.8410762 ] -> [0. 0. 1.]
	Scenario 4:
		[0.11870784 0.06555033 0.8157417 ] -> [0. 0. 1.]
	Scenario 5:
		[0.25489268 0.07979343 0.66531384] -> [1. 0. 0.]
	Scenario 6:
		[0.31202665 0.09566507 0.59230834] -> [1. 0. 0.]
	Scenario 7:
		[0.01946653 0.04360114 0.93693227] -> [0. 0. 1.]
	Scenario 8:
		[0.176263   0.09691954 0.7268174 ] -> [0. 0. 1.]
	Scenario 9:
		[0.0655982  0.06337606 0.8710258 ] -> [1. 0. 0.]
	Scenario 10:
		[0.08979514 0.15483452 0.7553703 ] -> [0. 0. 1.]
Person 2:
	Scenario 1:
		[0.02948394 0.05394097 0.91657513] -> [0. 0. 1.]
	Scenario 2:
		[0.04009654 0.08608512 0.87381834] -> [0. 0. 1.]
	Scenario 3:
		[0.07430566 0.08920964 0.8364846 ] -> [0. 0. 1.]
	Scenario 4:
		[0.08111216 0.06781486 0.851073  ] -> [0. 0. 1.]
	Scenario 5

### **Understanding the ouput above:**
Under each scenario is two arrays, the first one with crazy numbers is the preditions from the model the arrow is pointing to the correct corresponding array. The place in the array where there is a 1 is what catagory that person fell under for that decision. The model produced what it thought each probibiltiy of the person falling into that catigory. The location of the biggest number is what the model predicted the person having the highest chance of being. The accuracy of this is the same as the val_accuracy of the training model because it is the same data. That means that the computer predicted the correct choice 78.64% of the time.

### **Saving the output to a CSV file:**
Saving the outputed data to a CSV file can be a better way to organize the infromation and make easier access for other to read. Below is the code to write the information to a CSV file.

In [30]:
headers = ['Person',
           'Scenario 1 pt1', 'Scenario 1 pt2',
           'Scenario 2 pt1', 'Scenario 2 pt2',
           'Scenario 3 pt1', 'Scenario 3 pt2',
           'Scenario 4 pt1', 'Scenario 4 pt2',
           'Scenario 5 pt1', 'Scenario 5 pt2']

num = len(predictions)
predictions_df = pd.DataFrame({'Person': [i+1 for i in range(num)],
                               'Scenario 1 pt1': [f"{predictions[i][0]} -> {multi_Y_val[i][0]}" for i in range(num)],
                               'Scenario 1 pt2': [f"{predictions[i][1]} -> {multi_Y_val[i][1]}" for i in range(num)],
                               'Scenario 2 pt1': [f"{predictions[i][2]} -> {multi_Y_val[i][2]}" for i in range(num)],
                               'Scenario 2 pt2': [f"{predictions[i][3]} -> {multi_Y_val[i][3]}" for i in range(num)],
                               'Scenario 3 pt1': [f"{predictions[i][4]} -> {multi_Y_val[i][4]}" for i in range(num)],
                               'Scenario 3 pt2': [f"{predictions[i][5]} -> {multi_Y_val[i][5]}" for i in range(num)],
                               'Scenario 4 pt1': [f"{predictions[i][6]} -> {multi_Y_val[i][6]}" for i in range(num)],
                               'Scenario 4 pt2': [f"{predictions[i][7]} -> {multi_Y_val[i][7]}" for i in range(num)],
                               'Scenario 5 pt1': [f"{predictions[i][8]} -> {multi_Y_val[i][8]}" for i in range(num)],
                               'Scenario 5 pt2': [f"{predictions[i][9]} -> {multi_Y_val[i][9]}" for i in range(num)]})

print(predictions_df)
predictions_df.to_csv('Data/predictions.csv', index=False)
print(f"Predictions saved to 'Data/predictions.csv' successfully.")

    Person                                    Scenario 1 pt1  \
0        1  [0.04797322 0.05258985 0.89943695] -> [0. 0. 1.]   
1        2  [0.02948394 0.05394097 0.91657513] -> [0. 0. 1.]   
2        3  [0.04283439 0.06355444 0.89361125] -> [1. 0. 0.]   
3        4  [0.04614383 0.06462609 0.88923   ] -> [0. 1. 0.]   
4        5  [0.01972614 0.03293964 0.9473341 ] -> [0. 0. 1.]   
..     ...                                               ...   
76      77  [0.08384305 0.08246782 0.83368903] -> [1. 0. 0.]   
77      78  [0.04511363 0.07996128 0.87492514] -> [0. 0. 1.]   
78      79  [0.04434903 0.0676381  0.8880129 ] -> [0. 0. 1.]   
79      80  [0.05814823 0.0834912  0.85836047] -> [0. 0. 1.]   
80      81  [0.02982359 0.04590271 0.92427367] -> [0. 0. 1.]   

                                      Scenario 1 pt2  \
0   [0.06191658 0.10141879 0.8366646 ] -> [0. 0. 1.]   
1   [0.04009654 0.08608512 0.87381834] -> [0. 0. 1.]   
2   [0.05481724 0.10254977 0.842633  ] -> [0. 0. 1.]   
3   [0.

### **Explaining the model with shapely additive explanation**


In [41]:
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train)

# Custom SHAP plot for a specific output
def custom_shap_plot(shap_values, features, feature_names, output_index):
    shap_values_mean = np.abs(shap_values[:, :, output_index]).mean(axis=0)
    feature_order = np.argsort(shap_values_mean)
    
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(feature_names)), shap_values_mean[feature_order], align='center')
    plt.yticks(range(len(feature_names)), [feature_names[i] for i in feature_order])
    plt.xlabel('Mean Absolute SHAP Value')
    plt.title(f'SHAP summary plot for {model.output_names[output_index]}')
    plt.show()

# Choose an output index (0 to 9 for your 10 outputs) to plot
output_index = 0  # Change this to plot different outputs
feature_names = ['Feature 1', 'Feature 2', 'Feature 3']

# Plot SHAP values for the chosen output index
custom_shap_plot(shap_values, X_train, feature_names, output_index)

ExactExplainer explainer: 801it [00:36, 15.74it/s]                         


TypeError: bad operand type for abs(): 'Explanation'

### **Next thing here**
