# ***By Kyle Weldon*** 

In [71]:
import os
# Supress TensorFlow messages
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Input,Dense, Concatenate, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

import shap

import sklearn
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

print('NumPy version:', np.__version__)
print('Pandas version:', pd.__version__)
print('Scikit-learn version:', sklearn.__version__)
print('Tensorflow version:', tf.__version__)
print('shap version:', shap.__version__)

NumPy version: 1.26.4
Pandas version: 2.2.2
Scikit-learn version: 1.5.1
Tensorflow version: 2.17.0
shap version: 0.46.0


### **Filtering raw data given:** 

The data that was given had incomplete data in a '.xlsx' file. The code below is what was used to remove all of the uncomplete samples and save the result as a '.csv' file.

In [72]:
def filter_data(excel_file, output_csv):
    try:
        df = pd.read_excel(excel_file)
    except FileNotFoundError:
        print(f"Error: The file '{excel_file}' was not found.")
        return
    except Exception as e:
        print(f"Error occurred while reading '{excel_file}': {str(e)}")
        return

    complete_rows = []
    for index, row in df.iterrows():
        if is_row_complete(row):
            complete_rows.append(row)

    cleaned_df = pd.DataFrame(complete_rows, columns=df.columns)

    try:
        cleaned_df.to_csv(output_csv, index=False)
        print(f"Cleaned data saved to '{output_csv}' successfully.")
        print(f"There are {len(cleaned_df)} samples in the cleaned data.")
    except Exception as e:
        print(f"Error occurred while saving to '{output_csv}': {str(e)}")
        return

def is_row_complete(row):
    for cell in row:
        if pd.isna(cell) or str(cell).strip() == '':
            return False
    return True

filter_data('Data/RawData.xlsx', 'Data/FilteredData.csv')

Cleaned data saved to 'Data/FilteredData.csv' successfully.
There are 881 samples in the cleaned data.


### **How to use the data:**
 There are 10 different 'senerios' or 'decisions' made by each sample (each sample represents one person). When making a decision they were able to choose between 0-10 based on how sure they are. This gives 11 possible choices per situation per sample. Given the fact there are only 881 samples attempting to accruetly predict 11 possible choices will likely not be accurate due to the limited data. To account for this The decisions are going to be split into three catagories. Anyone that chose a 0, 1, 2, or 3 will be a part of catagory one. Anyone that chose either 4, 5, or 6 will be a part of catagory two and anyone that chose 7, 8, 9, or 10 will be a part of catagory three. This gives a 4-3-4 catigorical split. Below is the code that completes this. It is also essential to remember to split the data into training and validating data. 

In [73]:
df = pd.read_csv('Data/FilteredData.csv')
# Column tites for all the output data
output_columns = ['Scenario 1 ',
                  'Unnamed: 40',
                  'Scenario 2 ',
                  'Unnamed: 42',
                  'Scenario 3 ',
                  'Unnamed: 44',
                  'Scenario 4',
                  'Unnamed: 46',
                  'Scenario 5 ',
                  'Unnamed: 48']

def classiy_and_catigorize(column):
    return to_categorical([0 if x <= 3 else 1 if x <= 6 else 2 for x in column])

columns = df[output_columns].to_numpy().T # The 'T' is to transpose the array

S1P1, S1P2, S2P1, S2P2, S3P1, S3P2, S4P1, S4P2, S5P1, S5P2 = [classiy_and_catigorize(col) for col in columns]
all_situations = [S1P1, S1P2, S2P1, S2P2, S3P1, S3P2, S4P1, S4P2, S5P1, S5P2]

for situation in all_situations:
    print(situation.shape)

(881, 3)
(881, 3)
(881, 3)
(881, 3)
(881, 3)
(881, 3)
(881, 3)
(881, 3)
(881, 3)
(881, 3)


### **Splitting into training and validating:**
Before this data can be used to train a model it first needs to be split into traning and validating data. Below is the code that does that. The first 800 samples (people) are going to be used train the model while the last 81 are going to be for validation.

In [74]:
def split(array):
    return array[:800], array[800:]

S1P1_train, S1P1_val = split(S1P1)
S1P2_train, S1P2_val = split(S1P2)
S2P1_train, S2P1_val = split(S2P1)
S2P2_train, S2P2_val = split(S2P2)
S3P1_train, S3P1_val = split(S3P1)
S3P2_train, S3P2_val = split(S3P2)
S4P1_train, S4P1_val = split(S4P1)
S4P2_train, S4P2_val = split(S4P2)
S5P1_train, S5P1_val = split(S5P1)
S5P2_train, S5P2_val = split(S5P2)

print(f"S1P1 training shape: {S1P1_train.shape}")
print(f"Validation set length: {len(S1P1_val)}")

S1P1 training shape: (800, 3)
Validation set length: 81


### **Prepare input data:**
Now that the output data is fully prepared and ready for training it is time to prepare the coresponding input data. Simalar to the multiple outputs there our also miltiple inputs. Below is the code to complete this.

In [75]:
# Column titles used for this input set
input_columns1 = ['MAx1', 'Max2', 'Max3']
input_columns2 = ['Q105_1','Q105_2','Q105_3','Q105_4','Q105_5','Q105_6','Q105_7','Q105_8','Q105_9','Q105_10','Q105_11','Q105_12','Q105_13','Q105_14','Q105_15','Q105_16','Q105_17','Q105_18','Q105_19','Q105_20','Q105_21','Q105_22','Q105_23','Q105_24','Q105_25','Q105_26','Q105_27','Q105_28','Q105_29','Q105_30','Q105_31','Q105_32','Q105_33','Q105_34']

input_df1 = df[input_columns1]
input1_X_values = input_df1.to_numpy()
input_df2 = df[input_columns2]
input2_X_values = input_df2.to_numpy()

layer1_X_train = input1_X_values[:800]
layer1_X_val = input1_X_values[800:]
layer2_X_train = input2_X_values[:800]
layer2_X_val = input2_X_values[800:]

print(f"Layer1 training shape: {layer1_X_train.shape} -> validating shape: {layer1_X_val.shape}")
print(f"Layer2 training shape: {layer2_X_train.shape} -> validating shape: {layer2_X_val.shape}")

Layer1 training shape: (800, 3) -> validating shape: (81, 3)
Layer2 training shape: (800, 34) -> validating shape: (81, 34)


### **Building model archetecture:**
First instinct is build a deep neural network. Because this initial model will predict all 10 different decisions for each sample a multi-output model is need. There was some adjustment needed to the output layers so they are the correct shape. The functional API from Keras is used for this.

In [78]:
input1 = Input(shape=(3,))

hidden1_input1 = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='DenseOne_Input1')(input1)
hidden2_input1 = Dense(128, activation='relu', kernel_regularizer=l2(0.01), name='DenseTwo_Input1')(hidden1_input1)
hidden3_input1 = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='DenseThree_Input1')(hidden2_input1)
hidden4_input1 = Dense(32, activation='relu', kernel_regularizer=l2(0.01), name='DenseFour_Input1')(hidden3_input1)
hidden5_input1 = Dense(16, activation='relu', kernel_regularizer=l2(0.01), name='DenseFive_Input1')(hidden4_input1)

# input2 = Input(shape=(34,), name='InputLayer2')
# 
# hidden1_input2 = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='DenseOne_Input2')(input2)
# hidden2_input2 = Dense(128, activation='relu', kernel_regularizer=l2(0.01), name='DenseTwo_Input2')(hidden1_input2)
# hidden3_input2 = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='DenseThree_Input2')(hidden2_input2)
# hidden4_input2 = Dense(32, activation='relu', kernel_regularizer=l2(0.01), name='DenseFour_Input2')(hidden3_input2)
# hidden5_input2 = Dense(16, activation='relu', kernel_regularizer=l2(0.01), name='DenseFive_Input2')(hidden4_input2)
# 
# concatenated = Concatenate(name='ConcatinatedInput')([hidden5_input1, hidden5_input2])

def create_output_layer(name, input_layer):
    return Dense(3, activation='softmax', name=name)(input_layer)
def itterate_situations_and_parts(num_itts=10):
    scenerio = 1
    sittos = []
    for i in range(num_itts):
        part = 1 if i % 2 == 0 else 2
        sittos.append(f"S{scenerio}P{part}")
        if part == 2: scenerio += 1
    return sittos

outputs = [create_output_layer(name, concatenated) for name in itterate_situations_and_parts()]

model = Model(inputs=input1, outputs=outputs, name='CustomizedDeepNeuralNetwork')

metrics = ['accuracy'] * 10  

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=metrics)

print(model.summary())

TypeError: unsupported operand type(s) for /: 'Dimension' and 'int', please use // instead

### **Training the model:**
Now that the model has been built and compiled it is ready for the training data that has been previously prepared. Below is the code for training the model.

In [None]:
model.fit(layer1_X_train, [S1P1_train, S1P2_train, S2P1_train,
                    S2P2_train, S3P1_train, S3P2_train, S4P1_train,
                    S4P2_train, S5P1_train, S5P2_train],
          epochs=10,
          batch_size=32,
          validation_data=(layer1_X_val, [S1P1_val, S1P2_val, S2P1_val,
                                                          S2P2_val, S3P1_val, S3P2_val, S4P1_val,
                                                          S4P2_val, S5P1_val, S5P2_val]))

### **Attempts made to improve output:**
The loss calculation is high as seen in this output. The closer a loss is to 0 the better, but the above calculations are far from 0. Simplifying the model's complexity, adjusting the L2 regularzation penalties, and changing the number of epochs but none seemed to have any effect on the loss of the model.


### **Explaining the output with SHAP (SHapley Additive exPlanations):**


In [None]:

explainer = shap.DeepExplainer(model,[layer1_X_train, layer2_X_train])

### **Better understanding the data itself:**
Having a better understanding of the data itself can often explain what is happening with the model. For example, knowing what percentage of people fell into each catagory for each scinario could potentially add more of an explination to what is seen in the model's output. This could help because in scinerio 1 part 1 the model predicted the correct answer about 85% of the time but if about 85% of people all fell within the same catagory then the high accuracy rate might not be as impressive. Below is the code that calculates the percentage of people that fall into each catagory for each scinario.

In [None]:
def get_percentages(full_data):
    n = len(full_data)
    classed_arr = [np.argmax(sample) for sample in full_data]
    n0, n1, n2 = classed_arr.count(0), classed_arr.count(1), classed_arr.count(2)
    return (f"Catagory 0: {round((n0-3)/n*100,2)}% Catagory 1: "
            f"{round((n1-3)/n*100,2)}% Catagory 2: {round((n2-3)/n*100,2)}%")

names = itterate_situations_and_parts()
for i in range(10):
    print(f"{names[i]} = {get_percentages(all_situations[i])}")

### **Connecting to model output:**
It is clear that a large majority of the samples (people) made decisions that fell into the thrid catagory. Most of them are near the same percentage of the model accuracy. That means if the model just predicted the sample making a decision that falls into catagory three then the output would be simalar. With that being said, S3P1 and S3P2 are both signifigently lower than the model's output. There are two things that come to mind. Number one is to change the catagories so instead of a 4-3-4 split it is a 3-5-3 split. This would potentally lower the percentage of samples that fall into catagory three. Another idea would be to build a decision tree model instead of a deep nerual network mode. This could work because decision threes are better for smaller and less complecated data. Changing the split to 3-5-3 is an easy change so I will do that first and see what happens. Below is the code to make the split. 

In [None]:
def classiy_and_catigorize(column):
    return to_categorical([0 if x <= 2 else 1 if x <= 7 else 2 for x in column])

S1P1, S1P2, S2P1, S2P2, S3P1, S3P2, S4P1, S4P2, S5P1, S5P2 = [classiy_and_catigorize(col) for col in columns]
all_situations = [S1P1, S1P2, S2P1, S2P2, S3P1, S3P2, S4P1, S4P2, S5P1, S5P2]

for situation in all_situations:
    print(situation.shape)

### **Dysplay ratios for each catagory**    

In [None]:
for i in range(10):
    print(f"{names[i]} = {get_percentages(all_situations[i])}")

### **These ratios are better and will hopefully add more insight to what is happening**

### **Split into training and validating:**
Just as before the data with the new split need to be sorted into training and validating data. Below is the code for this.

In [None]:
S1P1_train, S1P1_val = split(S1P1)
S1P2_train, S1P2_val = split(S1P2)
S2P1_train, S2P1_val = split(S2P1)
S2P2_train, S2P2_val = split(S2P2)
S3P1_train, S3P1_val = split(S3P1)
S3P2_train, S3P2_val = split(S3P2)
S4P1_train, S4P1_val = split(S4P1)
S4P2_train, S4P2_val = split(S4P2)
S5P1_train, S5P1_val = split(S5P1)
S5P2_train, S5P2_val = split(S5P2)

print(f"S1P1 training shape: {S1P1_train.shape}")
print(f"Validation set length: {len(S1P1_val)}")

### **Train the model with new data split:**
The model archetecture does not need to be rebuilt but the model does need to be reset and re-compiled.

In [None]:
model = Model(inputs=[input1, input2], outputs=outputs, name='CustomizedDeepNeuralNetwork')

metrics = ['accuracy'] * 10  

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=metrics)

print(model.summary())

### **Train the model same way as before**

In [None]:
model.fit([layer1_X_train, layer2_X_train], [S1P1_train, S1P2_train, S2P1_train,
                    S2P2_train, S3P1_train, S3P2_train, S4P1_train,
                    S4P2_train, S5P1_train, S5P2_train],
          epochs=10,
          batch_size=32,
          validation_data=([layer1_X_val, layer2_X_val], [S1P1_val, S1P2_val, S2P1_val,
                                                          S2P2_val, S3P1_val, S3P2_val, S4P1_val,
                                                          S4P2_val, S5P1_val, S5P2_val]))

### **Understanding model output**
The output again apears to suggest that the model is simply picking catagory three most of the time and that is why the percentages of accuracy is close to that of the ratio split. This could be because of poor model archetecture but I think trying out a decision tree next is a valid idea.

### **Decision Tree**
A valid next move is to use a decision tree. This is likely going to work better because decision trees work very well with smaller less complicated data. It is possible the above model is too complecated to solve this problem... or not complicated enough but trying out a decision tree will provide more insight.

In [None]:
dtc = DecisionTreeClassifier()

dtc.fit(layer1_X_train, S1P1_train)

predictions = dtc.predict(layer1_X_val)
accuracy = accuracy_score(S1P1_val, predictions)
for i in range(len(predictions)):
    print(f"{S1P1_val[i]} -> {predictions[i]}")
print(f"Accuracy: {round(accuracy * 100, 2)}%")
