<h1>Analysis of the Grade Group Random Forest Model</h1>

<h2>Create Test Dataset of First 6 Seconds From All Respondents</h2>

In [2]:
import pandas as pd
import os

folder_path = "../all respondents data/"

# Select the columns to include in the dataset
selected_columns = ['Anger', 'Contempt', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise',
       'Engagement', 'Valence', 'Sentimentality', 'Confusion']

# os.listdir() returns entries in arbitrary order; sort them so the row order of the
# concatenated frame (and therefore the head() preview below) is reproducible.
csv_files = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith('.csv')
]

# Read only the first 250 rows of every CSV, keep the selected columns, and drop any row
# that has a missing value in one of them.
# NOTE(review): this cell reads nrows=250 while the respondent-labelled load in the next
# cell reads nrows=200 -- confirm which one matches the intended 6-second window.
data_list = []
for file in csv_files:
    data = pd.read_csv(file, header=0, usecols=selected_columns, nrows=250).dropna()
    data_list.append(data)

# Stack all per-file frames; the original per-file row index is kept as-is.
df = pd.concat(data_list)

print(df.head())

       Anger  Contempt   Disgust      Fear       Joy   Sadness  Surprise  \
5   0.125566  0.186393  0.023706  0.125796  0.025453  0.125512  0.055428   
7   0.125833  0.186909  0.023738  0.126169  0.025312  0.125694  0.055510   
9   0.126258  0.187818  0.023798  0.126605  0.025114  0.125970  0.055607   
11  0.126290  0.188413  0.023834  0.126583  0.024996  0.126124  0.055529   
13  0.126344  0.188792  0.023866  0.126641  0.024909  0.126202  0.055519   

    Engagement  Valence  Sentimentality  Confusion  
5     0.328766      0.0        0.030948   0.004428  
7     0.328766      0.0        0.051102   0.007234  
9     0.328766      0.0        0.043316   0.012638  
11    0.328766      0.0        0.032416   0.013319  
13    0.328766      0.0        0.028298   0.013363  


In [3]:
import os
import pandas as pd

# Specify the folder path where the CSV files are located
folder_path = "../all respondents data"

# Select the columns to include in the dataset
selected_columns = ['Anger', 'Contempt', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise',
       'Engagement', 'Valence', 'Sentimentality', 'Confusion']

# Collect one frame per respondent file and concatenate once after the loop.
# Calling pd.concat inside the loop re-copies all previously loaded rows on every
# iteration and seeds the result with an empty, column-less frame.
respondent_frames = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, filename)
    # First 200 rows only, restricted to the selected columns; rows with a missing value are dropped.
    df = pd.read_csv(file_path, header=0, usecols=selected_columns, low_memory=False, nrows=200).dropna()
    respondent_num = filename.split('_')[0]  # Part of the filename before the first '_'
    df['Respondent'] = respondent_num  # Tag every row with the file's respondent identifier (a string)
    respondent_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no CSV was found.
combined_data = pd.concat(respondent_frames) if respondent_frames else pd.DataFrame()

# Print the combined data
print(combined_data)


        Anger   Contempt   Disgust      Fear       Joy   Sadness  Surprise  \
5    0.125566   0.186393  0.023706  0.125796  0.025453  0.125512  0.055428   
7    0.125833   0.186909  0.023738  0.126169  0.025312  0.125694  0.055510   
9    0.126258   0.187818  0.023798  0.126605  0.025114  0.125970  0.055607   
11   0.126290   0.188413  0.023834  0.126583  0.024996  0.126124  0.055529   
13   0.126344   0.188792  0.023866  0.126641  0.024909  0.126202  0.055519   
..        ...        ...       ...       ...       ...       ...       ...   
190  0.813891  10.912367  0.178709  0.650948  0.029256  0.264468  0.281056   
192  0.762514  47.368874  0.144964  0.558924  0.031119  0.302397  0.264432   
194  0.834973  78.164200  0.118704  0.457550  0.033249  0.316273  0.216130   
196  1.234668  91.640717  0.089462  0.533636  0.036385  0.294732  0.239699   
198  1.476485  95.016411  0.073928  0.601103  0.038038  0.268923  0.243317   

     Engagement   Valence  Sentimentality  Confusion Respondent

<h3>Add Grades to the Dataset</h3>

In [4]:
import pandas as pd

# Load the per-respondent grades table
grades_df = pd.read_csv('Grades.csv')

# Normalise the join key: strip any trailing underscore(s) from the "Respondent" values
# so they can match the identifiers stored in combined_data
grades_df = grades_df.assign(Respondent=grades_df['Respondent'].str.rstrip('_'))

# Attach each row's grade by joining on "Respondent". The default inner join keeps only
# rows whose respondent appears in both frames.
grade_lookup = grades_df.loc[:, ['Respondent', 'Grade']]
combined_data_with_grades = combined_data.merge(grade_lookup, on='Respondent')

# Show the merged frame, now carrying the "Grade" column
print(combined_data_with_grades)


         Anger   Contempt   Disgust      Fear       Joy   Sadness  Surprise  \
0     0.125566   0.186393  0.023706  0.125796  0.025453  0.125512  0.055428   
1     0.125833   0.186909  0.023738  0.126169  0.025312  0.125694  0.055510   
2     0.126258   0.187818  0.023798  0.126605  0.025114  0.125970  0.055607   
3     0.126290   0.188413  0.023834  0.126583  0.024996  0.126124  0.055529   
4     0.126344   0.188792  0.023866  0.126641  0.024909  0.126202  0.055519   
...        ...        ...       ...       ...       ...       ...       ...   
2105  0.813891  10.912367  0.178709  0.650948  0.029256  0.264468  0.281056   
2106  0.762514  47.368874  0.144964  0.558924  0.031119  0.302397  0.264432   
2107  0.834973  78.164200  0.118704  0.457550  0.033249  0.316273  0.216130   
2108  1.234668  91.640717  0.089462  0.533636  0.036385  0.294732  0.239699   
2109  1.476485  95.016411  0.073928  0.601103  0.038038  0.268923  0.243317   

      Engagement   Valence  Sentimentality  Confusi

<h3>Add Grade Group Column</h3>

In [5]:
# Derive the "Grade Group" label for every row: grades strictly greater than 55 become
# 'Above 55'; everything else (including a grade of exactly 55) becomes 'Below 55'.
combined_data_with_grades['Grade Group'] = [
    'Above 55' if grade > 55 else 'Below 55'
    for grade in combined_data_with_grades['Grade']
]

print(combined_data_with_grades)

         Anger   Contempt   Disgust      Fear       Joy   Sadness  Surprise  \
0     0.125566   0.186393  0.023706  0.125796  0.025453  0.125512  0.055428   
1     0.125833   0.186909  0.023738  0.126169  0.025312  0.125694  0.055510   
2     0.126258   0.187818  0.023798  0.126605  0.025114  0.125970  0.055607   
3     0.126290   0.188413  0.023834  0.126583  0.024996  0.126124  0.055529   
4     0.126344   0.188792  0.023866  0.126641  0.024909  0.126202  0.055519   
...        ...        ...       ...       ...       ...       ...       ...   
2105  0.813891  10.912367  0.178709  0.650948  0.029256  0.264468  0.281056   
2106  0.762514  47.368874  0.144964  0.558924  0.031119  0.302397  0.264432   
2107  0.834973  78.164200  0.118704  0.457550  0.033249  0.316273  0.216130   
2108  1.234668  91.640717  0.089462  0.533636  0.036385  0.294732  0.239699   
2109  1.476485  95.016411  0.073928  0.601103  0.038038  0.268923  0.243317   

      Engagement   Valence  Sentimentality  Confusi

<h2>Run Prediction on the Dataset</h2>

In [8]:
import pandas as pd
import pickle
from datetime import datetime
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, confusion_matrix, classification_report

# Load the saved model from the pickle file.
# SECURITY: pickle.load can execute arbitrary code embedded in the file -- only load a
# model file that was produced locally or comes from a trusted source.
with open('random_forest_classifier_2023-05-08_21-14-34.pkl', 'rb') as f:
    model = pickle.load(f)

# Extract the features from the full data.
# NOTE: selected_columns and combined_data_with_grades come from earlier cells; those
# cells must have been run first.
features = combined_data_with_grades[selected_columns]
true_labels = combined_data_with_grades['Grade Group']

# Make predictions on the features using the loaded model
predictions = model.predict(features)

# Compare the predicted grade group labels with the pre-labeled grade group labels
correct_predictions = (predictions == true_labels).sum()
accuracy = correct_predictions / len(combined_data_with_grades)
print(f"Accuracy on new data: {accuracy}")

# Print the accuracy score (cross-check of the manual computation above)
print("Accuracy score:", accuracy_score(true_labels, predictions))

# Print the classification report
print(classification_report(true_labels, predictions))

# Pin the label order explicitly. This guarantees a 2x2 matrix even if one class is
# absent, and makes the meaning of the unpacking below unambiguous:
# index 0 ('Above 55') is treated as the negative class, index 1 ('Below 55') as the
# positive class. This is the same order scikit-learn derives by sorting the labels.
label_order = ['Above 55', 'Below 55']
cm = confusion_matrix(true_labels, predictions, labels=label_order)

# Print the confusion matrix (rows = true label, columns = predicted label)
print("Confusion matrix:")
print(cm)

# Evaluate the model: for a 2x2 matrix, ravel() yields tn, fp, fn, tp in that order
tn, fp, fn, tp = cm.ravel()

# Calculate TPR, FPR, TNR, FNR -- all relative to 'Below 55' as the positive class
tnr = tn / (tn + fp)
tpr = tp / (tp + fn)
fnr = fn / (fn + tp)
fpr = fp / (fp + tn)

print('FPR: ', fpr)
print('TPR: ', tpr)
print('FNR: ', fnr)
print('TNR: ', tnr)


Accuracy on new data: 0.8559241706161137
Accuracy score: 0.8559241706161137
              precision    recall  f1-score   support

    Above 55       0.85      0.87      0.86      1102
    Below 55       0.86      0.84      0.85      1008

    accuracy                           0.86      2110
   macro avg       0.86      0.86      0.86      2110
weighted avg       0.86      0.86      0.86      2110

Confusion matrix:
[[962 140]
 [164 844]]
FPR:  0.12704174228675136
TPR:  0.8373015873015873
FNR:  0.1626984126984127
TNR:  0.8729582577132486
