In [None]:
from my_utils import emotion_dict, sentiment_dict, film_frames_dict

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, pearsonr, f_oneway
from sklearn.preprocessing import LabelEncoder

In [None]:
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Every relative path below is resolved against this working directory.
folder = '/content/drive/My Drive/Colab Notebooks/Dissertation'
os.chdir(folder)

# Feature CSVs are read from output_dir; plots and tables are written to results_dir.
output_dir = './Datasets'
results_dir = './Results'

film_keys = list(emotion_dict.keys())
datasets = ['frame_level', 'video_level']
results = ['eda', 'stats_analysis', 'modelling']
feature_group = ['rgb_hsv', 'audio', 'optical_flow']

# Create Results/<dataset>/<results_type> for every combination.
for dataset in datasets:
    dataset_dir = os.path.join(results_dir, dataset)
    os.makedirs(dataset_dir, exist_ok=True)
    for results_type in results:
        os.makedirs(os.path.join(dataset_dir, results_type), exist_ok=True)

# 2. Data Analysis

#### Functions to load feature sets

In [None]:
# obtain the frame-level and video-level feature sets
def load_datasets(frame_level_path, video_level_path):
    """Read the frame-level and video-level feature CSV files.

    Args:
        frame_level_path: Path of the frame-level feature CSV.
        video_level_path: Path of the video-level feature CSV.

    Returns:
        Tuple ``(frame_df, video_df)``. An entry is ``None`` when its file
        does not exist; a "not found" message is printed in that case.
    """
    loaded = []
    for csv_path in (frame_level_path, video_level_path):
        if not os.path.exists(csv_path):
            print (f"File {csv_path} not found")
            loaded.append(None)
            continue
        loaded.append(pd.read_csv(csv_path))

    frame_df, video_df = loaded
    return frame_df, video_df

# obtain the frame-level and video-level feature sets based on the feature group (rgb_hsv, audio, optical_flow)
def get_feature_group_datasets(feature_group):
    """Load both feature CSVs of one feature group and list its feature columns.

    Args:
        feature_group: Name inserted into the file name
            ``features_<feature_group>_df.csv`` (e.g. rgb_hsv, audio, optical_flow).

    Returns:
        Tuple ``(frame_df, video_df, feature_cols)``. Either DataFrame is
        ``None`` when its file is missing. ``feature_cols`` is taken from the
        frame-level columns when that file exists, otherwise from the
        video-level columns, and is empty when neither file exists.
    """
    frame_path = os.path.join(output_dir, f"{datasets[0]}/features_{feature_group}_df.csv")
    video_path = os.path.join(output_dir, f"{datasets[1]}/features_{feature_group}_df.csv")
    frame_df, video_df = load_datasets(frame_path, video_path)

    f_shape = frame_df.shape if frame_df is not None else ()
    v_shape = video_df.shape if video_df is not None else ()

    # Identifier and label columns are not features. The frame-level columns
    # take precedence; the video-level columns are only the fallback.
    if frame_df is not None:
        feature_cols = [col for col in frame_df.columns
                        if col not in ["video_id", "frame_id", "emotion", "sentiment"]]
    elif video_df is not None:
        feature_cols = [col for col in video_df.columns
                        if col not in ["video_id", "emotion", "sentiment"]]
    else:
        feature_cols = []

    print(f"Dataframes shape for {feature_group}: frame-level -> {f_shape}, video-level -> {v_shape}")

    return frame_df, video_df, feature_cols

#### Box Plot Function

In [None]:
# get a discrete colour palette for the emotions
def get_discrete_colour_palette(unique_emotions):
    """Pair each emotion label with its own colour from seaborn's "tab10" palette.

    Args:
        unique_emotions: Sized iterable of emotion labels.

    Returns:
        Dict mapping each label to one palette colour, in the order given.
    """
    n_colours = len(unique_emotions)
    tab10_colours = sns.color_palette("tab10", n_colours)  # tab10 gives discrete colours
    return {emotion: colour for emotion, colour in zip(unique_emotions, tab10_colours)}

# generate box plots
def plot_boxplot_by_emotion(df, feature_columns, f_set_id, feature_group, results_dir=results_dir, display_plot=False):
    """Draw and save box plots of the given features, split by emotion.

    Args:
        df: DataFrame holding an "emotion" column and the feature columns.
        feature_columns: Column name, or list of column names, to plot.
        f_set_id: Label of the feature subset; used in the title and file name.
        feature_group: Feature group name; used in the file name.
        results_dir: Folder the PNG is written to. The default is bound to
            the module-level ``results_dir`` when the function is defined.
        display_plot: When True, the figure is also shown before being closed.
    """
    # Sort the labels so every emotion keeps the same colour and legend slot
    # in every plot, whatever the row order of the DataFrame is.
    emotion_order = sorted(df["emotion"].dropna().unique())
    emotion_palette = get_discrete_colour_palette(emotion_order)

    plt.figure(figsize=(8, 6))
    try:
        # keep the emotion column as the identifier; the features become the variables
        df_melted = df.melt(id_vars=["emotion"], value_vars=feature_columns, var_name="Feature", value_name="Value")
        sns.boxplot(x="Feature", y="Value", hue="emotion", hue_order=emotion_order,
                    data=df_melted, palette=emotion_palette)
        plt.xticks(rotation=45)
        plt.title(f"Boxplots of '{f_set_id}' by Emotion")
        plt.legend(title="Emotion", bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig(os.path.join(results_dir, f"boxplot_by_emotion_{feature_group}_{f_set_id}.png"), bbox_inches='tight')

        if display_plot:
            plt.show()
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close()

#### Correlation Analysis Functions

In [None]:
# correlations between features and labels
def correlation_analysis(df, feature_columns, label_column, feature_group, results_dir=results_dir, display_plot=False):
    """Correlate every feature with a label-encoded target and save the results.

    Writes ``correlation_analysis_by_<label>_<group>.csv`` and
    ``correlation_heatmap_by_<label>_<group>.png`` into ``results_dir``.

    Args:
        df: DataFrame holding the feature columns and ``label_column``.
            It is not modified.
        feature_columns: Names of the feature columns to correlate.
        label_column: Name of the label column (e.g. "emotion" or "sentiment").
        feature_group: Feature group name; used in the output file names.
        results_dir: Output folder. The default is bound to the module-level
            ``results_dir`` when the function is defined.
        display_plot: When True, the heatmap is also shown before being closed.
    """
    if len(feature_columns) == 0:
        # Nothing to correlate; an empty table cannot be drawn as a heatmap.
        print(f"No feature columns given for '{label_column}' correlation analysis ({feature_group})")
        return

    # Encode the labels into a local array instead of adding a column to df,
    # so the caller's DataFrame (and its column list) is left untouched.
    # NOTE(review): LabelEncoder assigns integers by sorted class order, so the
    # coefficients reflect that ordering of the classes - confirm this is intended.
    encoded_labels = LabelEncoder().fit_transform(df[label_column])

    # Compute correlations
    correlations = {}
    for feature in feature_columns:
        pearson_corr, _ = pearsonr(df[feature], encoded_labels)
        spearman_corr, _ = spearmanr(df[feature], encoded_labels)
        correlations[feature] = {"Pearson": pearson_corr, "Spearman": spearman_corr}

    # one row per feature, one column per coefficient
    correlation_df = pd.DataFrame(correlations).T
    correlation_df.to_csv(os.path.join(results_dir, f"correlation_analysis_by_{label_column}_{feature_group}.csv"))

    # plot heatmap of correlation coefficient
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_df, annot=True, cmap="coolwarm", linewidths=0.5)
    plt.title(f"Heatmap of feature correlation with '{label_column}' label")
    plt.savefig(os.path.join(results_dir, f"correlation_heatmap_by_{label_column}_{feature_group}.png"), bbox_inches='tight')
    if display_plot:
        plt.show()
    plt.close()

# correlations between the features
def plot_feature_correlation_heatmap(df, feature_columns, feature_group, results_dir=results_dir, threshold=0.7, display_plot=False):
    """Plot the feature-to-feature correlation matrix and list strongly correlated pairs.

    Writes ``correlation_heatmap_features_<group>.png`` and
    ``highly_correlated_features_<group>.csv`` into ``results_dir``.

    Args:
        df: DataFrame holding the feature columns.
        feature_columns: Names of the columns to correlate with each other.
        feature_group: Feature group name; used in the output file names.
        results_dir: Output folder. The default is bound to the module-level
            ``results_dir`` when the function is defined.
        threshold: A pair is reported when its absolute correlation exceeds this.
        display_plot: When True, the heatmap is also shown before being closed.
    """
    corr_matrix = df[feature_columns].corr()

    # heatmap of the full correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5, fmt=".2f")
    plt.title("Feature Correlation Heatmap")
    heatmap_path = os.path.join(results_dir, f"correlation_heatmap_features_{feature_group}.png")
    plt.savefig(heatmap_path, bbox_inches='tight')
    if display_plot:
        plt.show()
    plt.close()

    # Walk the strict upper triangle so each unordered pair is visited once
    # and the diagonal (self-correlation) is skipped.
    labels = corr_matrix.index
    upper_rows, upper_cols = np.triu_indices(len(corr_matrix.columns), k=1)
    high_corr_pairs = [
        (labels[row], labels[col], corr_matrix.iloc[row, col])
        for row, col in zip(upper_rows, upper_cols)
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]

    # save highly correlated pairs to a file
    pairs_path = os.path.join(results_dir, f"highly_correlated_features_{feature_group}.csv")
    high_corr_df = pd.DataFrame(high_corr_pairs, columns=["Feature 1", "Feature 2", "Correlation"])
    high_corr_df.to_csv(pairs_path, index=False)

#### Statistical Analysis Function

In [None]:
def bootstrap_anova(df, label_column, feature, n_iterations, sample_size, random_state=None):
    """Run a one-way ANOVA on repeated random subsets and average the results.

    Each iteration draws ``sample_size`` (a fraction) of the rows without
    replacement, groups the feature values by label, and runs ``f_oneway``.
    Because sampling is without replacement, ``sample_size=1.0`` analyses the
    full data in every iteration.

    Args:
        df: DataFrame holding ``label_column`` and ``feature``.
        label_column: Column whose values define the groups.
        feature: Column whose values are compared across the groups.
        n_iterations: Number of subsets to draw.
        sample_size: Fraction of rows drawn per iteration.
        random_state: Optional seed. When given, the sequence of subsets is
            reproducible; when None, sampling is unseeded as before.

    Returns:
        Tuple ``(mean F-value, mean p-value)`` over the iterations that could
        be tested, or ``(nan, nan)`` when no iteration qualified.
    """
    # One generator shared by all iterations: each draw differs, yet the whole
    # sequence is repeatable for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    f_values, p_values = [], []

    for _ in range(n_iterations):
        subset_df = df.sample(frac=sample_size, replace=False, random_state=rng)
        # group feature values by label, ignoring missing values
        groups = [group[feature].dropna().values for _, group in subset_df.groupby(label_column)]

        # f_oneway needs at least two groups, and each group needs more than one sample
        if len(groups) < 2 or any(len(g) <= 1 for g in groups):
            continue

        f_val, p_val = f_oneway(*groups) # one way ANOVA
        f_values.append(f_val)
        p_values.append(p_val)

    if not f_values:
        # No subset could be tested; avoid taking the mean of an empty list.
        return np.nan, np.nan

    return np.mean(f_values), np.mean(p_values) # return mean values from all iterations

## 2.1 Data Visualisations
Plots are saved to the Results folder (for both frame-level and video-level features).


 #### 2.1.1 RGB_HSV Features


In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[0])

# One box-plot figure is drawn per entry. Channel columns are selected by
# their "<channel>_" name prefix.
channel_prefixes = {
    "Red": "R_",
    "Green": "G_",
    "Blue": "B_",
    "Hue": "H_",
    "Saturation": "S_",
    "Value": "V_",
}
subset_feature_cols = {"Colourfulness": ["colourfulness"]}
for channel_name, prefix in channel_prefixes.items():
    subset_feature_cols[channel_name] = [col for col in feature_cols if col.startswith(prefix)]

for dataset in datasets:
    df = frame_df if dataset == 'frame_level' else video_df

    if df is None:
        print("No colour feature files found")
        continue

    # generate box plots with feature names
    eda_dir = os.path.join(results_dir, dataset, 'eda')
    for subset_f, subset_cols in subset_feature_cols.items():
        plot_boxplot_by_emotion(df, subset_cols, subset_f,
                                feature_group=feature_group[0],
                                results_dir=eda_dir,
                                display_plot=True)


 #### 2.1.2 Audio Features


In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[1])

# Plot title -> columns drawn in that figure (one figure per audio descriptor).
subset_feature_cols = {
    "Spectral centroid": ["spectral_centroid"],
    "Spectral bandwidth": ["spectral_bandwidth"],
    "RMS Energy": ["rms_energy"],
    "Zero Crossing Rate": ["zero_crossing_rate"],
    "Amplitude envelope": ["amplitude_envelope"],
    "Chroma": ["chroma_mean"],
    "MFCCS": ["mfccs_mean"],
    "Delta MFCCS": ["delta_mfccs_mean"],
    "Delta2 MFCCS": ["delta2_mfccs_mean"],
}

for dataset in datasets:
    df = frame_df if dataset == 'frame_level' else video_df

    if df is None:
        print("No audio features files found")
        continue

    # generate box plots with feature names
    eda_dir = os.path.join(results_dir, dataset, 'eda')
    for subset_f, subset_cols in subset_feature_cols.items():
        plot_boxplot_by_emotion(df, subset_cols, subset_f,
                                feature_group=feature_group[1],
                                results_dir=eda_dir,
                                display_plot=True)



 #### 2.1.3 Motion (Optical Flow) Features


In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[2])

# Every motion feature gets its own figure: the column name is both the
# plot label and the single column that is plotted.
subset_feature_cols = {f: f for f in feature_cols}

for dataset in datasets:
    df = frame_df if dataset == 'frame_level' else video_df

    if df is None:
        print("No motion features files found")
        continue

    eda_dir = os.path.join(results_dir, dataset, 'eda')
    for subset_f, subset_col in subset_feature_cols.items():
        plot_boxplot_by_emotion(df, subset_col, subset_f,
                                feature_group=feature_group[2],
                                results_dir=eda_dir,
                                display_plot=True)


## 2.2 Correlation Analysis
Correlations are analysed for both the 'emotion' label and the 'sentiment' label.

 #### 2.2.1 RGB_HSV Features


In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[0])

# Columns that identify a row or hold a label (or a label encoding) rather than a feature.
non_feature_cols = ["video_id", "frame_id", "emotion", "sentiment", "encoded_emotion"]

for dataset in datasets:
    df = frame_df if dataset == 'frame_level' else video_df

    # A missing CSV is loaded as None; skip it instead of failing on df.columns.
    if df is None:
        print("No colour feature files found")
        continue

    feature_cols = [col for col in df.columns if col not in non_feature_cols]
    stats_dir = os.path.join(results_dir, dataset, 'stats_analysis')

    # feature-vs-label correlations for both labels, then feature-vs-feature correlations
    correlation_analysis(df, feature_cols, "emotion", feature_group=feature_group[0], results_dir=stats_dir, display_plot=False)
    correlation_analysis(df, feature_cols, "sentiment", feature_group=feature_group[0], results_dir=stats_dir, display_plot=False)
    plot_feature_correlation_heatmap(df, feature_cols, feature_group=feature_group[0], results_dir=stats_dir, display_plot=True)

 #### 2.2.2 Audio Features


In [None]:

frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[1])

# Columns that identify a row or hold a label (or a label encoding) rather than a feature.
non_feature_cols = ["video_id", "frame_id", "emotion", "sentiment", "encoded_emotion"]

# Name prefixes of the MFCC-type and chroma columns, which are analysed
# separately from the remaining audio features.
mfcc_prefixes = ("mfcc", "delta_mfcc", "delta2_mfcc")
chroma_prefix = "chroma"

# Set to True to also run the analysis on the MFCC / chroma columns.
include_mfcc = False
include_chroma = False

for dataset in datasets:
    df = frame_df if dataset == 'frame_level' else video_df

    # Check for a missing CSV before touching df.columns.
    if df is None:
        print("No audio features files found")
        continue

    feature_cols = [col for col in df.columns
                    if col not in non_feature_cols
                    and not col.startswith(chroma_prefix)
                    and not col.startswith(mfcc_prefixes)]
    feature_cols_mfcc = [col for col in df.columns if col.startswith(mfcc_prefixes)]
    feature_cols_chroma = [col for col in df.columns if col.startswith(chroma_prefix)]

    stats_dir = os.path.join(results_dir, dataset, 'stats_analysis')

    # (file-name tag, columns, whether the label-correlation heatmaps are displayed)
    audio_subsets = [("audio_features", feature_cols, False)]
    if include_mfcc:
        audio_subsets.append(("audio_features_mfcc", feature_cols_mfcc, True))
    if include_chroma:
        audio_subsets.append(("audio_features_chroma", feature_cols_chroma, True))

    # The tag lives in its own variable so the global `feature_group` list,
    # which other cells index into, is never overwritten.
    for audio_group_name, subset_cols, show_label_plots in audio_subsets:
        if not subset_cols:
            print(f"No columns found for {audio_group_name} ({dataset})")
            continue

        correlation_analysis(df, subset_cols, "emotion", audio_group_name, results_dir=stats_dir, display_plot=show_label_plots)
        correlation_analysis(df, subset_cols, "sentiment", audio_group_name, results_dir=stats_dir, display_plot=show_label_plots)
        plot_feature_correlation_heatmap(df, subset_cols, audio_group_name, results_dir=stats_dir, display_plot=True)


## 2.3 Statistical Analysis
Analysed for both the 'emotion' label and the 'sentiment' label (one label per run, selected via `label_column` in each cell).

#### 2.3.1 RGB_HSV features

In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[0]) # selecting RGB_HSV features

# Label to test against: switch the commented line to analyse 'sentiment'.
label_column = 'emotion'
#label_column = 'sentiment'

# Dataset to analyse. To run on the frame-level data instead, enable the
# commented configuration (1% of the rows, 1000 iterations) and disable the
# video-level one below it.
# dataset = 'frame_level'
# df = frame_df
# sample_size = 0.01
# n_iterations = 1000
dataset = 'video_level'
df = video_df
sample_size = 1.0
n_iterations = 1

# NOTE: this rebinds `results`, which the setup cell uses for the list of
# result-type folder names.
results = []

if df is None:
    # The selected CSV was not found; skip instead of failing inside bootstrap_anova.
    print(f"No {feature_group[0]} feature file found for {dataset}")
else:
    print(f"Bootstrapped ANOVA Results for {feature_group[0]} Features:")
    for feature in feature_cols:
        f, p = bootstrap_anova(df, label_column, feature, n_iterations, sample_size)
        print(f"{feature}: Mean F-value = {f:.2f}, Mean p-value = {p:.4f}")

        # Append to results list
        results.append({
            "Feature": feature,
            "Mean F-value": round(f, 2),
            "Mean p-value": round(p, 4)
        })

    results_df = pd.DataFrame(results)

    # save results to a csv file
    results_df.to_csv(os.path.join(results_dir, dataset, 'stats_analysis',
                                   f"bootstrap_anova_sample_size_{sample_size}_{label_column}_{feature_group[0]}.csv"))


#### 2.3.2 Audio features

In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[1]) # selecting audio features

# Label to test against: switch the commented line to analyse 'sentiment'.
label_column = 'emotion'
#label_column = 'sentiment'

# Dataset to analyse. To run on the frame-level data instead, enable the
# commented configuration (1% of the rows, 1000 iterations) and disable the
# video-level one below it.
# dataset = 'frame_level'
# df = frame_df
# sample_size = 0.01
# n_iterations = 1000
dataset = 'video_level'
df = video_df
sample_size = 1.0
n_iterations = 1

# NOTE: this rebinds `results`, which the setup cell uses for the list of
# result-type folder names.
results = []

if df is None:
    # The selected CSV was not found; skip instead of failing inside bootstrap_anova.
    print(f"No {feature_group[1]} feature file found for {dataset}")
else:
    print(f"Bootstrapped ANOVA Results for {feature_group[1]} Features:")
    for feature in feature_cols:
        f, p = bootstrap_anova(df, label_column, feature, n_iterations, sample_size)
        print(f"{feature}: Mean F-value = {f:.2f}, Mean p-value = {p:.4f}")

        # Append to results list
        results.append({
            "Feature": feature,
            "Mean F-value": round(f, 2),
            "Mean p-value": round(p, 4)
        })

    results_df = pd.DataFrame(results)

    # save results to a csv file
    results_df.to_csv(os.path.join(results_dir, dataset, 'stats_analysis',
                                   f"bootstrap_anova_sample_size_{sample_size}_{label_column}_{feature_group[1]}.csv"))

#### 2.3.3 Motion (Optical Flow) Features

In [None]:
frame_df, video_df, feature_cols = get_feature_group_datasets(feature_group[2]) # selecting motion features

# Label to test against: switch the commented line to analyse 'sentiment'.
label_column = 'emotion'
#label_column = 'sentiment'

# Dataset to analyse. To run on the frame-level data instead, enable the
# commented configuration (1% of the rows, 1000 iterations) and disable the
# video-level one below it.
# dataset = 'frame_level'
# df = frame_df
# sample_size = 0.01
# n_iterations = 1000
dataset = 'video_level'
df = video_df
sample_size = 1.0
n_iterations = 1

# NOTE: this rebinds `results`, which the setup cell uses for the list of
# result-type folder names.
results = []

if df is None:
    # The selected CSV was not found; skip instead of failing inside bootstrap_anova.
    print(f"No {feature_group[2]} feature file found for {dataset}")
else:
    print(f"Bootstrapped ANOVA Results for {feature_group[2]} Features:")
    for feature in feature_cols:
        f, p = bootstrap_anova(df, label_column, feature, n_iterations, sample_size)
        print(f"{feature}: Mean F-value = {f:.2f}, Mean p-value = {p:.4f}")

        # Append to results list
        results.append({
            "Feature": feature,
            "Mean F-value": round(f, 2),
            "Mean p-value": round(p, 4)
        })

    results_df = pd.DataFrame(results)

    # save results to a csv file
    results_df.to_csv(os.path.join(results_dir, dataset, 'stats_analysis',
                                   f"bootstrap_anova_sample_size_{sample_size}_{label_column}_{feature_group[2]}.csv"))