In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
import warnings
from tqdm import tqdm

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
import sklearn
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

  and should_run_async(code)


## Loading data

In [38]:
# Read the 'Personality' array from the MATLAB file and wrap it in a DataFrame
# whose five columns carry short trait names.
# (presumably one row per participant, in the same order as the feature files — TODO confirm)
personality = loadmat('data/Dt_Personality.mat')['Personality']
big5 = pd.DataFrame(
    personality,
    columns=['extro', 'agree', 'cons', 'stability', 'open'],
)
#big5.head()

In [39]:
# Load the MATLAB file with the ECG features and keep the 'ECGFeatures_58' array.
# The array is unpacked into per-participant DataFrames in a later cell.
ECG = loadmat('data/Dt_ECGFeatures.mat')
ECG_feats_mat = ECG['ECGFeatures_58']

In [40]:
def unfold_mat(mat, feats=None):
    """Convert a sequence of 2-D blocks into a list of DataFrames.

    Parameters
    ----------
    mat : iterable
        Each element is handed to ``pd.DataFrame`` as-is. In this notebook one
        element corresponds to one participant, with one row per video and one
        column per feature.
    feats : list of str, optional
        Column labels applied to every DataFrame. When omitted, pandas assigns
        its default integer column labels.

    Returns
    -------
    list of pandas.DataFrame
        One frame per element of ``mat``, in the same order. The row index is
        shifted by one so that rows are labelled starting from 1.
    """
    def _one_based_frame(block):
        frame = pd.DataFrame(block, columns=feats)
        # Shift the default 0-based row labels so the first row is labelled 1.
        frame.index = frame.index + 1
        return frame

    return [_one_based_frame(block) for block in mat]

def statistical_measurements(suffix):
    """Return the six statistical-measurement column names for one signal.

    Several modalities report the same six statistics; this builds their
    column labels by appending ``'_' + suffix`` to each statistic name.

    Parameters
    ----------
    suffix : str
        Signal identifier appended to every statistic name (e.g. ``'HR'``).

    Returns
    -------
    list of str
        Six labels, in a fixed order: mean, std, skewness, kurtosis,
        %time_over_mean+std, %time_over_mean-std.
    """
    stat_names = (
        'mean',
        'std',
        'skewness',
        'kurtosis',
        '%time_over_mean+std',
        '%time_over_mean-std',
    )
    return [stat + '_' + suffix for stat in stat_names]

In [41]:
#Isolate ECG (Heart Monitor) Features
# Column labels in file order: 10 low-frequency PSD columns, 4 slow-response PSD
# columns, then the six statistical measurements for each of IBI, HR and HRV.
# The PSD columns deliberately share one label each, so a single drop by label
# removes the whole group.
ECG_cols = ['low_freq_PSD'] * 10 + ['slow_response_PSD'] * 4
for ecg_signal in ('IBI', 'HR', 'HRV'):
    ECG_cols.extend(statistical_measurements(ecg_signal))

ECG_feats = unfold_mat(ECG_feats_mat[0], feats=ECG_cols)

#Remove columns, where all rows are 0.0 or NaN - ECG had a lot of useless readings.
# The list is hard-coded rather than detected from the data.
ECG_useless_cols = [
    'low_freq_PSD',
    'slow_response_PSD',
    'mean_IBI',
    'std_IBI',
    'skewness_IBI',
    'kurtosis_IBI',
]
# One drop per participant instead of six chained drops (each of which copies the frame).
ECG_feats = [participant.drop(columns=ECG_useless_cols) for participant in ECG_feats]

#ECG_feats[0].head()

In [42]:
#Load EEG features
EEG = loadmat('data/Dt_EEGFeatures.mat')
EEG_feats_mat = EEG['EEGFeatures_58']

#Isolate EEG features
# Column labels in file order: five signal-shape features, the six statistical
# measurements, then 11 columns for each of the seven named channels/bands
# (all 11 columns of a group share the same label).
EEG_cols = [
    'avg_first_deriv',
    'prop_neg_diff_samples',
    'mean_num_peaks',
    'mean_deriv_inv_signal',
    'avg_num_peaks_inv_signal',
]
EEG_cols.extend(statistical_measurements('EEG'))
for eeg_group in ('attention', 'meditation', 'alpha', 'beta', 'delta', 'gamma', 'theta'):
    EEG_cols.extend([eeg_group] * 11)

EEG_feats = unfold_mat(EEG_feats_mat[0], feats=EEG_cols)
#EEG_feats[0].head()

In [43]:
#Load GSR features
GSR = loadmat('data/Dt_GSRFeatures.mat')
GSR_feats_mat = GSR['GSRFeatures_58']
# NOTE(review): GSR_fails_mat is loaded but not referenced anywhere in the visible
# part of the notebook — confirm whether failed recordings should be filtered out.
GSR_fails_mat = GSR['GSRFailures_58']

# Names for GSR features. NOTE(review): this list (11 names) is NOT passed to
# unfold_mat below, so the GSR frames keep pandas' default integer column labels.
# Presumably the file holds more columns than are named here — TODO confirm before
# wiring it in, since a length mismatch would make pd.DataFrame raise.
GSR_cols = ['mean_resistance',
            'mean_derivative',
            'mean_differential_neg_values',
            'proportion_neg_derivative_samples',
            'num_local_signal_minima',
            'avg_rising_time',
            'spectral_power_Hz',
            'zero_crossing_slow',
            'zero_crossing_very_slow',
            'mean_scsr',
            'scvsr_peak_mag']


# One DataFrame per participant, columns left unlabelled (see note above).
GSR_feats = unfold_mat(GSR_feats_mat[0])
#GSR_feats[0].head()

In [44]:
#Load facial landmark features
EMO = loadmat('data/Dt_EMOFeatures.mat')
EMO_feats_mat = EMO['EMOFeatures_58']

# Each facial region contributes the same six statistical measurements, so the
# column labels are generated region by region. The order of this tuple fixes
# the column order and must match the layout of the feature file.
EMO_regions = (
    'verti_upper_lip',
    'verti_lower_lip',
    'horis_left_lip',
    'verti_left_lip',
    'horis_right_lip',
    'verti_right_lip',
    'right_eyebrow',
    'left_eyebrow',
    'right_cheek',
    'left_cheek',
    'right_lid',
    'left_lid',
)
EMO_cols = []
for emo_region in EMO_regions:
    EMO_cols.extend(statistical_measurements(emo_region))

EMO_feats = unfold_mat(EMO_feats_mat[0], feats=EMO_cols)
#EMO_feats[0].head()

## Data preprocessing

In [45]:
def get_group(modality, group):
    """Select the rows labelled by ``group`` from every participant's frame.

    Example: ``get_group(EEG_feats, LAHV)`` gives one DataFrame per
    participant, each restricted to the movie clips listed in ``LAHV``.

    Parameters
    ----------
    modality : list of pandas.DataFrame
        One frame per participant, rows indexed by movie clip.
    group : label or list of labels
        Row selector passed unchanged to ``DataFrame.loc``.

    Returns
    -------
    list
        ``frame.loc[group]`` for each participant, in the original order.
    """
    return [participant_frame.loc[group] for participant_frame in modality]

def trait_binary(trait):
    """Binarise one personality trait around its median.

    Reads the ``trait`` column of the module-level ``big5`` frame and marks
    every entry strictly above the column median as 1 (high); entries equal to
    or below the median become 0 (low).

    Parameters
    ----------
    trait : str
        Column name in ``big5``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of 0/1 values, one per row of ``big5``.
    """
    trait_scores = big5[trait]
    threshold = np.median(trait_scores.values)
    is_high = trait_scores > threshold
    return is_high.astype(int).values
    

def format_train_data(modality, trait, shuffle=False):
    """Build the sample matrix and label vector for one modality/trait pair.

    Every usable clip (row) of every participant becomes one sample; its label
    is that participant's binarised trait value from ``trait_binary``.

    Parameters
    ----------
    modality : list of pandas.DataFrame
        One frame per participant (row = clip, column = feature), ordered so
        that position ``i`` matches entry ``i`` of ``trait_binary(trait)``.
    trait : str
        Personality trait name understood by ``trait_binary``.
    shuffle : bool, default False
        When True, samples and labels are shuffled together with a fixed seed
        (``random_state=42``).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``vector_x`` with one row per kept clip and ``vector_y`` with the
        matching 0/1 labels.

    Notes
    -----
    Rows containing NaN or +/-inf are discarded. The input frames are NOT
    modified: the previous implementation replaced infinities in place, which
    silently rewrote the caller's (global) feature frames on first use.

    NOTE(review): all clips of a participant share one label, so shuffling
    spreads each participant across cross-validation folds — confirm this is
    intended, or switch to a grouped split.
    """
    bool5 = trait_binary(trait)

    vector_x = []
    vector_y = []
    for i, participant in enumerate(modality):
        # Non-mutating clean-up: treat infinities as missing, then drop
        # every clip with at least one missing feature.
        cleaned = participant.replace([np.inf, -np.inf], np.nan).dropna()
        # One feature vector per remaining clip, all carrying the participant's label.
        vector_x.extend(cleaned.values)
        vector_y.extend([bool5[i]] * len(cleaned))

    vector_y = np.array(vector_y)
    vector_x = np.array(vector_x)
    if shuffle:
        vector_x, vector_y = sklearn.utils.shuffle(vector_x, vector_y, random_state=42)

    return vector_x, vector_y

def preprocess(features, target_trait):
    """Return standardised, PCA-reduced samples and their labels.

    Pipeline: ``format_train_data`` (shuffled) -> ``StandardScaler`` ->
    ``PCA`` keeping enough components to explain 99% of the variance.

    Parameters
    ----------
    features : list of pandas.DataFrame
        Per-participant feature frames of one modality.
    target_trait : str
        Personality trait used to derive the labels.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The PCA-transformed sample matrix and the label vector.

    NOTE(review): the scaler and PCA are fitted on all samples, before the
    cross-validation split performed in ``results`` — held-out folds therefore
    influence the transform. Confirm this is acceptable.
    """
    X, y = format_train_data(features, target_trait, shuffle=True)
    X_standardised = StandardScaler().fit_transform(X)
    x_pca = PCA(n_components=0.99).fit_transform(X_standardised)
    return x_pca, y

def results(classifier):
    """Evaluate ``classifier`` on every modality/trait pair (unimodal results).

    For each personality trait (column of ``big5``) and each modality
    (ECG, EEG, GSR, EMO) the data is run through ``preprocess`` and scored
    with 10-fold cross-validated F1. Each cell holds the mean fold score
    rounded to two decimals.

    Parameters
    ----------
    classifier : estimator
        Any scikit-learn compatible classifier accepted by ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        Rows = modalities, columns = traits, values = rounded mean F1.

    Side effects
    ------------
    Prints the mean and standard deviation of the (already rounded) cell
    scores and shows a tqdm progress bar over the traits.
    """
    mats = { #Dict mapping each modality name to its per-participant feature frames.
        'EEG': EEG_feats,
        'ECG': ECG_feats,
        'EMO': EMO_feats,
        'GSR': GSR_feats
    }
    scores = []
    #Initialise dataframe
    df = pd.DataFrame(columns=big5.keys(), index=['ECG', 'EEG', 'GSR', 'EMO'])
    for trait in tqdm(list(df.columns)):
        for modal in list(df.index):
            #For each combination of modality and trait:
            X, y = preprocess(mats[modal], trait)
            fold_scores = cross_val_score(classifier, X, y, scoring='f1', cv=10)
            score = round(np.mean(fold_scores), 2)
            # Single-step .loc assignment: the former chained form
            # df[trait][modal] = ... writes to a temporary object under pandas
            # Copy-on-Write and leaves the table full of NaN.
            df.loc[modal, trait] = score
            scores.append(score)

    print("Mean score:", np.mean(scores))
    print("STD score:", np.std(scores))

    return df

#Example usage (preprocess returns two values): X, y = preprocess(EEG_feats, 'open')

## Models

### Baseline (Should we even use it?)

In [46]:
# Chance-level baseline: predictions are drawn at random following the class
# distribution of the training labels. The strategy is pinned explicitly because
# scikit-learn's default changed from 'stratified' to 'prior' in 0.24; with
# 'prior' the classifier predicts a single class and random_state has no effect,
# so the table would no longer be comparable to the scores recorded here.
dummy = DummyClassifier(strategy='stratified', random_state=42)
results(dummy)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.18it/s]

Mean score: 0.4575
STD score: 0.03645202326346235





Unnamed: 0,extro,agree,cons,stability,open
ECG,0.5,0.46,0.46,0.49,0.44
EEG,0.49,0.43,0.41,0.54,0.42
GSR,0.49,0.47,0.46,0.49,0.42
EMO,0.48,0.45,0.39,0.45,0.41


### Logistic Regression

In [47]:
# Re-enable warnings (the import cell silenced them with 'ignore') so that any
# warning raised while fitting this model is shown.
warnings.filterwarnings('always')
# LogisticRegressionCV selects its own hyper-parameters with an internal
# cross-validation scored by F1; results() then evaluates it with a further
# 10-fold cross-validation per modality/trait pair, which is why this cell is slow.
logreg = LogisticRegressionCV(random_state=42, max_iter=2500, scoring='f1')
results(logreg)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:13<00:00, 14.64s/it]

Mean score: 0.544
STD score: 0.09911609354691095





Unnamed: 0,extro,agree,cons,stability,open
ECG,0.63,0.52,0.56,0.62,0.48
EEG,0.63,0.42,0.48,0.64,0.43
GSR,0.57,0.49,0.49,0.59,0.25
EMO,0.66,0.66,0.62,0.56,0.58


### Random Forest Classifier

In [None]:
# Random forest with 500 trees, a fixed seed and 4 parallel jobs.
# NOTE(review): the saved output below stops at 80% progress, so the result
# table for this model was never recorded — re-run the cell to completion.
rfc = RandomForestClassifier(n_estimators=500, n_jobs=4, random_state=42)
results(rfc)

  and should_run_async(code)
 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [03:02<00:45, 45.85s/it]