#### Emotion analysis using OpenFace data

In [5]:
import os
import numpy as np
import pandas as pd
from IPython.display import display, clear_output

In [6]:
# get filenames of the data files
dir_path = "2_speech_AUs_dimension_matched"
filenames = os.listdir(dir_path)
filenames[:5]
print(len(filenames))

276


In [7]:
# read individual csv files, flatten them, and combined into master data file
dfs = [] # empty list for storing flattened individual dataframes
i = 0
for fname in filenames:
    clear_output(wait=True)
    i += 1
    print(i)
    
    fpath = os.path.join(dir_path, fname) # get full "relative" path of the file
    df = pd.read_csv(fpath) # read the file's data
    
    # flatten the data
    df = df.to_numpy()
    df = np.reshape(df, (1, df.shape[0] * df.shape[1])) # flatten
    df = pd.DataFrame(df)
    
    # create classification labels (emotion)
    x = fname.split('-')[2]
    df.insert(0,'emotion', x) # add new column
    df.reset_index(drop=True, inplace=True)
    
    dfs.append(df) # append to the list

data_flattened = pd.concat(dfs, ignore_index=False) # convert lists of df's to single big df  
data = data_flattened.fillna(0) # fill nan values with zeros

# data.to_csv('flattened_AU_data3.csv', index = False) # write to CSV

276


In [8]:
# check for missing values in the dataframe
nan_count = data.isna().sum().sum()
nan_count

0

In [9]:
# classification using logistic regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = data.iloc[:, 1:]
y = data.iloc[:, 0:1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 365)

# clf = LogisticRegression(max_iter = 1000)
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter = 1000))
y_train = y_train.values.ravel() # flatten y to make it compatible with the input required
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.9285714285714286

In [10]:
# classification using random forest classifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X = data.iloc[:, 1:]
y = data.iloc[:, 0:1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 365)

# clf = RandomForestClassifier()
clf = make_pipeline(StandardScaler(), RandomForestClassifier())
y_train = y_train.values.ravel() # flatten y to make it compatible with the input required
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.9642857142857143

In [11]:
# classification using support vector machine (SVM)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X = data.iloc[:, 1:]
y = data.iloc[:, 0:1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 365)

# clf = SVC(gamma='auto')
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
y_train = y_train.values.ravel() # flatten y to make it compatible with the input required
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.9107142857142857

In [12]:
# classification using CNN
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [32]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0:1]

X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 365)

X_train = np.expand_dims(X_train, axis=0)
X_test = np.expand_dims(X_test, axis=0)

print(X_train.shape)


(1, 220, 2550)


In [34]:
# bulid the neural network
inputs = X_train
model = tf.keras.Sequential([
    # input layer
    tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu,
                           input_shape=(200, 200, 3)),
    tf.keras.layers.Flatten(input_shape = (inputs.shape[1], inputs.shape[2])), 
    # one hidden layer
    tf.keras.layers.Dense(20, activation = 'relu'),
    # one hidden layer
    tf.keras.layers.Dense(20, activation = 'relu'),
    # output layer
    tf.keras.layers.Dense(4, activation = 'softmax')
])

# compile the network
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=["accuracy"])
model.summary()

# trian the network
history = model.fit(X_train, y_train, 
          validation_data=(X_test, y_test),
          epochs = 200,
          batch_size = 32)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_4 (Flatten)         (None, 561000)            0         
                                                                 
 dense_12 (Dense)            (None, 20)                11220020  
                                                                 
 dense_13 (Dense)            (None, 20)                420       
                                                                 
 dense_14 (Dense)            (None, 4)                 84        
                                                                 
Total params: 11,220,524
Trainable params: 11,220,524
Non-trainable params: 0
_________________________________________________________________


ValueError: Data cardinality is ambiguous:
  x sizes: 1
  y sizes: 220
Make sure all arrays contain the same number of samples.