****
<div class='heading'>
    <div style='float:left;'><h1>CPSC 6300: Applied Data Science Final Project</h1></div>
    <img style="float: right; padding-right: 5px; width: 80px" src="https://raw.githubusercontent.com/bsethwalker/clemson-cs4300/main/images/clemson_paw.png"> </div>
Clemson University<br>
Spring 2023<br>

## TOPIC - Continuous Affect Recognition from Multimodal Signals in Videos

**Instructor(s):** Nina Hubig <br>

**Group Members:** Charanjit Singh (C15246652) | Parampreet Singh (C19377466) | Vinod Ramavath (C13139775)

---

In [1]:
""" RUN THIS CELL TO GET THE RIGHT FORMATTING """
import requests
from IPython.display import HTML  # public import path; IPython.core.display is deprecated for this

# Course stylesheet. It only changes how the notebook is rendered, not any result.
css_file = 'https://raw.githubusercontent.com/bsethwalker/clemson-cs4300/main/css/cpsc6300.css'

try:
    # Bound the wait so a missing network connection cannot hang the first cell.
    response = requests.get(css_file, timeout=10)
    response.raise_for_status()
    styles = response.text
except requests.RequestException as exc:
    # Styling is cosmetic: fall back to the default look instead of failing the notebook.
    print(f"Could not load the course stylesheet ({exc}); using default styling.")
    styles = ""

HTML(styles)

## DATA PREPROCESSING:


### Visual Representation:

#### 1. Extracting Frames:

In [None]:
import cv2
import os

# Set the paths to the input and output directories
input_dir = "/Data/OMG_Emotion_videos/videos"
output_dir = "/Data/Frames"

# Keep one frame out of every `frame_stride` decoded frames.
frame_stride = 10


def extract_frames(video_path, output_path, prefix, stride=10):
    """Save every `stride`-th frame of a video as a JPEG.

    Frames are counted from 1 in decoding order; frame number `n` is written
    to `<output_path>/<prefix><n>.jpg` when `n` is a multiple of `stride`.

    Args:
        video_path: path of the video file to decode.
        output_path: existing directory that receives the JPEG files.
        prefix: file-name prefix for the saved frames.
        stride: sampling step, in frames.

    Returns:
        The number of frames written.
    """
    cap = cv2.VideoCapture(video_path)
    saved = 0
    try:
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1
            if frame_idx % stride != 0:
                continue
            # The running frame number keeps every file name unique. Naming the
            # file after a counter that is reset per sample would give every
            # frame the same name and overwrite the previous one.
            output_file = os.path.join(output_path, f"{prefix}{frame_idx}.jpg")
            cv2.imwrite(output_file, frame)
            saved += 1
    finally:
        # Release the decoder even if reading or writing fails.
        cap.release()
    return saved


# NOTE(review): OMGDataset reads frames from `<base>/<video>/<utterance>/<n>.jpg`;
# the layout written here is `<output_dir>/<video>/<file name><n>.jpg` - confirm
# which layout the annotation files expect.
k = 1
# Loop through all of the video folders in the input directory
for filename in os.listdir(input_dir):
    print(k)
    vid_path = os.path.join(input_dir, filename, "video")
    # Skip stray entries (e.g. hidden files) that have no "video" sub-folder.
    video_names = os.listdir(vid_path) if os.path.isdir(vid_path) else []
    for vid_name in video_names:
        if vid_name.endswith(".mp4"):
            output_path = os.path.join(output_dir, filename)
            os.makedirs(output_path, exist_ok=True)
            extract_frames(os.path.join(vid_path, vid_name), output_path, vid_name, frame_stride)
    k += 1

#### 2. Applying MTCNN to Align Faces

In [None]:
import os

import cv2
from mtcnn import MTCNN
import matplotlib.pyplot as plt

# load MTCNN model
mtcnn_model = MTCNN()

# Set the paths to the input and output directories
input_dir = "/Data/Frames"
output_dir = "/Data/DataLoader/Aligned_faces"

# Loop through all of the per-video frame folders in the input directory.
# The progress counter is local to this cell so it does not depend on a
# variable left over from an earlier cell.
for k, filename in enumerate(os.listdir(input_dir), start=1):
    print(k)
    img_path = os.path.join(input_dir, filename)
    if not os.path.isdir(img_path):
        continue

    # Create the output FOLDER only; the image path itself must stay a file.
    output_folder = os.path.join(output_dir, filename)
    os.makedirs(output_folder, exist_ok=True)

    for img_name in os.listdir(img_path):
        if not img_name.endswith(".jpg"):
            continue

        input_path = os.path.join(img_path, img_name)
        output_path = os.path.join(output_folder, img_name)

        # load image (cv2.imread returns None when the file cannot be decoded)
        img = cv2.imread(input_path)
        if img is None:
            print('Could not read image: {}'.format(input_path))
            continue

        # OpenCV decodes to BGR, while MTCNN and plt.imsave both work with RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # detect face using MTCNN (returns a list, empty when nothing is found)
        boxes = mtcnn_model.detect_faces(img)

        if boxes:
            # extract the first detected face
            x, y, w, h = boxes[0]['box']
            x_end, y_end = x + w, y + h
            # A box can start slightly outside the image; a negative start index
            # would wrap around when slicing, so clamp it to the border.
            x, y = max(x, 0), max(y, 0)

            sliced_data = img[y:y_end, x:x_end]

            # save the sliced image
            plt.imsave(output_path, sliced_data)
        else:
            print('No face detected in the image.')

### Acoustic Representation

#### Converting video snippets to WAV files and extracting STFT maps

In [None]:
import os
import moviepy.editor as mp
import librosa
import numpy as np

# Set the paths to the input and output directories
input_dir = "/Data/OMG_Emotion_videos/videos"
output_dir = "/Data/DataLoader/STFT_maps"

# Sample rate, window size and hop length for the STFT (identical for every
# file, so defined once outside the loops).
sr = 16000
n_fft = 512
win_length = int(sr * 0.025)
hop_length = int(sr * 0.01)

# The second axis of the STFT counts FRAMES (one per hop), not audio samples,
# so a 3-second chunk is 3 * sr / hop_length frames (300 with these settings).
chunk_seconds = 3
frames_per_chunk = (chunk_seconds * sr) // hop_length

# NOTE(review): OMGDataset loads `<base>/<video>/<utterance>/<i>.npy`, while this
# cell writes `<output_dir>/<video>/<file name>_spec_<i>.npy` - confirm the layout.
# NOTE(review): the last chunk of a file can be shorter than `frames_per_chunk`;
# confirm whether downstream code needs fixed-size chunks.

# Loop through all of the video files in the input directory
for filename in os.listdir(input_dir):
    vid_path = os.path.join(input_dir, filename, "video")
    for vid_name in os.listdir(vid_path):
        if vid_name.endswith(".mp4"):
            # Set the path to the input video file and create the output directory
            input_path = os.path.join(vid_path, vid_name)
            output_path = os.path.join(output_dir, filename)
            os.makedirs(output_path, exist_ok=True)
            wav_path = os.path.join(output_path, 'audio.wav')

            # Extract audio from the video and save it to a WAV file
            audio_clip = mp.AudioFileClip(input_path)
            try:
                audio_clip.write_audiofile(wav_path)
            finally:
                # Release the reader held by the clip.
                audio_clip.close()

            # Load audio from the WAV file, resampled to `sr` and mixed to mono
            audio, _ = librosa.load(wav_path, sr=sr, mono=True)

            # Calculate the STFT spectrogram
            spec = librosa.stft(audio, n_fft=n_fft, win_length=win_length, hop_length=hop_length, window='hamming')

            # Stack the real and imaginary parts as the last axis
            spec = np.stack([spec.real, spec.imag], axis=-1)

            # Split the spectrogram into 3-second chunks along the frame axis
            num_frames = spec.shape[1]
            num_chunks = int(np.ceil(num_frames / frames_per_chunk))
            for i in range(num_chunks):
                start_idx = i * frames_per_chunk
                end_idx = min(start_idx + frames_per_chunk, num_frames)
                chunk_spec = spec[:, start_idx:end_idx, :]

                # Save each chunk to its own file
                output_file = os.path.join(output_path, f'{vid_name}_spec_{i}.npy')
                np.save(output_file, chunk_spec)


## MODEL IMPLEMENTATION

### Initialization

In [7]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
#import net_sphere
import torch.optim as optim
from torch.nn.utils import clip_grad_norm
from skimage import io
import numpy as np
import datetime,sys
from numpy.random import randint
import torchvision.models as models
from calculateEvaluationCCC import calculateCCC
import glob
import os

# Define parameters
use_cuda = torch.cuda.is_available()  # train()/validate() move batches to the GPU when True
lr = 0.001  # initial SGD learning rate; the training loop multiplies it by 0.1 at each epoch in lr_steps
bs = 6  # DataLoader batch size
n_epoch = 30  # number of training epochs
lr_steps = [7,14,21,28]  # epochs at which the learning rate is decayed
gpu_id = [0,1]  # NOTE(review): not referenced anywhere else in the visible notebook

gd = 20 # max gradient norm passed to gradient clipping in train(); None disables clipping
eval_freq = 3  # validate every eval_freq epochs (and after the last epoch)
print_freq = 100  # print the running training loss every print_freq batches
num_worker = 4  # DataLoader worker processes
num_seg = 16  # frames sampled per utterance; also the sequence length VNet reshapes to
num_stft = 4  # STFT chunks sampled per utterance and averaged in AVNet
flag_biLSTM = True  # make the LSTM in VNet bidirectional


# Annotation files read by OMGDataset (space-separated, no header row)
train_list_path = "/Data/DataLoader/Annotations_train.csv"
val_list_path = "/Data/DataLoader/Annotations_val.csv"

### VideoNet

In [None]:
# `net_sphere` is used below, but its import is commented out in the
# initialization cell; import it here so this cell runs on a fresh kernel.
import net_sphere

# Build the `sphere20a` network defined in net_sphere.
sphereface = net_sphere.sphere20a()
sphereface.feature = True # remove the last fc layer because we need to use LSTM first

class VNet(torch.nn.Module):
    """Visual branch: per-frame embeddings -> LSTM -> average over time.

    Args:
        sphereface: module that maps a batch of frames to one feature vector
            per frame. The LSTM input size is 512, so the backbone must
            output 512 features per frame.
        feature: if True, `forward` returns the pooled 512-d clip feature;
            otherwise it returns the 2-d tanh output of the linear head.
        n_segments: number of frames per clip. Defaults to the notebook-level
            `num_seg`.
        bidirectional: whether the LSTM is bidirectional. Defaults to the
            notebook-level `flag_biLSTM`.
    """

    def __init__(self, sphereface, feature=True, n_segments=None, bidirectional=None):
        super(VNet, self).__init__()
        # Resolve the notebook-level defaults once so the module does not keep
        # reading globals at forward time.
        self.num_seg = num_seg if n_segments is None else n_segments
        self.bidirectional = flag_biLSTM if bidirectional is None else bidirectional

        self.sphereface = sphereface
        self.linear = torch.nn.Linear(512,2)
        self.tanh = torch.nn.Tanh()
        self.feature = feature
        self.avgPool = torch.nn.AvgPool2d((self.num_seg,1), stride=1)
        # Input dim, hidden dim, num_layer. No `dropout` argument: LSTM dropout
        # is only applied between stacked layers, so with a single layer it
        # has no effect and only triggers a warning.
        self.LSTM = torch.nn.LSTM(512, 512, 1, batch_first=True, bidirectional=self.bidirectional)
        for name, param in self.LSTM.named_parameters():
            if 'bias' in name:
                torch.nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                torch.nn.init.orthogonal_(param)

    def sequentialLSTM(self, input, hidden=None):
        """Run the LSTM over each clip and average its outputs over time.

        Args:
            input: tensor of shape (batch * num_seg, features).
            hidden: optional initial LSTM state, forwarded to the LSTM.

        Returns:
            Tensor of shape (batch, 512).
        """
        # (batch * num_seg, features) -> (batch, num_seg, features)
        input_lstm = input.view([-1, self.num_seg, input.shape[1]])
        batch_size = input_lstm.shape[0]

        self.LSTM.flatten_parameters()

        output_lstm, hidden = self.LSTM(input_lstm, hidden)
        if self.bidirectional:
            # Split the last axis into (direction, hidden) and add the forward
            # and backward outputs so the feature size stays at 512.
            output_lstm = output_lstm.contiguous().view(batch_size, output_lstm.size(1), 2, -1).sum(2)

        # Average over the time axis with the (num_seg, 1) pooling window.
        output_lstm = output_lstm.view(batch_size,1,self.num_seg,-1)
        out = self.avgPool(output_lstm)
        out = out.view(batch_size,-1)
        return out

    def forward(self, x):
        """Return the clip feature, or the 2-d tanh output when `feature` is False."""
        x = self.sphereface(x)
        x = self.sequentialLSTM(x)
        if self.feature:
            return x

        x = self.linear(x)
        x = self.tanh(x)

        return x

# Visual branch built around the backbone created at the top of this cell.
# `feature` keeps its default (True), so model_v outputs features, not predictions.
model_v = VNet(sphereface)

### AudioNet

In [6]:
# Convolutional part of VGG-16 with randomly initialised weights. No pretrained
# weights are requested by default, so the deprecated `pretrained=False`
# keyword is not needed.
vgg = models.vgg16().features

# Drop the first layer of VGG-16; it is replaced below.
removed = list(vgg.children())[1:]
vgg = torch.nn.Sequential(*removed)

# We modified the first layer of vgg16: it now takes 2 input channels
# (presumably the real and imaginary STFT parts - confirm against the loader).
# Conv2d is created without padding, so it trims one pixel per border.
# NOTE(review): ANet flattens to 512*7*9, which appears to rely on this
# unpadded layer for 257x300 inputs - re-check before adding padding.
vgg_modified = torch.nn.Sequential(torch.nn.Conv2d(2,64,3),vgg)

class ANet(torch.nn.Module):
    """Audio branch: a convolutional trunk followed by fully connected layers.

    Args:
        vgg: convolutional module whose output can be reshaped to rows of
            512*7*9 values.
        feature: when equal to True, `forward` returns the 512-d output of
            the second fully connected layer; otherwise it returns the 2-d
            tanh output of the third one.
    """

    def __init__(self, vgg,feature=True):
        super().__init__()
        self.vgg = vgg
        # Layers are created in the same order as before so the parameter
        # initialisation draws are unchanged.
        self.fc1 = torch.nn.Linear(512*7*9,4096)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()
        self.fc2 = torch.nn.Linear(4096,512)
        self.fc3 = torch.nn.Linear(512,2)
        self.tanh = torch.nn.Tanh()
        self.feature = feature

    def forward(self, x):
        """Apply the trunk and the fully connected head to `x`."""
        conv_maps = self.vgg(x)
        flat = conv_maps.view([-1,512*7*9])
        hidden = self.dropout(self.relu(self.fc1(flat)))
        embedding = self.dropout(self.relu(self.fc2(hidden)))
        # Equality test (not truthiness) is kept on purpose.
        if self.feature == True:
            return embedding
        return self.tanh(self.fc3(embedding))

# Audio branch built on the modified VGG trunk from this cell. `feature` keeps
# its default (True), so model_a outputs 512-d features, not predictions.
model_a = ANet(vgg_modified)

### Joint Training

In [None]:
class AVNet(torch.nn.Module):
    """Joint model: concatenates visual and audio features and predicts 2 values.

    Args:
        vnet: visual module returning one 512-d feature per sample.
        anet: audio module returning one 512-d feature per STFT chunk, with
            `num_stft` consecutive chunks per sample.
    """

    def __init__(self, vnet,anet):
        super().__init__()
        self.vnet, self.anet = vnet, anet
        self.avgPool = torch.nn.AvgPool2d((num_stft,1), stride=1)
        self.fc = torch.nn.Linear(1024,2)
        self.tanh = torch.nn.Tanh()

    def forward(self, xi,xs):
        """Return the tanh-squashed 2-d prediction for frames `xi` and STFT maps `xs`."""
        visual_feat = self.vnet(xi)

        # Group the chunk features per sample, then average over the chunks.
        audio_feat = self.anet(xs).view((-1,1,num_stft,512))
        audio_feat = self.avgPool(audio_feat).view(-1,512)

        fused = torch.cat((visual_feat, audio_feat), 1)
        return self.tanh(self.fc(fused))

# Dummy inputs with the shapes the joint model takes: 32 RGB face crops of
# 96x112 and 8 two-channel STFT maps of 257x300. They are not used anywhere
# else in the visible notebook. Plain tensors are enough here; the deprecated
# `torch.autograd.Variable` wrapper is no longer needed for autograd.
xi = torch.randn(32,3,96,112)
xs = torch.randn(8,2,257,300)

model = AVNet(model_v, model_a)


if use_cuda:
    model.cuda()

### OMG Dataset Loader

In [None]:
class OMGDataset(Dataset):
    """OMG dataset: one sample per annotated utterance.

    The annotation file is read as space-separated text without a header.
    Columns are used by position: 0 = video id, 1 = utterance id, 2 and 3 =
    the two regression targets, last = comma-separated list of frame numbers.

    Args:
        txt_file: path of the annotation file.
        base_path_v: root directory of the face images, laid out as
            `<base_path_v>/<video>/<utterance>/<frame number>.jpg`.
        base_path_a: root directory of the STFT chunks, laid out as
            `<base_path_a>/<video>/<utterance>/<index>.npy`.
        transform: optional callable applied to each frame tensor.
    """

    def __init__(self, txt_file, base_path_v, base_path_a, transform=None):
        self.base_path_v = base_path_v
        self.base_path_a = base_path_a
        self.data = pd.read_csv(txt_file, sep=" ", header=None)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _parse_frame_ids(cell):
        """Turn a string such as '3,7,12,' into a list of ints.

        The last comma-separated token is dropped (presumably the empty
        string after a trailing comma - TODO confirm against the file).
        A real list is returned so that `len()` and indexing work.
        """
        return [int(token) for token in cell.split(',')[:-1]]

    @staticmethod
    def _sample_offsets(count, num_segments):
        """Pick `num_segments` indices in `range(count)`.

        Inspired by TSN's pytorch code. With more items than segments, the
        range is cut into equal segments and one random index is drawn from
        each; otherwise indices are spread evenly (and may repeat).

        Raises:
            ValueError: if `count` is not positive.
        """
        if count <= 0:
            raise ValueError('cannot sample %d segments from an empty sequence' % num_segments)
        average_duration = count // num_segments
        if count > num_segments:
            return np.multiply(list(range(num_segments)), average_duration) + randint(average_duration, size=num_segments)
        tick = count / float(num_segments)
        return np.array([int(tick / 2.0 + tick * x) for x in range(num_segments)])

    def __getitem__(self, idx):
        """Return `(images, stfts, label, (vid, utter))` for row `idx`.

        `images` holds `num_seg` frames and `stfts` holds `num_stft` STFT
        chunks, each moved to channel-first layout and concatenated along
        dimension 0. `label` is a float32 tensor with the two targets.
        """
        vid = self.data.iloc[idx,0]
        utter = self.data.iloc[idx,1]
        img_list = self._parse_frame_ids(self.data.iloc[idx,-1])

        # stack images within a video in the depth dimension
        frames = []
        for i in self._sample_offsets(len(img_list), num_seg):
            # os.path.join works whether or not the base path ends with a separator.
            frame_path = os.path.join(self.base_path_v, str(vid), str(utter), '%d.jpg' % img_list[i])
            image = io.imread(frame_path).astype(np.float32)
            image = torch.from_numpy(((image - 127.5)/128).transpose(2,0,1))
            if self.transform:
                # Apply the transform to every frame that is returned.
                image = self.transform(image)
            frames.append(image)
        images = torch.cat(frames, 0)

        # stft data acquisition
        stft_path = os.path.join(self.base_path_a, str(vid), str(utter))
        stfts_count = len(glob.glob(os.path.join(stft_path, '*.npy')))

        chunks = []
        for ind in self._sample_offsets(stfts_count, num_stft):
            stft = np.load(os.path.join(stft_path, '%d.npy' % ind)).astype(np.float32)
            max_val = max(np.abs(np.max(stft)),np.abs(np.min(stft)))
            mean_val = np.mean(stft)
            # An all-zero chunk (silence) has max_val == 0; avoid dividing by zero.
            scale = max_val if max_val > 0 else 1.0
            chunks.append(torch.from_numpy(((stft - mean_val)/scale).transpose(2,0,1)))
        stfts = torch.cat(chunks, 0)

        label = torch.from_numpy(np.array([self.data.iloc[idx,2], self.data.iloc[idx,3]]).astype(np.float32))

        return (images, stfts, label, (vid,utter))

 
# Roots of the preprocessed data. The trailing separator keeps these usable
# both as plain string prefixes and as arguments to os.path.join.
base_path_v = "/Data/DataLoader/Aligned_faces/"
base_path_a = "/Data/DataLoader/STFT_maps/"

# Training batches are shuffled; validation keeps the annotation-file order.
train_loader = DataLoader(OMGDataset(train_list_path,base_path_v, base_path_a), 
                          batch_size=bs, shuffle=True, num_workers=num_worker)
val_loader = DataLoader(OMGDataset(val_list_path,base_path_v, base_path_a), 
                         batch_size=bs, shuffle=False, num_workers=num_worker)

In [None]:
def printoneline(*argv):
    """Write the arguments, space-separated, to stdout after a carriage return.

    No newline is appended, so the next call overwrites the same terminal line.
    """
    line = ' '.join(str(arg) for arg in argv)
    sys.stdout.write('\r' + line)
    sys.stdout.flush()
    
def dt():
    """Return the current local time formatted as HH:MM:SS."""
    now = datetime.datetime.now()
    return format(now, '%H:%M:%S')

def save_model(model,filename):
    """Save the state dict of `model` (not the module object) to `filename`."""
    torch.save(model.state_dict(), filename)

### Evaluation Metrics

In [None]:
def pearsonr(outputs, targets):
    """Pearson correlation coefficient between two tensors.

    Both tensors are centred on their own mean over all elements. The result
    is a 0-dim tensor, and is NaN when either input is constant because the
    denominator is then zero.
    """
    centered_out = outputs - outputs.mean()
    centered_tar = targets - targets.mean()
    covariance = (centered_out * centered_tar).sum()
    norm_out = centered_out.pow(2).sum().sqrt()
    norm_tar = centered_tar.pow(2).sum().sqrt()
    return covariance / (norm_out * norm_tar)  # use Pearson correlation

def calCCC(out, tar, rho):
    """Concordance correlation coefficient of predictions `out` and targets `tar`.

    Computes 2 * rho * std(tar) * std(out) /
    (std(out)**2 + std(tar)**2 + (mean(out) - mean(tar))**2).

    Args:
        out: tensor of predictions.
        tar: tensor of targets.
        rho: Pearson correlation between `out` and `tar`.

    Returns:
        A 0-dim tensor.
    """
    # NOTE(review): torch.std uses the unbiased (N-1) estimator by default;
    # confirm this matches the statistics used by the evaluation script.
    true_mean = torch.mean(tar)
    pred_mean = torch.mean(out)
    std_predictions = torch.std(out)
    std_gt = torch.std(tar)

    ccc = 2 * rho * std_gt * std_predictions / (
        std_predictions ** 2 + std_gt ** 2 +
        (pred_mean - true_mean) ** 2)

    return ccc

def calLoss(outputs, targets):
    """Negative sum of the concordance correlation coefficients of both columns.

    Column 0 and column 1 of `outputs` are each compared with the same
    column of `targets`. The sum is negated so that minimising the returned
    value maximises the two coefficients.
    """
    ccc_per_column = []
    for col in (0, 1):
        out_col = outputs[:, col]
        tar_col = targets[:, col]
        rho = pearsonr(out_col, tar_col)
        ccc_per_column.append(calCCC(out_col, tar_col, rho))

    return -(ccc_per_column[0] + ccc_per_column[1])

### Training

In [None]:
# NOTE: this MSE criterion is passed to train(), but train() computes its loss
# with calLoss and never calls `criterion`.
criterion = torch.nn.MSELoss()
def train(train_loader, model, criterion, optimizer, epoch):
    """Train `model` for one epoch with the loss returned by calLoss.

    Args:
        train_loader: iterable yielding (frames, stfts, targets, ids) batches.
        model: network called as `model(frames, stfts)`.
        criterion: kept for interface compatibility; not used, the loss
            comes from calLoss.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.
    """
    model.train()

    train_loss = 0.0

    for i, (inputs_v, inputs_a, targets, _) in enumerate(train_loader):

        optimizer.zero_grad()

        if use_cuda:
            inputs_v, inputs_a, targets = inputs_v.cuda(), inputs_a.cuda(), targets.cuda()

        # Fold the per-sample stacks into the batch axis: frames become
        # (N, 3, H, W) and STFT maps become (N, 2, H, W).
        inputs_v = inputs_v.view((-1,3)+inputs_v.size()[-2:])
        inputs_a = inputs_a.view((-1,2)+inputs_a.size()[-2:])

        outputs = model(inputs_v, inputs_a)

        loss = calLoss(outputs, targets)

        loss.backward()

        # tsn uses clipping gradient. Clipping has to happen between
        # backward() and step(); after the step it cannot affect the update.
        if gd is not None:
            total_norm = float(torch.nn.utils.clip_grad_norm_(model.parameters(), gd))
            if total_norm > gd:
                print('clippling gradient: {} with coef {}'.format(total_norm, gd/total_norm))

        optimizer.step()

        # .item() reads the scalar loss; indexing a 0-dim tensor raises an error.
        train_loss += loss.item()

        if i % print_freq == 0:
            printoneline(dt(),'Epoch=%d Loss=%.4f\n'
                % (epoch,train_loss/(i+1)))

In [None]:
def validate(val_loader, model, epoch):
    """Write the model's predictions to a CSV file and score them.

    Predictions go to `results/joint_ccc2_<epoch>.csv` with the columns
    video, utterance, arousal, valence. The file is then scored by
    calculateCCC.

    Args:
        val_loader: iterable yielding (frames, stfts, targets, (vid, utter)).
        model: network called as `model(frames, stfts)`.
        epoch: epoch number used in the output file name.

    Returns:
        The tuple (arouCCC, valeCCC) returned by calculateCCC.
    """
    model.eval()

    out_name = 'results/joint_ccc2_%d.csv'%epoch
    # Create the output folder up front so open() cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(out_name), exist_ok=True)

    # no_grad: no graph is needed for inference, which saves memory.
    # The `with` block closes the file before it is read back below.
    with open(out_name, 'w') as txt_result, torch.no_grad():
        txt_result.write('video,utterance,arousal,valence\n')
        for (inputs_v, inputs_a, targets,(vid, utter)) in val_loader:
            if use_cuda:
                inputs_v, inputs_a = inputs_v.cuda(), inputs_a.cuda()

            inputs_v = inputs_v.view((-1,3)+inputs_v.size()[-2:])
            inputs_a = inputs_a.view((-1,2)+inputs_a.size()[-2:])

            # Plain Python floats, one [arousal, valence] pair per sample.
            outputs = model(inputs_v, inputs_a).cpu().tolist()

            for i in range(len(vid)):
                txt_result.write('%s,%s.mp4,%f,%f\n'%(vid[i], utter[i],outputs[i][0],outputs[i][1]))

    # NOTE(review): predictions are written under the relative folder 'results/',
    # while the reference file is read from the absolute '/results/' - confirm.
    arouCCC, valeCCC = calculateCCC('/results/omg_ValidationVideos.csv',out_name)
    return (arouCCC,valeCCC)

### Training and Validation Loop

In [None]:
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)

# Decay a local copy of the learning rate. Mutating the notebook-level `lr`
# would make a second run of this cell start from an already-decayed value.
current_lr = lr

# Create the checkpoint folder before training so a missing or unwritable
# folder fails now rather than after the last epoch.
checkpoint_dir = '/pth'
os.makedirs(checkpoint_dir, exist_ok=True)

# Baseline scores of the untrained model.
best_arou_ccc, best_vale_ccc = validate(val_loader, model, 0)
for epoch in range(n_epoch):
    if epoch in lr_steps:
        current_lr *= 0.1
        # Change the rate in place: building a new optimizer would discard
        # the SGD momentum buffers.
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_lr

    train(train_loader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    if (epoch+1)%eval_freq == 0 or epoch == n_epoch-1:
        arou_ccc, vale_ccc = validate(val_loader, model, epoch)

        if (arou_ccc+vale_ccc) > (best_arou_ccc + best_vale_ccc):
            best_arou_ccc = arou_ccc
            best_vale_ccc = vale_ccc
            # Keep the best-scoring weights; the best scores were tracked
            # but nothing was saved for them before.
            save_model(model, os.path.join(checkpoint_dir, 'joint_ccc2_best.pth'))

# Final weights, named after the last epoch and its validation scores.
save_model(model,'/pth/joint_ccc2_{}_{}_{}.pth'.format(epoch, round(arou_ccc,4), round(vale_ccc,4)))