## Download the code repository, annotations, and data archives

In [None]:
# Download the annotations and stuff

!git clone https://github.com/leolani/cltl-face-all

!wget https://raw.githubusercontent.com/leolani/cltl-face-all/master/examples/smaller-datatsets-jsons/datasets.json


!gdown --id 1-2LeHC_5Cm2gWWT6vBrVhp8jorbjkN1_
!unzip visual-features.zip
!rm visual-features.zip

!gdown --id 16ck7plW9v9eSHGCs5wuB2AhhufPRt3Wi
!unzip smaller-dataset.zip
!rm smaller-dataset.zip

## Read pre-computed visual features and annotations

In [None]:
import json
import numpy as np
import os
import json
import av
import cv2
import random
from glob import glob
import pickle
from datetime import datetime, timedelta

# Directories that the downloaded archives are expected to unpack into.
VISUAL_FEATURES_PATH = 'visual-features/'
VIDEOS_PATH = "smaller-dataset/"

# Annotations: only the 'large' entry of the JSON file is kept.
with open('datasets.json', 'r') as annotation_file:
    datasets = json.load(annotation_file)['large']

# One .npy file per utterance; the key is the file name up to '.npy' and the
# value is the Python object stored inside the 0-d object array.
# NOTE: allow_pickle=True executes pickled code on load, so only use this on
# trusted files.
visual_features = {}
for feature_file in glob('visual-features/*.npy'):
    utterance_id = os.path.basename(feature_file).split('.npy')[0]
    visual_features[utterance_id] = np.load(feature_file, allow_pickle=True).item()

# Lookup that is later indexed as friends_time[season][episode].
# NOTE: pickle.load runs arbitrary code from the file; trusted input only.
with open('friends-time/friends-time.pkl', 'rb') as pickle_file:
    friends_time = pickle.load(pickle_file)

## Extract frames, audio, and chat transcripts for each dialogue

In [55]:
import os
import av
import numpy as np
import cv2
from tqdm.notebook import tqdm
import time
import csv
from glob import glob
import numpy as np
import shutil

# Cut-offs applied when matching detected faces to the speaker:
# 'face' is compared against the detection probability and 'angle' against
# the angular distance (radians) between embeddings.
THRESHOLDS = {'face': 0.8, 'angle': 1.15}

# Reference embeddings keyed by the name of the directory that contains the
# .npy file. If a directory holds several files, the last one globbed wins.
predefined_faces = {
    face_file.rsplit('/', 2)[-2]: np.load(face_file)
    for face_file in glob('./cltl-face-all/your-faces/*/*.npy')
}

def calc_angle_distance(emb1, emb2):
    """Return the angle, in radians, between two embeddings.

    The product ``emb1 @ emb2.T`` is treated as a cosine; it is clamped to
    [-1, 1] before ``arccos`` so rounding noise cannot produce NaN.
    """
    cosine = emb1 @ emb2.T
    cosine = np.clip(cosine, -1, 1)
    return np.arccos(cosine)

def get_dias(list_of_diautts):
    """Return the sorted, de-duplicated dialogue ids of the given utterance ids.

    The dialogue id is the part of each id before the first underscore,
    e.g. ``'dia3_utt7'`` -> ``'dia3'``. Sorting is lexicographic.
    """
    dialogue_ids = {diautt.split('_')[0] for diautt in list_of_diautts}
    return sorted(dialogue_ids)


In [None]:
for DATASET in tqdm(['train', 'dev', 'test']):
    dataset = datasets[DATASET]
    diautts_ = list(dataset.keys())
    
    dias = get_dias(diautts_)
    diautts_ = {dia: [diautt for diautt in diautts_ if dia in diautt] for dia in dias}

    for dia, diautts in tqdm(diautts_.items()):
        shutil.rmtree(os.path.join('data', dia), ignore_errors=True)
        os.makedirs(os.path.join('data', dia, 'image'), exist_ok=True)
        os.makedirs(os.path.join('data', dia, 'text'), exist_ok=True)
        os.makedirs(os.path.join('data', dia, 'audio'), exist_ok=True)

        # annots = {diautt: datasets['train'][diautt] for diautt in diautts}
        # vids_path = {diautt: os.path.join(VIDEOS_PATH, diautt) +  '.mp4' for diautt in diautts}
        # features_path = {diautt: os.path.join(VISUAL_FEATURES_PATH, diautt) +  '.npy' for diautt in diautts}
        # features = {diautt: np.load(path, allow_pickle=True).item() for diautt, path in features_path.items()}

        image_grmc = []
        chat = []
        for diautt in tqdm(diautts):
            annot = datasets[DATASET][diautt] 
            vis = visual_features[diautt]
            vidpath = os.path.join(VIDEOS_PATH, diautt) + '.mp4'

            season = annot['Season']
            episode = annot['Episode']
            emotion = annot['Emotion']
            sentiment = annot['Sentiment']
            starttime = annot['StartTime']
            endttime = annot['EndTime']
            utterance = annot['Utterance']
            speaker = annot['Speaker']

            hours, minutes, seconds = starttime.split(':')
            seconds, milliseconds = seconds.split(',')

            hours, minutes, seconds, milliseconds = int(hours), int(minutes), int(seconds), int(milliseconds)

            time_datetime = friends_time[season][episode] + timedelta(hours=hours, minutes=minutes, seconds=seconds)
            time_unix = time.mktime(time_datetime.timetuple())
            time_unix_ms = int(time_unix*1000 + milliseconds)

            chat.append([speaker, utterance, time_unix_ms])

            aud = diautt + '.wav'
            !ffmpeg -i $vidpath -q:a 0 -map a data/$dia/audio/$aud

            container = av.open(vidpath)
            fps = float(container.streams.video[0].average_rate)
            for frame in container.decode(video=0):
                idx = frame.index
                numpy_RGB = np.array(frame.to_image())
                numpy_BGR = cv2.cvtColor(numpy_RGB, cv2.COLOR_RGB2BGR)
                impath = os.path.join('data', dia, 'image', diautt + f'_frame{str(idx).zfill(5)}.jpg')
                cv2.imwrite(impath, numpy_BGR)

                features = vis[idx]
                # Assume that there is only one unique face per frame.
                for feat in features:
                    to_append = {}

                    age = float(feat['age'])
                    gender = float(feat['gender'])
                    bbox = feat['bbox']

                    bbox, faceprob = [int(round(bb)) for bb in bbox[:4]], float(bbox[-1])

                    embedding = feat['embedding']
                    landmark = feat['landmark']

                    if bbox[-1] < THRESHOLDS['face']:
                        continue

                    if speaker not in list(predefined_faces.keys()):
                        continue

                    embedding.reshape(1, 512)

                    dists = {key: calc_angle_distance(embedding, val) for key, val \
                                in predefined_faces.items()}

                    if dists[speaker] < THRESHOLDS['angle']:
                        to_append['files'] = impath
                        to_append['bbox'] = bbox
                        to_append['faceprob'] = round(faceprob, 3)
                        to_append['speaker'] = speaker
                        to_append['age'] = round(age, 3)
                        to_append['gender'] = round(gender, 3)

                        image_grmc.append(to_append)

        with open(os.path.join('data', dia, 'text', 'chat.csv'), 'w') as stream:
            stream.write('speaker,utterance,time\n')

            for line in chat:
                speaker, utterance, time_unix_ms = line
                stream.write(speaker)
                stream.write(',')
                stream.write(f"\"{utterance}\"")
                stream.write(',')
                stream.write(str(time_unix_ms))
                stream.write('\n')    

        with open(os.path.join('data', dia, 'image.json'), 'w') as stream:
            json.dump(image_grmc, stream)