## Download the required code, annotations, and data

In [None]:
# Fetch everything this notebook needs into ./DEBUG.
# The file 'DEBUG/all-done' is created by the last step of the download, so
# re-running this cell skips all downloads when that file already exists.
# NOTE(review): `!` shell commands presumably do not abort the cell when they
# fail, so 'all-done' may be created after a partial download - confirm.
import os
os.makedirs('DEBUG', exist_ok=True)

if not os.path.isfile('DEBUG/all-done'):
    # Work inside DEBUG so that every download lands there.
    %cd DEBUG
    # Clone the cltl-face-all repository and fetch the datasets.json
    # annotation file from its examples directory.
    !git clone https://github.com/leolani/cltl-face-all
    !wget https://raw.githubusercontent.com/leolani/cltl-face-all/master/examples/smaller-datasets-jsons/datasets.json

    # Google Drive download by file id; the following lines expect it to
    # produce visual-features-smaller-dataset.zip, which is unpacked and removed.
    !gdown --id 1rsLbfgQYztDtrPFqEmkh-2d_0ap1qd_s
    !unzip visual-features-smaller-dataset.zip
    !rm visual-features-smaller-dataset.zip

    # Second Google Drive download; expected to produce smaller-dataset.zip.
    !gdown --id 16ck7plW9v9eSHGCs5wuB2AhhufPRt3Wi
    !unzip smaller-dataset.zip
    !rm smaller-dataset.zip
    # Mark the download as finished and return to the notebook directory.
    !touch all-done
    %cd ..

## Read pre-computed visual features and annotations

In [None]:
import json
import numpy as np
import os
import json
import av
import cv2
import random
from glob import glob
import pickle
from datetime import datetime, timedelta

# Directories (relative to the notebook) holding the pre-computed visual
# features (*.npy) and the video clips read later in the notebook.
VISUAL_FEATURES_PATH = 'DEBUG/visual-features-smaller-dataset/'
VIDEOS_PATH = "DEBUG/smaller-dataset/"

# Annotation file; only its 'large' entry is kept.
with open('DEBUG/datasets.json', 'r') as stream:
    datasets = json.load(stream)

datasets = datasets['large']

# Map "<feature file name without the .npy extension>" -> the Python object
# stored in that file. The glob is built from VISUAL_FEATURES_PATH so that
# changing the constant actually changes where features are read from.
# NOTE(review): allow_pickle=True unpickles the file contents; only load
# feature files from a trusted source.
visual_features = {
    os.path.splitext(os.path.basename(vf))[0]: np.load(vf, allow_pickle=True).item()
    for vf in glob(os.path.join(VISUAL_FEATURES_PATH, '*.npy'))
}

# Indexed later as friends_time[season][episode] and added to a timedelta.
# NOTE(review): pickle.load can execute arbitrary code; this file must be
# trusted.
with open('friends-time/friends-time.pkl', 'rb') as stream:
    friends_time = pickle.load(stream)

## Run on the images

In [None]:
import os
import av
import numpy as np
import cv2
from tqdm.notebook import tqdm
import time
import csv
from glob import glob
import numpy as np
import shutil
import uuid

# Cut-offs applied when labelling faces further down in the notebook:
#   'face'  - detections whose face probability is below this are skipped.
#   'angle' - a face is attributed to the speaker only when its angle
#             distance to the speaker's reference embedding is below this.
THRESHOLDS = {'face': 0.8, 'angle': 1.15}

# Reference embeddings keyed by the name of the directory that contains the
# .npy file (your-faces/<name>/<file>.npy). The name is taken with os.path
# helpers so it does not depend on '/' being the path separator.
# If a directory holds several .npy files, the last one returned by glob wins.
predefined_faces = {}
for path in glob(os.path.join('DEBUG', 'cltl-face-all', 'your-faces', '*', '*.npy')):
    name = os.path.basename(os.path.dirname(path))
    predefined_faces[name] = np.load(path)

def calc_angle_distance(emb1, emb2):
    """Return the angle (in radians) between two embeddings.

    The product ``emb1 @ emb2.T`` is clamped to [-1, 1] and passed through
    ``arccos``.
    NOTE(review): treating that product as a cosine presumably relies on the
    embeddings being L2-normalised - confirm with the feature extractor.
    """
    similarity = emb1 @ emb2.T
    # Clamp so that values slightly outside [-1, 1] do not yield NaN.
    bounded = np.clip(similarity, -1, 1)
    return np.arccos(bounded)

def get_dias(list_of_diautts):
    """Return the sorted, de-duplicated dialogue ids of the given utterance ids.

    The dialogue id is the part of each string before its first underscore.
    """
    unique_dias = {name.split('_')[0] for name in list_of_diautts}
    return sorted(unique_dias)

def get_time_unix_ms(time_string, base_time=None):
    """Convert an ``HH:MM:SS,mmm`` offset into a unix timestamp in milliseconds.

    Args:
        time_string: offset formatted as hours, minutes and seconds separated
            by ``:``, with the milliseconds after a ``,``.
        base_time: ``datetime`` the offset is added to. When omitted, the
            function falls back to ``friends_time[season][episode]``, reading
            ``friends_time``, ``season`` and ``episode`` from the notebook's
            global namespace, so those must be set before the call.

    Returns:
        The resulting point in time as an ``int`` number of milliseconds.
    """
    if base_time is None:
        # Original behaviour: rely on the globals set by the processing loop.
        base_time = friends_time[season][episode]

    hours, minutes, seconds = time_string.split(':')
    seconds, milliseconds = seconds.split(',')
    offset = timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))

    # time.mktime interprets the time tuple as local time and has one-second
    # resolution, so the milliseconds are added separately afterwards.
    time_unix = time.mktime((base_time + offset).timetuple())
    return int(time_unix * 1000 + int(milliseconds))

In [None]:
for DATASET in tqdm(['train', 'dev', 'test']):
    dataset = datasets[DATASET]
    diautts_ = list(dataset.keys())
    
    dias = get_dias(diautts_)
    diautts_ = {dia: [diautt for diautt in diautts_ if dia in diautt] for dia in dias}

    for dia, diautts in tqdm(diautts_.items()):
        shutil.rmtree(os.path.join('DEBUG/data', dia), ignore_errors=True)
        os.makedirs(os.path.join('DEBUG/data', dia, 'image'), exist_ok=True)
        os.makedirs(os.path.join('DEBUG/data', dia, 'text'), exist_ok=True)
        os.makedirs(os.path.join('DEBUG/data', dia, 'audio'), exist_ok=True)

        image_gmrc = []
        chat = []
        for diautt in tqdm(diautts):
            annot = datasets[DATASET][diautt] 
            vis = visual_features[diautt]
            vidpath = os.path.join(VIDEOS_PATH, diautt) + '.mp4'

            season = annot['Season']
            episode = annot['Episode']
            emotion = annot['Emotion']
            sentiment = annot['Sentiment']
            utterance = annot['Utterance']
            speaker = annot['Speaker']

            starttime = annot['StartTime']
            endtime = annot['EndTime']

            time_unix_ms_start = get_time_unix_ms(starttime)
            time_unix_ms_end = get_time_unix_ms(endtime)
            chat.append([speaker, utterance, time_unix_ms_start])

            aud = diautt + '.wav'
            !ffmpeg -i $vidpath -q:a 0 -map a DEBUG/data/$dia/audio/$aud

            container = av.open(vidpath)
            fps = float(container.streams.video[0].average_rate)
            spf = 1/fps 
            mspf = round(spf * 1000)
            for i, frame in enumerate(container.decode(video=0)):
                idx = frame.index
                numpy_RGB = np.array(frame.to_image())
                numpy_BGR = cv2.cvtColor(numpy_RGB, cv2.COLOR_RGB2BGR)
                img_time = idx*mspf + time_unix_ms_start
                impath = os.path.join('DEBUG/data', 
                                       dia, 
                                       'image', 
                                       diautt + f'_frame{str(idx).zfill(5)}_{str(img_time)}.jpg')
                cv2.imwrite(impath, numpy_BGR)

                features = vis[idx]
                # Assume that there is only one unique face per frame.
                for feat in features:

                    age = round(float(feat['age']), 3)
                    gender = round(float(feat['gender']), 3)
                    bbox = feat['bbox']
                    bbox, faceprob = [int(round(bb)) for bb in bbox[:4]], float(bbox[-1])
                    faceprob = round(faceprob, 3)
                    embedding = feat['embedding']
                    landmark = feat['landmark']

                    if faceprob < THRESHOLDS['face']:
                        continue

                    if speaker not in list(predefined_faces.keys()):
                        continue

                    embedding.reshape(1, 512)

                    dists = {key: calc_angle_distance(embedding, val) for key, val \
                                in predefined_faces.items()}

                    if dists[speaker] < THRESHOLDS['angle']:
                        to_append = {}
                        to_append['array'] = None
                        to_append['bounds'] = [0, 0, numpy_BGR.shape[1], numpy_BGR.shape[0]]
                        to_append['files'] = [os.path.join('image', os.path.basename(impath))]
                        container_id = str(uuid.uuid4())
                        to_append['id'] = container_id
                        annotations = [
                            {
                                'source': 'machine',
                                'timestamp': round(time.time()*1000),
                                'type': 'person',
                                'value': 
                                    {'name': speaker,
                                     'age': age,
                                     'gender': gender,
                                     'faceprob': faceprob}
                            }
                        ]
                        mention_id = str(uuid.uuid4())
                        segment = [
                            {
                                'bounds': bbox,
                                'container_id': container_id,
                                'type': 'MultiIndex'
                                
                            }
                        ]
                        to_append['mentions'] = [
                            {
                                'annotations': annotations,
                                'id': mention_id,
                                'segment': segment
                            }
                        ]
                        to_append['modality'] = 'image'
                        to_append['ruler'] = {
                            'bounds': [0, 0, numpy_BGR.shape[1], numpy_BGR.shape[0]],
                            'container_id': container_id,
                            'type': 'MultiIndex'
                        }
                        to_append['time'] = {
                            'container_id': container_id,
                            'start': time_unix_ms_start,
                            'end': time_unix_ms_end,
                            'type': 'TemporalRuler',
                        }
                        to_append['type'] = 'ImageSignal'

                        image_gmrc.append(to_append)

        with open(os.path.join('DEBUG/data', dia, 'text', f'{dia}.csv'), 'w') as stream:
            stream.write('speaker,utterance,time,emotion\n')

            for line in chat:
                speaker, utterance, time_unix_ms_start = line
                stream.write(speaker)
                stream.write(',')
                stream.write(f"\"{utterance}\"")
                stream.write(',')
                stream.write(str(time_unix_ms_start))
                stream.write(',')
                stream.write(emotion)
                stream.write('\n')    

        with open(os.path.join('DEBUG/data', dia, 'image.json'), 'w') as stream:
            json.dump(image_gmrc, stream)