# Clustering and Visualization of Phoebe dataset

## Components
* Create a dataset in which every row is linked to the video frame (image) it was generated from.
    * Frames need to be extracted from the source videos at each row's timestamp, for example:
    `ffmpeg -ss 00:23:00 -i video.mp4 -frames:v 1 out_time.jpg`


In [3]:
DATASET_FILE = "Phoebe_dataset3.zip"
!unzip $DATASET_FILE;

Archive:  Phoebe_dataset3.zip
replace Phoebe_dataset/single_person_au_intensity/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


---
## Load datasets
### Helper functions

In [94]:
import numpy as np
import os, os.path
from typing import List, Dict
import csv
import uuid
import re
import time
import datetime

# Root folder of the extracted dataset and the names of its sub-folders.
PATH_DATASET = "Phoebe_dataset"
PATH_AU_INTENSITY = "single_person_au_intensity"
PATH_AU_PRESENCE = "single_person_au_presence"
PATH_TRAIN = "train"
PATH_TEST = "test"

# Locations derived from the names above.
PATH_TRAINING_VIDEOS = f"{PATH_DATASET}/videos/train"
PATH_PROCESSED_IMAGES = f"{PATH_DATASET}/processed_images"
PATH_TRAINING_CSV = "./" + "/".join((PATH_DATASET, PATH_AU_INTENSITY, PATH_TRAIN)) + "/"

# Maps an image name to the absolute path of the extracted frame.
IMAGE_STORE = dict()

def extract_image_for_timestamp(orig_filename: str,
                                timestamp_sec: "float | str") -> str:
    """Extract one video frame with ffmpeg and register it in IMAGE_STORE.

    The frame is written to
    ``PATH_PROCESSED_IMAGES/<orig_filename>_<H:MM:SS[.ffffff]>.jpg`` and is
    only generated when that file does not exist yet.

    Args:
        orig_filename: Base name (without extension) of the ``.mp4`` file
            inside PATH_TRAINING_VIDEOS.
        timestamp_sec: Offset into the video in seconds. Accepts a number or
            a numeric string; spaces inside the string are ignored because
            CSV fields may carry padding (e.g. ``" 0.04"``).

    Returns:
        The image name (``<orig_filename>_<timestamp>``), which is also the
        key under which the absolute image path is stored in IMAGE_STORE.

    Raises:
        ValueError: If ``timestamp_sec`` cannot be parsed as a float.
    """
    # Imported locally so this cell does not depend on another import cell.
    import subprocess

    seconds = float(str(timestamp_sec).replace(" ", ""))
    timestamp = str(datetime.timedelta(seconds=seconds))
    img_name = f"{orig_filename}_{timestamp}"
    img_filepath = os.path.abspath(f"{PATH_PROCESSED_IMAGES}/{img_name}.jpg")
    video_filepath = f"{PATH_TRAINING_VIDEOS}/{orig_filename}.mp4"

    # Generate image if it does not exist
    if not os.path.exists(img_filepath):
        # ffmpeg does not create missing output directories itself.
        os.makedirs(os.path.dirname(img_filepath), exist_ok=True)

        # Argument list (no shell): paths with spaces or shell
        # metacharacters are passed to ffmpeg unchanged.
        command = ["ffmpeg", "-ss", timestamp, "-i", video_filepath,
                   "-frames:v", "1", img_filepath]
        print(" ".join(command))
        try:
            result = subprocess.run(command,
                                    stdin=subprocess.DEVNULL,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    check=False)
        except OSError as err:
            # Best effort, as before: a missing ffmpeg binary must not abort
            # the whole dataset load.
            print(f"Could not run ffmpeg: {err}")
        else:
            if result.returncode != 0:
                print(result.stderr.decode(errors="replace"))

    # Add to image store (item assignment needs no ``global`` statement)
    IMAGE_STORE[img_name] = img_filepath

    return img_name

    
def load_dataset(filename: str):
    """Load one CSV file into a list of rows, tagging rows with an image id.

    The first column of every row is dropped. The first non-blank row is
    treated as the header and gets ``"filename"`` prepended. Every later row
    whose first remaining field is non-empty gets an image extracted for its
    timestamp (second remaining field) and the resulting image name
    prepended. Rows with an empty first field, or with fewer than two
    remaining fields, are kept unchanged. Blank lines are skipped.

    Args:
        filename: Path of the CSV file. The digits of its base name are used
            as the name of the matching video.

    Returns:
        A list of rows (lists of strings), header first.
    """
    # Use only the base name so digits in directory names cannot leak into
    # the video id; computed once because it is the same for every row.
    filename_without_ext: str = re.sub(r"\D", "", os.path.basename(filename))

    # Load csv with python library so that first column may be removed.
    raw_data = []
    header_seen = False
    with open(filename, "r", newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # Blank line: nothing to load.
                continue
            new_row = row[1:]
            if not header_seen:
                header_seen = True
                new_row.insert(0, "filename")
            elif len(new_row) > 1 and new_row[0] != "":
                # Create an image for every row and add to dictionary {id: path}
                row_id = extract_image_for_timestamp(filename_without_ext,
                                                     timestamp_sec=new_row[1])
                new_row.insert(0, row_id)
            raw_data.append(new_row)

    return raw_data

def create_full_dataset(training_files):
    """Concatenate the rows of several CSV files into one NumPy array.

    Each file is read with ``load_dataset``. The header row is kept from the
    first file that yields any rows and dropped from every later file.

    Args:
        training_files: Iterable of CSV file paths.

    Returns:
        A 2-D string array when all rows have the same length. Otherwise a
        1-D object array whose elements are the row lists.
    """
    full_dataset = []
    for file in training_files:
        dataset = load_dataset(file)
        # Keep the header only while nothing has been collected yet.
        full_dataset.extend(dataset if not full_dataset else dataset[1:])

    if len({len(row) for row in full_dataset}) <= 1:
        return np.array(full_dataset)

    # Rows of differing lengths: np.array() raises ValueError on ragged
    # input in NumPy >= 1.24, so fill the object array element by element.
    ragged = np.empty(len(full_dataset), dtype=object)
    for index, row in enumerate(full_dataset):
        ragged[index] = row
    return ragged

### Run dataset pipeline

In [102]:
%%capture
# Collect every training CSV. Sorted because os.listdir returns entries in
# arbitrary order, which would make the dataset row order irreproducible.
# os.path.join avoids a doubled "/" when the directory ends with a slash.
training_files: List[str] = sorted(
    os.path.join(PATH_TRAINING_CSV, name)
    for name in os.listdir(PATH_TRAINING_CSV)
    if name.endswith(".csv")
)

# Load datasets
full_dataset = create_full_dataset(training_files)

In [96]:
# print(IMAGE_STORE)
# Show the array shape. A 1-D shape such as (N,) means the rows are stored
# as Python lists inside an object array, not as a 2-D table.
print(full_dataset.shape)

(3569,)


In [99]:
# Preview the first 100 entries; the first one is the header row.
print(full_dataset[0:100])

[list(['filename', ' face_id', ' timestamp', ' confidence', ' success', ' AU01_r', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r'])
 list(['6_0:00:00', '0', '0.0', '0.98', '1', '0.0', '0.0', '0.0', '0.0', '1.14', '2.01', '0.0', '1.65', '1.83', '0.29', '0.94', '0.81', '0.0', '0.2', '0.0', '0.16', '0.0'])
 list(['6_0:00:00.040000', '0', '0.04', '0.93', '1', '0.0', '0.0', '0.0', '0.0', '0.98', '1.82', '0.0', '1.38', '1.67', '0.51', '0.45', '0.58', '0.0', '0.07', '0.13', '0.11', '0.0'])
 list(['6_0:00:00.080000', '0', '0.08', '0.98', '1', '0.02', '0.05', '0.0', '0.0', '0.94', '1.74', '0.0', '1.2', '1.56', '0.54', '0.27', '0.5', '0.0', '0.0', '0.25', '0.06', '0.06'])
 list(['6_0:00:00.120000', '0', '0.12', '0.98', '1', '0.02', '0.05', '0.0', '0.0', '0.99', '1.79', '0.0', '1.06', '1.5', '0.5', '0.27', '0.43', '0.0', '0.0', '0.34', '0.08', '0.06'])
 list(['6_0:00:00.