# Clustering and Visualization of Phoebe dataset

## Components
* Create a dataset in which every row is linked to the image (video frame) it was generated from
    * The images need to be extracted from the videos, for example:
    `ffmpeg -ss 00:23:00 -i video.mp4 -frames:v 1 out_time.jpg`


In [3]:
DATASET_FILE = "Phoebe_dataset3.zip"
!unzip $DATASET_FILE;

Archive:  Phoebe_dataset3.zip
replace Phoebe_dataset/single_person_au_intensity/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


---
## Load datasets
### Helper functions

In [36]:
import numpy as np
import os, os.path
from typing import List, Dict
import csv
import uuid
import re
import time
import datetime

# Directory and sub-directory names of the extracted dataset. All paths are
# relative, so they resolve against the current working directory.
PATH_DATASET = "Phoebe_dataset"
PATH_AU_INTENSITY = "single_person_au_intensity"
PATH_AU_PRESENCE = "single_person_au_presence"
PATH_TRAIN = "train"
PATH_TEST = "test"

# Locations derived from the dataset root.
PATH_TRAINING_VIDEOS = f"{PATH_DATASET}/videos/train"
PATH_PROCESSED_IMAGES = f"{PATH_DATASET}/processed_images"

# Maps an image name (as returned by extract_image_for_timestamp) to the
# absolute path of the corresponding .jpg file.
IMAGE_STORE: Dict[str, str] = {}

def extract_image_for_timestamp(orig_filename: str,
                                timestamp_sec: float) -> str:
    """Extract the video frame at a timestamp (if needed) and register it.

    The frame is read from ``{PATH_TRAINING_VIDEOS}/{orig_filename}.mp4`` and
    written to ``{PATH_PROCESSED_IMAGES}/{orig_filename}_{timestamp}.jpg``,
    where ``timestamp`` is ``str(datetime.timedelta(seconds=timestamp_sec))``.
    ffmpeg is only invoked when that image file does not exist yet. The image
    is added to ``IMAGE_STORE`` whether or not ffmpeg succeeded.

    Args:
        orig_filename: Name of the video file without its ``.mp4`` extension.
        timestamp_sec: Offset into the video in seconds. Anything accepted by
            ``float()`` works, including numeric strings read from a CSV.

    Returns:
        The image name, i.e. the key under which the absolute image path was
        stored in ``IMAGE_STORE``.

    Raises:
        FileNotFoundError: If an image has to be generated and the ``ffmpeg``
            executable cannot be found.
    """
    # Imported locally so the function does not depend on edits to the
    # notebook's import cell.
    import subprocess

    timestamp = str(datetime.timedelta(seconds=float(timestamp_sec)))
    img_name = f"{orig_filename}_{timestamp}"
    img_filepath = os.path.abspath(f"{PATH_PROCESSED_IMAGES}/{img_name}.jpg")
    video_filepath = f"{PATH_TRAINING_VIDEOS}/{orig_filename}.mp4"

    # Generate image if it does not exist
    if not os.path.exists(img_filepath):
        # ffmpeg does not create missing output directories.
        os.makedirs(os.path.dirname(img_filepath), exist_ok=True)
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through unchanged.
        command = ["ffmpeg", "-ss", timestamp, "-i", video_filepath,
                   "-frames:v", "1", img_filepath]
        print(" ".join(command))
        # check=False keeps the previous best-effort behavior: a failed
        # extraction does not abort the surrounding dataset load.
        # stdin is detached so ffmpeg cannot wait for keyboard input.
        subprocess.run(command, check=False, stdin=subprocess.DEVNULL)

    # Add to image store
    IMAGE_STORE[img_name] = img_filepath

    return img_name

    
def load_dataset(filename: str):
    """Load one CSV file and link every data row to its extracted video frame.

    The first CSV column is dropped from every row. In its place the header
    row gets the label ``"filename"`` and each data row gets the image name
    returned by ``extract_image_for_timestamp``.

    Args:
        filename: Path of the CSV file. The digits in its base name (without
            extension) are used as the video name when extracting frames.

    Returns:
        A numpy array built from the list of rows (header row first). The
        first two rows are also printed.
    """
    # Derive the video name from the file's base name only, so digits in a
    # parent directory name cannot leak into it. This is loop-invariant.
    basename_without_ext = os.path.splitext(os.path.basename(filename))[0]
    video_name: str = re.sub(r"\D", "", basename_without_ext)

    # Load csv with python library so that first column may be removed.
    raw_data = []
    # newline="" is what the csv module expects for files passed to a reader.
    with open(filename, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)

        for index, row in enumerate(reader):
            new_row = row[1:]
            if index == 0:
                new_row.insert(0, "filename")
            else:
                # Create image for every row and add to dictionary {id: path}.
                # new_row[1] is read before the id is inserted, i.e. it is the
                # third column of the original CSV row.
                row_id = extract_image_for_timestamp(video_name, timestamp_sec=new_row[1])
                new_row.insert(0, row_id)
            raw_data.append(new_row)

    data = np.array(raw_data)
    print(data[0:2])
    return data

### Run dataset pipeline

In [37]:
%%capture
# Collect the AU-intensity training CSV files. os.listdir returns entries in
# arbitrary order, so the names are sorted to make training_files[0] (and
# therefore dataset1) reproducible across runs and machines.
train_dir = f"./{PATH_DATASET}/{PATH_AU_INTENSITY}/{PATH_TRAIN}"
training_files: List[str] = [f"{train_dir}/{name}" for
                             name in sorted(os.listdir(train_dir)) if
                             name.endswith(".csv")]
if not training_files:
    raise FileNotFoundError(f"No .csv files found in {train_dir}")

image_dict = {}
dataset1 = load_dataset(training_files[0])
# print(dataset1)

# Load datasets
full_dataset = []

In [38]:
# Show the image-name -> path mapping, then the loaded dataset, one per line.
print(IMAGE_STORE, dataset1, sep="\n")

{'6_0:00:00': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.jpg', '6_0:00:00.040000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.040000.jpg', '6_0:00:00.080000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.080000.jpg', '6_0:00:00.120000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.120000.jpg', '6_0:00:00.160000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.160000.jpg', '6_0:00:00.199000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.199000.jpg', '6_0:00:00.239000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.239000.jpg', '6_0:00:00.279000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed_images/6_0:00:00.279000.jpg', '6_0:00:00.319000': '/Users/leo/workspace/cmpt419/CMPT419-A2/Phoebe_dataset/processed