# Clustering and Visualization of Phoebe dataset

## Components
* Create dataset with every row linked to the image it was generated from
    * Will need to extract images from videos like this:
    `ffmpeg -ss 00:23:00 -i video.mp4 -frames:v 1 out_time.jpg`


In [1]:
from importlib import reload  # Python 3 only: in Python 2 `reload` is a builtin and needs no import
import logging
# Re-execute the logging module before configuring it.
# NOTE(review): presumably this drops handlers already installed in the kernel
# so that basicConfig() below takes effect - confirm.
reload(logging)
# Root logger: DEBUG and above, "<time> <level>:<message>", time shown as 12-hour HH:MM:SS.
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

import os
DATASET_FILE = "Phoebe_dataset4.zip"
full_path = os.path.abspath(DATASET_FILE)
if not os.path.exists(full_path):
    !unzip $DATASET_FILE;

---
## Load datasets
### Helper functions

In [2]:
import numpy as np
# import os, os.path
from typing import List, Dict, Union, Any
import csv
import uuid
import re
import time
import datetime
import logging

# Dataset layout. All paths are relative, so they resolve against the
# notebook's working directory.
PATH_DATASET = "Phoebe_dataset4"
PATH_AU_INTENSITY = "single_person_au_intensity"
PATH_AU_PRESENCE = "single_person_au_presence"
PATH_TRAIN = "train"
PATH_TEST = "test"
PATH_TRAINING_VIDEOS = f"{PATH_DATASET}/videos/train"
PATH_PROCESSED_IMAGES = f"{PATH_DATASET}/processed_images"
PATH_TRAINING_CSV = "./" + PATH_DATASET + "/" + PATH_AU_INTENSITY + "/" + PATH_TRAIN + "/"

# File extension of the source videos (without the leading dot).
EXT_MP4 = "mp4"

# NOTE(review): not used by the visible cells; presumably meant to map row ids
# to extracted image paths - confirm.
IMAGE_STORE = dict()

# Root logger, shared by all cells, at DEBUG verbosity.
log = logging.getLogger()
log.setLevel(logging.DEBUG)

# Make sure the output directory for extracted frames exists.
# os.mkdir does not create parents, so the dataset directory must already exist.
if not os.path.isdir(PATH_PROCESSED_IMAGES):
    os.mkdir(PATH_PROCESSED_IMAGES)

In [3]:
def load_dataset(filename: str) -> List[List[str]]:
    """Load the rows of a single csv file and tag each row with its video id.

    The first non-blank row is treated as the header: its cells are stripped
    of surrounding whitespace, its first column is renamed to ``"frame"`` and
    a ``"filename"`` column is prepended. Every data row gets the video id
    prepended, where the video id is the last run of digits found in
    ``filename`` (e.g. ``".../train/43.csv"`` -> ``"43"``). Together with the
    frame column this is what is needed to locate the matching video frame.

    Blank lines and data rows whose first field is empty are skipped: they
    carry no frame number and would otherwise be misaligned with the header.

    :param filename: path of the csv file to read.

    :return: list of rows (each a list of strings); the first row is the header.

    :raises IndexError: if the file has data rows but ``filename`` contains no digits.
    """
    print(filename)

    # The video id only depends on the path, so compute it once rather than
    # once per row. Indexing is deferred so that a header-only file still
    # loads even when the path contains no digits.
    digit_runs = re.findall(r"[0-9]+", filename)

    raw_data: List[List[str]] = []
    header_seen = False
    # newline="" lets the csv module handle line endings itself.
    with open(filename, "r", newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines.
                continue

            if not header_seen:
                header_seen = True
                row = [x.strip() for x in row]
                # Rename the first column and add a column recording which
                # file (video) each row came from.
                row[0] = "frame"
                row.insert(0, "filename")
            elif row[0] == "":
                # No frame number: the row cannot be linked to an image.
                continue
            else:
                row.insert(0, digit_runs[-1])

            raw_data.append(row)

    return raw_data

def create_full_dataset(training_files) -> List[Union[str, float]]:
    """Concatenate the datasets of several csv files into one list of rows.

    Each file is read with ``load_dataset``. The first non-empty result is
    kept whole (header row included); from every later file the first row is
    dropped so that the header appears only once.

    :param training_files: list of csv file paths.

    :return: combined rows of all files.
    """
    assert isinstance(training_files, list)

    combined = []
    for csv_path in training_files:
        rows = load_dataset(csv_path)
        if combined:
            # Skip the header of every file after the first.
            combined.extend(rows[1:])
        else:
            combined = rows
    return combined

### Get image
If image has not been extracted for given id, extract it from corresponding video. This is made easy because the matrix stores the video name (without the extension) and the frame number.

Pass the entire row of data to the `show_image` function; it reads the video name and frame number from the row and calls `extract_image_for_frame` when the image has not been extracted yet.

In [4]:
from IPython.display import display, Image
import datetime

# %%capture
def extract_image_for_frame(image_path: str, video_name: Union[int, str], frame: Union[int, str]):
    """Extract one frame of a training video and save it as ``{image_path}.jpg``.

    The frame number is converted to a timestamp assuming 30 frames per
    second, and ffmpeg is asked to write the single frame found at that time.
    An existing output file is never overwritten (ffmpeg ``-n``).

    :param image_path: destination path of the image, without the ``.jpg`` extension.
    :param video_name: name of the video inside ``PATH_TRAINING_VIDEOS``, without its extension.
    :param frame: frame number within the video (an int or a numeric string).

    :return: None. The image file is produced as a side effect; failures of
        ffmpeg are logged, not raised.
    """
    # Local import: subprocess is not among the notebook's top-level imports.
    import subprocess

    video_filepath = PATH_TRAINING_VIDEOS + f"/{video_name}.{EXT_MP4}"

    # Data was sampled at about 30fps. Get time at which it is in the video
    time_in_seconds = float(frame) / 30
    time_datetime = datetime.timedelta(seconds=time_in_seconds)

    log.debug(f"Extracting frame at time {time_datetime} from path {video_filepath}")

    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through untouched. "-n" is a global option and
    # must precede the output file, otherwise ffmpeg treats it as a trailing
    # option. "-ss" before "-i" seeks in the input instead of decoding
    # everything up to the timestamp.
    command = [
        "ffmpeg",
        "-n",
        "-ss", str(time_datetime),
        "-i", video_filepath,
        "-frames:v", "1",
        f"{image_path}.jpg",
    ]
    log.debug(f"Running command: {' '.join(command)}")

    print("### ")
    try:
        result = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except FileNotFoundError:
        log.error("ffmpeg executable not found; cannot extract frame")
        return

    # Echo ffmpeg's output into the cell, as the shell magic used to do.
    print(result.stdout)
    if result.returncode != 0:
        log.warning(f"ffmpeg exited with code {result.returncode}")

    
def show_image(row: List[Any]):
    """Display the video frame a dataset row was computed from.

    The image is looked up in ``PATH_PROCESSED_IMAGES`` under the name
    ``<video id, zero-padded to 3><frame, zero-padded to 3>.jpg`` and is
    extracted from the video first when it is not there yet.

    :param row: a data row (not the header) whose first field is the video id
        and whose second field is the frame number.
    """
    log.debug(f"### Getting image for row: {row}")

    # Column 0 holds the video id, column 1 the frame number.
    video_name = int(row[0])
    frame = int(row[1])
    image_name = f"{video_name:03}{frame:03}"
    image_path = os.path.join(PATH_PROCESSED_IMAGES, image_name)
    image_file = f"{image_path}.jpg"

    # Check for the actual file (with extension) so that frames are only
    # extracted once.
    if not os.path.exists(image_file):
        extract_image_for_frame(image_path, video_name, frame)

    try:
        image = Image(filename=image_file)
        display(image)
    except IOError as e:
        # Extraction may have failed; report it instead of aborting the cell.
        print(e)

### Run dataset preparation pipeline

In [5]:
# %%capture
# Full path of every csv file in the training directory, in os.listdir order.
training_files: List[str] = [
    PATH_TRAINING_CSV + name
    for name in os.listdir(PATH_TRAINING_CSV)
    if name.endswith(".csv")
]

print(training_files[:2])
# Load datasets (only the fifth file of the listing is loaded here).
full_dataset = create_full_dataset(training_files[4:5])

['./Phoebe_dataset4/single_person_au_intensity/train/6.csv', './Phoebe_dataset4/single_person_au_intensity/train/40.csv']
./Phoebe_dataset4/single_person_au_intensity/train/43.csv


In [6]:
# Number of rows (header row included), then the first five rows.
print(len(full_dataset))
print(full_dataset[:5])

34
[['filename', 'frame', 'face_id', 'confidence', 'success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r'], ['43', '0', '0', '0.98', '1', '0.09', '0.0', '0.65', '0.07', '1.61', '1.69', '0.0', '1.19', '1.6', '0.51', '0.0', '1.9', '0.0', '0.0', '0.59', '0.56', '0.0'], ['43', '1', '0', '0.98', '1', '0.07', '0.21', '0.52', '0.02', '1.09', '1.71', '0.0', '0.71', '1.35', '0.23', '0.0', '1.4', '0.0', '0.0', '0.38', '0.2', '0.0'], ['43', '2', '0', '0.98', '1', '0.07', '0.28', '0.51', '0.0', '0.89', '1.52', '0.0', '0.58', '1.23', '0.11', '0.0', '1.01', '0.0', '0.0', '0.21', '0.02', '0.11'], ['43', '3', '0', '0.98', '1', '0.07', '0.28', '0.41', '0.0', '0.84', '1.43', '0.01', '0.62', '1.2', '0.08', '0.0', '1.0', '0.0', '0.0', '0.47', '0.41', '0.22']]


In [None]:
# Images in Image datastore are "fileid_frame"
# Row 0 is the header row; row 1 is the first data row, so that is the one
# whose image is displayed.
show_image(full_dataset[1])
print(full_dataset[0])
print(full_dataset[1])

---
## Preprocess and cleanup data
* Remove rows with confidence < 0.80 (confidence is stored as a fraction, e.g. 0.98)
* Remove rows with empty value(s)

### Compare ML algorithms on
* Data scaled between 0 and 1 versus unscaled data

In [None]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
# numpy is imported as `np` in this notebook. Row 0 of full_dataset is the
# header and every value was read from csv as a string, so drop the header and
# convert to float before scaling.
X_train = np.array(full_dataset[1:], dtype=float)

# Scale every column independently to the range [0, 1].
X_train_minmax = min_max_scaler.fit_transform(X_train)

---
## Gaussian Mixture Model

In [177]:
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns; sns.set()

from sklearn.mixture import GMM



ImportError: cannot import name 'GMM' from 'sklearn.mixture' (/Users/leo/workspace/cmpt419/CMPT419-A2/venv/lib/python3.7/site-packages/sklearn/mixture/__init__.py)