# This notebook shows statistical results from the dataset

In [1]:
import os, sys, pandas as pd, numpy as np

# Import library with current code functions
sys.path.append(os.path.join("..", "lib"))
import general_functions as gf, files_paths as fp

In [2]:
def index_frames_generate(VD_INFO, VD_FEATURE):
    """Summarise which frame numbers of one video are present, as contiguous runs.

    The frame numbers ``0 .. max(frame_seq)`` are split into consecutive runs
    in which every frame is either listed in ``VD_FEATURE['frame_seq']``
    (``fr_exists == 1``) or absent from it (``fr_exists == 0``).

    Parameters
    ----------
    VD_INFO : pandas.DataFrame
        Must provide the columns ``time_step_fr`` and ``origin_vid``; only the
        first row of each is used.
    VD_FEATURE : pandas.DataFrame
        Must provide the columns ``video_id`` (only the first row is used) and
        ``frame_seq`` (integer frame numbers, in any order).

    Returns
    -------
    pandas.DataFrame
        One row per run, with the columns ``video_id``, ``origin_vid``,
        ``fr_exists``, ``frame_seq_init``, ``frame_seq_end``, ``frames_total``
        (number of frames in the run) and ``time_step_fr``.

    Raises
    ------
    ValueError
        If ``VD_FEATURE['frame_seq']`` has no rows.
    """
    # int64 (not int16) so frame numbers above 32767 cannot silently wrap around.
    present = np.asarray(VD_FEATURE['frame_seq'], dtype=np.int64)
    if present.size == 0:
        raise ValueError("VD_FEATURE['frame_seq'] is empty; there are no frames to index")

    # .iloc[0] reads the first row regardless of the index labels of the inputs.
    video_id = VD_FEATURE['video_id'].iloc[0]
    time_step_fr = VD_INFO['time_step_fr'].iloc[0]
    origin_vid = VD_INFO['origin_vid'].iloc[0]

    # One row per frame number from 0 up to the true maximum (the last element
    # is only the maximum when frame_seq happens to be sorted).
    frames = pd.DataFrame({'frame_seq': np.arange(present.max() + 1, dtype=np.int64)})
    frames['fr_exists'] = frames['frame_seq'].isin(present).astype(int)

    # A new run starts wherever fr_exists differs from the previous frame; the
    # first row compares against NaN and therefore always opens run 1.
    run_id = (frames['fr_exists'].diff() != 0).cumsum().rename('run_id')

    runs = (
        frames.groupby(run_id)
        .agg(
            fr_exists=('fr_exists', 'first'),
            frame_seq_init=('frame_seq', 'first'),
            frame_seq_end=('frame_seq', 'last'),
            frames_total=('frame_seq', 'size'),
        )
        .reset_index(drop=True)
    )

    # Identification columns go first, the time step goes last.
    runs.insert(0, 'origin_vid', origin_vid)
    runs.insert(0, 'video_id', video_id)
    runs['time_step_fr'] = time_step_fr

    return runs

## Getting the paths of all the VD_INFO files

In [3]:
# Ask the project helper for every fp.VD_INFO file located under the two dataset
# roots fp.DATASET_YT and fp.DATASET_LOCAL.
# NOTE(review): presumably a recursive search returning a list of full paths;
# confirm in general_functions.
FILE_LIST_VD_INFO = gf.find_files_in_all_subdirectories([fp.DATASET_YT, fp.DATASET_LOCAL], fp.VD_INFO)

## Creating the output DataFrames

In [4]:
# Every accumulator starts as its own, independent empty DataFrame:
#   FRAMES_INFO_DTFRAME     - frame runs built from the level-1 feature files
#   VIDEO_INDEX_DTFRAME     - rows of the VD_INFO files
#   FRAMES_L2_INFO_DTFRAME  - frame runs built from the level-2 feature files
#   VD_MEASURES_DTFRAME     - rows of the measure files
(
    FRAMES_INFO_DTFRAME,
    VIDEO_INDEX_DTFRAME,
    FRAMES_L2_INFO_DTFRAME,
    VD_MEASURES_DTFRAME,
) = (pd.DataFrame() for _ in range(4))

## Getting all the information for all files

In [5]:
def _read_csv_without_saved_index(csv_path):
    """Read a CSV and discard its 'Unnamed: 0' column when the file has one.

    NOTE(review): 'Unnamed: 0' is presumably the index written when the file
    was saved; confirm against the code that produces these CSVs.
    """
    return pd.read_csv(csv_path).drop(columns=['Unnamed: 0'], errors='ignore')


def _concat_parts(parts):
    """Stack the collected frames with a fresh 0..n-1 index (empty frame if none)."""
    return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()


# Collect the per-video pieces in lists and concatenate once after the loop;
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
frames_l1_parts = []
frames_l2_parts = []
video_info_parts = []
video_measure_parts = []

for current_path_location in FILE_LIST_VD_INFO:
    # The feature and measure files sit next to the VD_INFO file.
    path_dir = os.path.dirname(current_path_location)

    video_info_rest = _read_csv_without_saved_index(current_path_location)
    video_feature_rest = _read_csv_without_saved_index(os.path.join(path_dir, fp.VD_FEATURES_L1))
    video_feature_rest_2 = _read_csv_without_saved_index(os.path.join(path_dir, fp.VD_FEATURES_L2))
    video_measure_rest = _read_csv_without_saved_index(os.path.join(path_dir, fp.VD_MEASURE_L0))

    # Runs of present/missing frames for both feature levels of this video.
    frames_l1_parts.append(index_frames_generate(video_info_rest, video_feature_rest))
    frames_l2_parts.append(index_frames_generate(video_info_rest, video_feature_rest_2))
    video_info_parts.append(video_info_rest)
    video_measure_parts.append(video_measure_rest)

# All four outputs get a fresh index, so no frame keeps duplicated row labels.
FRAMES_INFO_DTFRAME = _concat_parts(frames_l1_parts)
FRAMES_L2_INFO_DTFRAME = _concat_parts(frames_l2_parts)
VIDEO_INDEX_DTFRAME = _concat_parts(video_info_parts)
VD_MEASURES_DTFRAME = _concat_parts(video_measure_parts)


## Calculating the total number of frames

In [6]:
# Sum the run lengths (present and missing runs alike) of every video.
# sort=False keeps the groups in order of first appearance instead of sorting
# them by key, so row i of frames_sum refers to the same video as row i of
# VIDEO_INDEX_DTFRAME, which the index-aligned subtraction below relies on.
# NOTE(review): this assumes each VD_INFO file contributes exactly one row and
# one distinct (origin_vid, video_id) pair, in the same order; confirm.
frames_sum = (
    FRAMES_INFO_DTFRAME
    .groupby(['origin_vid', 'video_id'], sort=False)['frames_total']
    .sum()
    .reset_index()
)

# Difference between the frame count in the video index and the indexed span.
VIDEO_INDEX_DTFRAME['dif'] = VIDEO_INDEX_DTFRAME['total_frames'] - frames_sum['frames_total']

## Creating the summaries

In [7]:
# Duration / sample-count table: one row per stage of the data, all cells NaN.
SUMMARY_DT = pd.DataFrame(
    np.nan,
    index=['Raw Data', 'Extracted Data', 'Qualified Data', 'Measured Data'],
    columns=['Duration (s)', 'Number of Samples (frames)'],
)

# Mean / min / max table: one row per video property, all cells NaN.
SUMMARY_FR = pd.DataFrame(
    np.nan,
    index=['FPS', 'Height', 'Width', 'Discontinuities', 'Duration (s)'],
    columns=['Mean', 'Min', 'Max'],
)

In [8]:
def _runs_with_duration(runs, condition):
    """Return the runs matching ``condition`` plus an appended ``duration`` column.

    ``duration`` is ``time_step_fr * frames_total`` of each selected run.
    """
    selected = runs.query(condition)
    return selected.assign(duration=selected['time_step_fr'] * selected['frames_total'])


# Raw data: totals taken from the video index.
SUMMARY_DT.loc['Raw Data', 'Duration (s)'] = VIDEO_INDEX_DTFRAME['duration_vid'].sum()
SUMMARY_DT.loc['Raw Data', 'Number of Samples (frames)'] = VIDEO_INDEX_DTFRAME['total_frames'].sum()

# Extracted data: frames in which the dlib framework detected landmarks.
# Qualified data: the same selection on the level-2 runs, i.e. after
# interpolation was applied.
# EXTRACTED_FRAMES is rebound on each pass and ends up holding the qualified runs.
for stage, stage_runs in (('Extracted Data', FRAMES_INFO_DTFRAME),
                          ('Qualified Data', FRAMES_L2_INFO_DTFRAME)):
    EXTRACTED_FRAMES = _runs_with_duration(stage_runs, "fr_exists == 1")
    SUMMARY_DT.loc[stage, 'Duration (s)'] = EXTRACTED_FRAMES['duration'].sum()
    SUMMARY_DT.loc[stage, 'Number of Samples (frames)'] = EXTRACTED_FRAMES['frames_total'].sum()

# Missing data: runs without detected landmarks, leaving out a missing run
# that starts at frame 0.
MISSING_FRAMES = _runs_with_duration(FRAMES_INFO_DTFRAME, "fr_exists == 0 and frame_seq_init != 0")

# Measured data repeats the Qualified data values for both columns.
SUMMARY_DT.loc['Measured Data', 'Duration (s)'] = SUMMARY_DT.loc['Qualified Data', 'Duration (s)']
SUMMARY_DT.loc['Measured Data', 'Number of Samples (frames)'] = SUMMARY_DT.loc['Qualified Data', 'Number of Samples (frames)']

In [9]:
# Values summarised in each row of SUMMARY_FR, cast to the type they are
# aggregated in.
summary_fr_sources = {
    'FPS': VIDEO_INDEX_DTFRAME['fps_vid'].astype(float),
    'Height': VIDEO_INDEX_DTFRAME['height_vid'].astype(int),
    'Width': VIDEO_INDEX_DTFRAME['width_vid'].astype(int),
    # Length, in frames, of each run of missing frames
    'Discontinuities': MISSING_FRAMES['frames_total'].astype(int),
    'Duration (s)': VIDEO_INDEX_DTFRAME['duration_vid'].astype(float),
}

# Write every cell with one .loc[row, column] call. The chained form
# `SUMMARY_FR.loc[row].Mean = value` assigns to a temporary row object and
# leaves SUMMARY_FR untouched when pandas Copy-on-Write is active.
for row_label, values in summary_fr_sources.items():
    SUMMARY_FR.loc[row_label, 'Mean'] = values.mean()
    SUMMARY_FR.loc[row_label, 'Min'] = values.min()
    SUMMARY_FR.loc[row_label, 'Max'] = values.max()

In [10]:
# Keep the columns from position 6 onwards.
# NOTE(review): presumably these are the measurement columns; confirm against
# the layout of the measure CSV files.
VD_MEASURES_DTFRAME_MEASURES = VD_MEASURES_DTFRAME.iloc[:, 6:]

# One row per measure: mean, min, max and the coefficient of variation
# (std / mean, in percent). 'std' is only needed to derive the CV and is
# dropped afterwards.
VD_MEASURES_DTFRAME_SUMARY = (
    VD_MEASURES_DTFRAME_MEASURES
    .describe()
    .T
    .loc[:, ['mean', 'std', 'min', 'max']]
    .assign(**{'CV (%)': lambda stats: (stats['std'] / stats['mean']) * 100})
    .drop(columns=['std'])
)

## Displaying the information summaries

### Duration and number of frames for each type of data

In [11]:
# Display the duration / sample-count table rounded to two decimals.
SUMMARY_DT.round(2)

Unnamed: 0,Duration (s),Number of Samples (frames)
Raw Data,2999.42,89907.0
Extracted Data,2465.56,73915.0
Qualified Data,2495.29,74806.0
Measured Data,2495.29,74806.0


### Some structural information about the videos

In [12]:
# Display the mean / min / max table rounded to two decimals.
SUMMARY_FR.round(2)

Unnamed: 0,Mean,Min,Max
FPS,29.97,29.97,30.0
Height,720.0,720.0,720.0
Width,1280.0,1280.0,1280.0
Discontinuities,10.73,1.0,285.0
Duration (s),142.83,3.36,298.54


### Here is some information about the measurements done in all the videos present in the Dataset

In [13]:
# Display the per-measure statistics (mean, min, max, CV) rounded to two decimals.
VD_MEASURES_DTFRAME_SUMARY.round(2)

Unnamed: 0,mean,min,max,CV (%)
m4,8.36,1.0,27.02,34.12
m5,14.21,2.0,40.61,29.06
m6,23.44,4.0,76.69,28.18
m7,24.95,4.0,79.62,27.66
m8,24.31,2.0,59.68,26.21
m9,54.1,9.0,161.81,25.87
m10,39.92,6.08,123.36,27.56
m11,35.7,6.5,96.0,25.19
m12,35.27,0.0,91.0,38.06
m13,34.27,0.0,94.0,37.91
