### Create Labelled Dataframe for Cholec80 Videos

1. Get all video files from input folder
2. Extract all frames from each of the 80 videos
3. Perform frame pre-processing for each extracted frame
4. Save finished frames to output folder

Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
import os, re
from tqdm import tqdm
from src.process_utils import extract_file_paths, import_tool_annotation_file, import_phase_annotation_file

Create labelled dataframe

In [2]:
#Dataset root folder and the sub-folders holding frames, tool annotations and phase annotations
#NOTE: `dir` shadows the Python builtin of the same name; the name is kept so cells relying on it keep working
dir = './cholec80'
frames = f'{dir}/frames'
tools = f'{dir}/tool_annotations'
phases = f'{dir}/phase_annotations'


#File paths of every '.txt' tool annotation file and every '.txt' phase annotation file
tool_annotation_list = extract_file_paths(tools, '.txt')
phase_annotation_list = extract_file_paths(phases, '.txt')



In [3]:
#Empty dataframe that will hold the concatenated annotations and image file paths of all videos.
#Columns: four descriptive columns followed by one "tool_<name>" column per surgical tool.
df = pd.DataFrame(
    columns=["image", "video_num", "phase", "frame"] + [
        f"tool_{tool_name}"
        for tool_name in (
            "Grasper", "Bipolar", "Hook", "Scissors",
            "Clipper", "Irrigator", "SpecimenBag",
        )
    ]
)

In [4]:
#Loop through all 80 videos and collect one annotated dataframe per video.
#The per-video frames are gathered in a list and concatenated once at the end:
#DataFrame.append was removed in pandas 2.0 and re-copies the growing frame on every iteration.
video_frames = []
for i in tqdm(range(1, 81)):

    #Assign 2 digit video number as current video id
    video_num = f"video{i:02d}"

    #Extract all frame file paths found in the folder of the current video
    current_vid = frames + '/' + video_num
    img_list = extract_file_paths(current_vid, '.jpg')

    #Sort frame list by the integer formed from every digit in the path.
    #NOTE(review): if the helper returns full paths, digits of the folder names
    #(e.g. "cholec80", "videoNN") are part of that integer too - confirm that frame
    #file names are either unpadded or uniformly zero-padded so the order stays correct.
    img_list.sort(key=lambda f: int(re.sub(r'\D', '', f)))

    #Current dataframe for current video: frame paths plus the video id
    video_df = pd.DataFrame({"image": img_list})
    video_df["video_num"] = video_num

    #Create tool annotation dataframe for current video
    tool_df = import_tool_annotation_file(video_num, tool_annotation_list)

    #Create phase annotation dataframe for current video
    phase_df = import_phase_annotation_file(video_num, phase_annotation_list)

    #Concat video dataframe with its phase and tool annotation files.
    #Column-wise concat aligns rows on the index; positions missing in one of the
    #inputs end up as NaN in the combined frame.
    video_df = pd.concat([video_df, phase_df], axis=1)
    video_df = pd.concat([video_df, tool_df], axis=1)

    #Make phase and frame columns lower case
    video_df = video_df.rename(columns={"Phase": "phase", "Frame": "frame"})

    video_frames.append(video_df)

#Stack all videos below the (initially empty) final dataframe in a single concat
df = pd.concat([df, *video_frames], ignore_index=True, sort=False)

100%|██████████| 80/80 [01:50<00:00,  1.38s/it]


In [5]:
#Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [6]:
#Get integer representation for video names.
#The number is parsed directly from the "videoNN" label so that video_int always
#equals the real video number. Category codes would silently renumber the
#remaining videos if any video had no rows left after dropping NA values, and the
#split lists that select on video_int would then pick the wrong videos.
#int8 matches the dtype the category codes produced.
df['video_int'] = (
    df['video_num']
    .str.extract(r'(\d+)', expand=False)
    .astype('int8')
)


In [23]:
#Fixed video-level split of the 80 video ids: 60 train, 10 validation, 10 test

#Train videos
train_set = [
    59, 57, 34, 32, 76, 30, 70, 54, 12, 22,
    7, 27, 52, 78, 77, 6, 17, 37, 45, 58,
    66, 20, 9, 14, 25, 62, 31, 74, 49, 65,
    38, 2, 24, 73, 18, 39, 60, 64, 63, 48,
    40, 15, 47, 29, 43, 56, 41, 19, 33, 1,
    13, 16, 79, 4, 68, 11, 69, 10, 71, 53,
]

#Validation videos
validation_set = [3, 5, 8, 21, 23, 26, 28, 35, 36, 42]

#Test videos
test_set = [44, 46, 50, 51, 55, 61, 67, 72, 75, 80]
       

In [24]:
#Split the full dataframe into train, validation and test dataframes by
#keeping the rows whose video_int is listed in the corresponding id list
train_df, val_df, test_df = (
    df[df['video_int'].isin(video_ids)]
    for video_ids in (train_set, validation_set, test_set)
)

In [28]:
#Write the full dataframe and each split to its own parquet file in the working directory
for out_frame, out_path in (
    (df, 'full_dataframe.parquet'),
    (train_df, 'train_df.parquet'),
    (val_df, 'val_df.parquet'),
    (test_df, 'test_df.parquet'),
):
    out_frame.to_parquet(out_path)

In [12]:
#Number of distinct videos whose video_int is greater than 48
len(df.loc[df['video_int'] > 48, 'video_num'].unique())

32

In [13]:
#Ordered split by video id: 1-40 train, 41-48 validation, 49 and above test.
#This reassigns train_df, val_df and test_df, replacing any split held in those names.
train_df = df[df['video_int'] < 41]
val_df = df[(df['video_int'] > 40) & (df['video_int'] < 49)]
test_df = df[df['video_int'] > 48]

#Make sure the output folder exists before writing: to_parquet does not create
#missing parent directories, so the export fails on a fresh checkout otherwise
os.makedirs('data', exist_ok=True)

#Output the three ordered split dataframes
train_df.to_parquet('data/ordered_train_df.parquet')
val_df.to_parquet('data/ordered_val_df.parquet')
test_df.to_parquet('data/ordered_test_df.parquet')

In [7]:
#Ordered split by video id: 1-32 train, 33-40 validation, 41 and above test.
#This reassigns train_df, val_df and test_df, replacing any split held in those names.
train_df = df[df['video_int'] < 33]
val_df = df[(df['video_int'] > 32) & (df['video_int'] < 41)]
test_df = df[df['video_int'] > 40]

#Make sure the output folder exists before writing: to_parquet does not create
#missing parent directories, so the export fails on a fresh checkout otherwise
os.makedirs('data', exist_ok=True)

#Output the three ordered split dataframes
train_df.to_parquet('data/ord_train_df.parquet')
val_df.to_parquet('data/ord_val_df.parquet')
test_df.to_parquet('data/ord_test_df.parquet')