In [1]:
import os
import re
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
#time is in the form XXH:XXm:XXs, so we need to convert it to seconds
def convert_time(time):
    """Convert a timestamp such as '00H:18m:28s' into a number of seconds.

    The fields are located by their unit letters rather than by fixed
    character offsets, so values that are not exactly two digits wide
    (e.g. '1H:02m:03s' or '100H:00m:00s') and values with surrounding
    whitespace are parsed correctly instead of failing or being mis-read.

    Parameters
    ----------
    time : str
        Timestamp in hours/minutes/seconds form, e.g. '00H:18m:28s'.
        The unit letters are matched case-insensitively and the trailing
        's' is optional.

    Returns
    -------
    int
        hours * 3600 + minutes * 60 + seconds.

    Raises
    ------
    TypeError
        If `time` is not a string (for example a NaN from an empty cell).
    ValueError
        If `time` does not look like an hours/minutes/seconds timestamp.
    """
    if not isinstance(time, str):
        raise TypeError('expected a time string, got {!r}'.format(time))

    #\W* between the fields tolerates ':' (or any other punctuation/space) as separator
    match = re.fullmatch(
        r'\s*(\d+)\s*[Hh]\W*(\d+)\s*[Mm]\W*(\d+)\s*[Ss]?\s*', time
    )
    if match is None:
        raise ValueError('time is not in the form XXH:XXm:XXs: {!r}'.format(time))

    hours, mins, secs = (int(field) for field in match.groups())
    return hours*3600 + mins*60 + secs

In [3]:
#load the phase annotations from the spreadsheet and print the (rows, columns) shape
phases = pd.read_excel('phases.xlsx')
print(phases.shape)

(767, 7)


In [4]:
#order the rows by vid_id from Z to A (reverse alphabetical), in order to include yale videos in training set
phases = phases.sort_values('vid_id', ascending=False)
phases.head()

Unnamed: 0,vid_id,phase,time_start,time_end,labeler,labeler_2,notes
665,yale_9,hiatal_dissec,00H:18m:28s,00H:21m:20s,sp,,
671,yale_9,hiatal_dissec,00H:41m:33s,00H:43m:43s,sp,,
659,yale_9,exposure,00H:00m:00s,00H:02m:06s,sp,,
660,yale_9,sac_reduction,00H:02m:07s,00H:06m:26s,sp,,
661,yale_9,hiatal_dissec,00H:06m:27s,00H:07m:56s,sp,,


In [5]:
#show the first five rows again (same view as the previous cell)
phases.head()

Unnamed: 0,vid_id,phase,time_start,time_end,labeler,labeler_2,notes
665,yale_9,hiatal_dissec,00H:18m:28s,00H:21m:20s,sp,,
671,yale_9,hiatal_dissec,00H:41m:33s,00H:43m:43s,sp,,
659,yale_9,exposure,00H:00m:00s,00H:02m:06s,sp,,
660,yale_9,sac_reduction,00H:02m:07s,00H:06m:26s,sp,,
661,yale_9,hiatal_dissec,00H:06m:27s,00H:07m:56s,sp,,


In [6]:
#for vid_4, keep only labeler 'kc': drop every vid_4 row annotated by anyone else
#the shape is printed before and after so the number of removed rows is visible
print(phases.shape)
vid_4_rows = phases['vid_id'].eq('vid_4')
other_labeler = phases['labeler'].ne('kc')
non_kc_vid_4 = phases[vid_4_rows & other_labeler]
phases = phases.drop(index=non_kc_vid_4.index).reset_index(drop=True)
print(phases.shape)

(767, 7)
(707, 7)


In [7]:
#relabel every 'sac_reduction' row as 'hiatal_dissec' (the two phases are merged)
is_sac_reduction = phases['phase'] == 'sac_reduction'
phases['phase'] = phases['phase'].mask(is_sac_reduction, 'hiatal_dissec')

In [8]:
#count the distinct vid_id values (a missing id would count as one value)
phases['vid_id'].nunique(dropna=False)

37

In [9]:
#count the rows annotated by each labeler
#NOTE(review): 'ck' is listed twice in the output below (161 and 1), which suggests one value carries stray whitespace; confirm and strip the column if so
phases['labeler'].value_counts()

kc     251
sp     177
ck     161
AB     117
ck       1
Name: labeler, dtype: int64

In [10]:
#remove leading and trailing whitespace from both timestamp columns
for time_col in ('time_start', 'time_end'):
    phases[time_col] = phases[time_col].str.strip()

In [11]:
#for each row in the dataframe, make sure that time_start and time_end can be parsed by convert_time
#rows that cannot be parsed are reported (index, then the raw start and end values); nothing is printed when all rows are valid
for idx, start_raw, end_raw in phases[['time_start', 'time_end']].itertuples(name=None):
    try:
        convert_time(start_raw)
        convert_time(end_raw)
    #ValueError: malformed string; TypeError: non-string value such as NaN
    #(a bare except would also swallow KeyboardInterrupt)
    except (ValueError, TypeError):
        print(idx)
        print(start_raw)
        print(end_raw)

In [12]:
#collect the distinct phase labels, in order of first appearance
phase_list = pd.unique(phases['phase'])
phase_list

array(['hiatal_dissec', 'exposure', 'other', 'oob', 'eso_mob',
       'hiatal_repair', 'wrap', 'sac_excision', 'fundus_mob',
       'peg_placement'], dtype=object)

In [13]:
#check the total amount of time in the dataset:
#add up the duration (end minus start, in seconds) of every annotated row
total_time = sum(
    convert_time(end_raw) - convert_time(start_raw)
    for start_raw, end_raw in zip(phases['time_start'], phases['time_end'])
)

#break the total down into hours, minutes, and seconds
hours, leftover = divmod(total_time, 3600)
mins, secs = divmod(leftover, 60)
print(hours, 'hrs,', mins, 'mins,', secs, 'secs')

48 hrs, 12 mins, 8 secs


In [14]:
#print the number of rows currently in phases
print(len(phases))

707


In [15]:
#if vid_id contains 'vid', then path starts with 'unc/raw_ids', elif vid_id contains 'yale', then path starts with 'yale/raw_ids'
def _raw_video_path(vid_id):
    """Return the raw .mp4 path for one vid_id.

    Ids containing 'vid' map to 'unc/raw_ids/<vid_id>.mp4' and ids containing
    'yale' map to 'yale/raw_ids/<vid_id>.mp4'. For any other id 'error' is
    printed and NaN is returned, so the row can be found afterwards by
    looking for non-string paths.
    """
    if 'vid' in vid_id:
        return 'unc/raw_ids/' + vid_id + '.mp4'
    if 'yale' in vid_id:
        return 'yale/raw_ids/' + vid_id + '.mp4'
    print('error')
    #np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    return np.nan

#build the whole column in one pass instead of writing strings row by row
#into a float NaN column (which pandas treats as deprecated upcasting)
phases['path'] = phases['vid_id'].map(_raw_video_path)

In [16]:
#for each row, look for a '<vid_id>_v2.mp4' file, first under unc/raw_ids and then under yale/raw_ids
#the first one that exists replaces the row's path; rows without a _v2 file keep their current path
for row_label, vid_id in phases['vid_id'].items():
    for raw_dir in ('unc/raw_ids/', 'yale/raw_ids/'):
        v2_path = raw_dir + vid_id + '_v2.mp4'
        if os.path.exists(v2_path):
            phases.loc[row_label, 'path'] = v2_path
            break
phases.head()

Unnamed: 0,vid_id,phase,time_start,time_end,labeler,labeler_2,notes,path
0,yale_9,hiatal_dissec,00H:18m:28s,00H:21m:20s,sp,,,yale/raw_ids/yale_9.mp4
1,yale_9,hiatal_dissec,00H:41m:33s,00H:43m:43s,sp,,,yale/raw_ids/yale_9.mp4
2,yale_9,exposure,00H:00m:00s,00H:02m:06s,sp,,,yale/raw_ids/yale_9.mp4
3,yale_9,hiatal_dissec,00H:02m:07s,00H:06m:26s,sp,,,yale/raw_ids/yale_9.mp4
4,yale_9,hiatal_dissec,00H:06m:27s,00H:07m:56s,sp,,,yale/raw_ids/yale_9.mp4


In [17]:
#find values in 'path' that are not strings (e.g. rows whose path was never assigned and is still NaN)
#an empty result means every row has a string path
phases[phases['path'].apply(lambda x: type(x) != str)]

Unnamed: 0,vid_id,phase,time_start,time_end,labeler,labeler_2,notes,path


In [18]:
FRAMES_DIR = 'all_frames'
SAMPLE_EVERY_SEC = 10   #a frame is saved at every timestamp that is a multiple of this many seconds
PHASE_BUFFER_SEC = 4    #seconds trimmed from the start and end of every phase except 'oob'


def extract_phase_frames(vid_id, vid_fname, phase, start_sec, end_sec):
    """Save one frame per sampled timestamp of a single phase segment.

    Frames are taken at every second in [start_sec, end_sec) that is a
    multiple of SAMPLE_EVERY_SEC and written to
    'all_frames/<vid_id>/<phase>/<vid_id>_<second>.jpg'.

    The video is opened once for the whole segment and always released,
    even if reading or writing a frame raises.

    Parameters
    ----------
    vid_id : str
        Video identifier, used for the output directory and file names.
    vid_fname : str
        Path of the video file to read.
    phase : str
        Phase label, used as the output sub-directory.
    start_sec, end_sec : int
        Segment boundaries in seconds (end is exclusive).

    Returns
    -------
    int
        Number of frames written.
    """
    #first multiple of SAMPLE_EVERY_SEC that is >= start_sec (ceil division)
    first_sample = -(-start_sec // SAMPLE_EVERY_SEC) * SAMPLE_EVERY_SEC
    sample_secs = range(first_sample, end_sec, SAMPLE_EVERY_SEC)
    if not sample_secs:
        return 0

    out_dir = '{}/{}/{}'.format(FRAMES_DIR, vid_id, phase)
    saved = 0
    cap = cv2.VideoCapture(vid_fname)
    try:
        if not cap.isOpened():
            #report instead of silently producing no frames for this segment
            print('could not open video:', vid_fname)
            return 0
        for sec in sample_secs:
            #seek by absolute position (milliseconds) and grab the frame found there
            cap.set(cv2.CAP_PROP_POS_MSEC, sec*1000)
            ret, frame = cap.read()
            if ret:
                #the directory is only created once there is a frame to store
                os.makedirs(out_dir, exist_ok=True)
                cv2.imwrite('{}/{}_{}.jpg'.format(out_dir, vid_id, sec), frame)
                saved += 1
    finally:
        cap.release()
    return saved


#delete any existing frames
if os.path.exists(FRAMES_DIR):
    shutil.rmtree(FRAMES_DIR)

segments = zip(phases['vid_id'], phases['path'], phases['phase'],
               phases['time_start'], phases['time_end'])
for vid_id, vid_fname, phase, time_start, time_end in segments:
    time_start_sec = convert_time(time_start)
    time_end_sec = convert_time(time_end)
    print(vid_fname)
    #if the phase is 'other', then skip it
    if phase == 'other':
        continue
    #if the phase is not 'oob', then add a 4 second buffer to the start and end times
    #('oob' segments are used without a buffer)
    if phase != 'oob':
        time_start_sec += PHASE_BUFFER_SEC
        time_end_sec -= PHASE_BUFFER_SEC
    extract_phase_frames(vid_id, vid_fname, phase, time_start_sec, time_end_sec)

yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_9.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_8.mp4
yale/raw_ids/yale_7.mp4
yale/raw_ids/yale_7.mp4
yale/raw_ids/yale_7.mp4
yale/raw_ids/yale_7.mp4
yale/raw_ids/yale_7.mp4
yale/raw_ids/yale_7.mp4
yale/raw_ids/yal