In [1]:
import pandas as pd
import numpy as np
import boto3
import cv2 as cv
import os
import json
from config import key_, secret_, s3_bucket, kaggle_cookie

In [None]:
def download_video_from_s3_bucket(video_name, aws_key=key_, aws_secret=secret_, bucket=s3_bucket):
    '''
    ##Intended for use when not using Sagemaker##
    Open a video stored in S3 through a short-lived presigned URL and report
    its frame count together with the shape of its first frame. Only the first
    frame is read; the remaining frames are never loaded.

    Parameters
    ----------
    video_name : object key of the video inside the bucket
    aws_key    : AWS access key id used to build the S3 client
    aws_secret : AWS secret access key used to build the S3 client
    bucket     : name of the S3 bucket that holds the video

    Returns
    -------
    (video_name, frame_count, frame_x, frame_y, RGB)
        frame_x, frame_y and RGB are the three entries of the first frame's
        ``.shape``, in that order.
        NOTE(review): confirm which image axis each of the first two entries
        corresponds to before relying on the x/y naming.

    Raises
    ------
    IOError
        if no first frame can be read from the video (for example when the
        presigned URL does not resolve to a readable video).
    '''
    s3 = boto3.client('s3',
                      # use the credentials passed by the caller, not the module-level globals
                      aws_access_key_id=aws_key,
                      aws_secret_access_key=aws_secret,
                      region_name='us-east-2', #region is hardcoded - this is not a security risk to keep public
                      config=boto3.session.Config(signature_version='s3v4')) #the sig version needs to be s3v4 or the url will error
    video_url = s3.generate_presigned_url('get_object',
                                          Params={'Bucket': bucket,
                                                  'Key': video_name},
                                          ExpiresIn=60)
    video = cv.VideoCapture(video_url)
    try:
        frame_count = int(video.get(cv.CAP_PROP_FRAME_COUNT))
        read_ok, frame_array = video.read()
        if not read_ok or frame_array is None:
            # fail with an explicit message instead of an AttributeError on None.shape
            raise IOError(f'could not read a first frame from {video_name!r}')
        frame_x, frame_y, RGB = frame_array.shape
    finally:
        # always free the capture handle, even when reading fails
        video.release()
    return video_name, frame_count, frame_x, frame_y, RGB

In [None]:
# Parse the metadata file; the handle is closed as soon as the block exits.
with open('meta.json') as meta_file:
    meta = json.load(meta_file)

In [None]:
# One independent accumulator list per output column.
video_names, frame_counts, x_shapes, y_shapes, RGB_shapes = [], [], [], [], []

In [None]:
# Collect frame count and first-frame shape for every entry yielded by iterating `meta`.
for video in meta:
    try:
        video_, frames, x_frames, y_frames, RGB = download_video_from_s3_bucket(video)
    except Exception:
        # Best effort: a video that cannot be read is recorded with all-zero values
        # so it can be identified later. `Exception` (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit stop this long-running loop.
        video_, frames, x_frames, y_frames, RGB = video, 0, 0, 0, 0
    video_names.append(video_)
    frame_counts.append(frames)
    x_shapes.append(x_frames)
    y_shapes.append(y_frames)
    RGB_shapes.append(RGB)

In [None]:
# Column name -> list of per-video values, in the column order used for the table.
video_information_dictionary = {
    'video_names': video_names,
    'number_of_frames': frame_counts,
    'number_of_x_pixels': x_shapes,
    'number_of_y_pixels': y_shapes,
    'RGB': RGB_shapes,
}

In [None]:
# Build the table: one column per dictionary key, one row per list position.
video_info_df = pd.DataFrame(data=video_information_dictionary)

In [None]:
# Save the table locally; the row index is written too, as the first (unnamed) CSV column.
video_info_df.to_csv('video_information.csv', index=True)

In [2]:
# S3 client built from the credentials imported from config.
s3 = boto3.client(
    's3',
    aws_access_key_id=key_,
    aws_secret_access_key=secret_,
)

In [None]:
# Upload the local CSV to the bucket, using the filename as the object key.
s3.upload_file(
    Filename='video_information.csv',
    Bucket=s3_bucket,
    Key='video_information.csv',
)

In [3]:
#load the per-video table back from the CSV file saved above
video_info_df = pd.read_csv(filepath_or_buffer='video_information.csv')

In [4]:
#show only the first row to check which columns were read
video_info_df.head(n=1)

Unnamed: 0.1,Unnamed: 0,video_names,number_of_frames,number_of_x_pixels,number_of_y_pixels,RGB
0,0,owxbbpjpch.mp4,300,1920,1080,3


In [5]:
#frame size: x pixels multiplied by y pixels for a single frame
video_info_df['total_pixels'] = video_info_df['number_of_x_pixels'].mul(video_info_df['number_of_y_pixels'])

#pixels across the whole video: frame size multiplied by the number of frames
#useful when estimating memory usage
video_info_df['pixels_by_frames'] = video_info_df['total_pixels'].mul(video_info_df['number_of_frames'])

In [6]:
#drop index column from when the DF was previously saved as a csv file
#errors='ignore' keeps this cell re-runnable: the CSV is later re-saved with
#index=False, so on a second pass the 'Unnamed: 0' column no longer exists
video_info_df = video_info_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
#show the first five rows of the table
video_info_df.head(n=5)

Unnamed: 0,video_names,number_of_frames,number_of_x_pixels,number_of_y_pixels,RGB,total_pixels,pixels_by_frames
0,owxbbpjpch.mp4,300,1920,1080,3,2073600,622080000
1,vpmyeepbep.mp4,300,1920,1080,3,2073600,622080000
2,fzvpbrzssi.mp4,300,1920,1080,3,2073600,622080000
3,htorvhbcae.mp4,300,1920,1080,3,2073600,622080000
4,fckxaqjbxk.mp4,300,1920,1080,3,2073600,622080000


In [8]:
#smallest per-frame pixel count in the table
#a result of 0 means there are some missing videos in the bucket
video_info_df.loc[:, 'total_pixels'].min()

0

In [9]:
#keep only the rows whose number_of_frames is 0
missing_videos_df = video_info_df.loc[video_info_df['number_of_frames'].eq(0)]
#plain Python list of the video names in those rows
missing_videos = missing_videos_df.loc[:, 'video_names'].tolist()
print(f'there are {len(missing_videos)} videos potentially missing from bucket')

there are 8 videos potentially missing from bucket


In [10]:
#make additional attempt to obtain info from each video
for video in missing_videos:
    try:
        # only success/failure matters here, so the returned values are discarded
        download_video_from_s3_bucket(video)
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts the loop
        print(f'{video} is not in the S3 bucket')
    else:
        print(f'{video} is in the S3 bucket')

wipjitfmta.mp4 is not in the S3 bucket
wpuxmawbkj.mp4 is not in the S3 bucket
pvohowzowy.mp4 is not in the S3 bucket
innmztffzd.mp4 is not in the S3 bucket
cfxiikrhep.mp4 is not in the S3 bucket
dzjjtfwiqc.mp4 is not in the S3 bucket
zzfhqvpsyp.mp4 is not in the S3 bucket
glleqxulnn.mp4 is not in the S3 bucket


In [11]:
#keep only the rows with a non-zero frame count (drops the missing videos)
video_info_df = video_info_df.loc[video_info_df['number_of_frames'].ne(0)]

In [12]:
#rows tied for the highest number of frames
video_info_df.loc[video_info_df['number_of_frames'].eq(video_info_df['number_of_frames'].max())]

Unnamed: 0,video_names,number_of_frames,number_of_x_pixels,number_of_y_pixels,RGB,total_pixels,pixels_by_frames
54987,qanpwofprw.mp4,601,1920,1080,3,2073600,1246233600
54989,vninpbciju.mp4,601,1920,1080,3,2073600,1246233600
54993,xujbhvdrlx.mp4,601,1920,1080,3,2073600,1246233600
54996,kxxmijtqwn.mp4,601,1920,1080,3,2073600,1246233600
55000,pgvxcsjbts.mp4,601,1920,1080,3,2073600,1246233600
55002,nbyhqpaumb.mp4,601,1920,1080,3,2073600,1246233600
55003,ucuckkkwiz.mp4,601,1920,1080,3,2073600,1246233600
55012,tdytyxyttv.mp4,601,1920,1080,3,2073600,1246233600
55013,cyxdgyisae.mp4,601,1920,1080,3,2073600,1246233600
55020,emocjwfcqr.mp4,601,1920,1080,3,2073600,1246233600


In [13]:
#rows tied for the largest pixels_by_frames value
#it would take approximately 6.7 GB to store one of these videos in an array
#this is reduced to ~1.11 GB with the improved download_video_from_s3_bucket function that doesn't store all frames
video_info_df.loc[video_info_df['pixels_by_frames'].eq(video_info_df['pixels_by_frames'].max())]

Unnamed: 0,video_names,number_of_frames,number_of_x_pixels,number_of_y_pixels,RGB,total_pixels,pixels_by_frames
85690,ldbldpvjab.mp4,269,3840,2160,3,8294400,2231193600
85694,qchgluoajg.mp4,269,3840,2160,3,8294400,2231193600
85695,pydicxfaui.mp4,269,3840,2160,3,8294400,2231193600
85697,ceoyuhoxof.mp4,269,3840,2160,3,8294400,2231193600
85698,wktdezchfx.mp4,269,3840,2160,3,8294400,2231193600
85700,zsqjihgilv.mp4,269,3840,2160,3,8294400,2231193600
85701,dsxvketzmt.mp4,269,3840,2160,3,8294400,2231193600
85703,wkjhpzukba.mp4,269,3840,2160,3,8294400,2231193600
85704,zcpdiajsfj.mp4,269,3840,2160,3,8294400,2231193600
85705,cngogfretb.mp4,269,3840,2160,3,8294400,2231193600


In [14]:
#write the updated table back to the CSV file, without the row index
video_info_df.to_csv(path_or_buf='video_information.csv', index=False)

In [15]:
#replace the CSV in the S3 bucket with the updated local file
s3.upload_file(
    Filename='video_information.csv',
    Bucket=s3_bucket,
    Key='video_information.csv',
)