# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import os

from moviepy.editor import *

import pytube
from pytube import YouTube
from pytube.exceptions import VideoUnavailable
from pytube.exceptions import VideoPrivate, AgeRestrictedError, \
LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked

from google.cloud import storage
from googleapiclient.discovery import build

import time
import timeout_decorator
from timeout_decorator import TimeoutError

# Global Variables

In [2]:
URL = "https://www.youtube.com/watch?v="

PATH = "/home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/"

# Functions

In [3]:
# In order to properly work, functions need following imports:
# import time
# import timeout_decorator
# from timeout_decorator import TimeoutError

# @timeout_decorator around the downloadVideo() function interrupts the download if it lasts more than 5 seconds
# by throwing a TimeoutError, which is then caught by the try/except block in the main loop

# tests:
#    - 1. attempt - 100 datapoints, 55/100 successfully downloaded and clipped, 5min:20s
#    - 2. attempt - 100 datapoints, 61/100 successfully downloaded and clipped, 3min:52s
#    - 3. attempt - 100 datapoints, 58/100 successfully downloaded and clipped, 3min:40s
#    - 4. attempt - 1000 datapoints, 657/1000 successfully downloaded and clipped, 59min:18s


# results would be better with a wired connection instead of Wi-Fi (I will test it tomorrow)

In [4]:
@timeout_decorator.timeout(5, use_signals=False)
def downloadVideo(data):
    """Download one YouTube video into the local ``download/`` folder.

    The file is saved as ``<label>_<youtube_id>.mp4`` under ``PATH + "download/"``.
    The call is wrapped in a 5-second timeout (``use_signals=False``); per the
    notebook's notes, exceeding it surfaces as a ``TimeoutError`` in the caller.

    Args:
        data: a row-like object exposing ``youtube_id`` and ``label`` attributes
            (the callers pass ``df.loc[i]``).

    Returns:
        None. Any exception raised while resolving or downloading the stream
        propagates to the caller.
    """
    video = YouTube(URL + data.youtube_id)

    # itag 18 is requested explicitly; get_by_itag() yields None when that stream
    # is missing, in which case the .download call below raises AttributeError.
    # NOTE(review): itag 18 is presumably the 360p progressive MP4 - confirm.
    stream = video.streams.get_by_itag(18)

    target_dir = PATH + "download/"
    target_name = f"{data.label}_{data.youtube_id}.mp4"
    stream.download(target_dir, filename = target_name)

In [5]:
def clipVideo(data, fps = 1):
    """Cut the downloaded video down to its 10-second window and save it to ``upload/``.

    Reads ``PATH + "download/<label>_<youtube_id>.mp4"`` and writes the clip to
    ``PATH + "upload/<label>_<youtube_id>.mp4"``. The window is chosen as follows:

    * the video lasts 10 seconds (after truncating to an int): seconds 0-10 are kept;
    * the video ends before ``data.time_end``: the last 10 seconds are kept,
      or nothing is written when the video is not longer than 10 seconds;
    * otherwise: ``data.time_start`` to ``data.time_end`` is kept.

    The downloaded source file is always deleted once it has been opened.

    Args:
        data: a row-like object exposing ``label``, ``youtube_id``, ``time_start``
            and ``time_end`` attributes (the callers pass ``df.loc[i]``).
        fps: frame rate passed to ``write_videofile`` for the written clip.

    Returns:
        True when a clip was written to ``upload/``; False when the source file
        is missing or the video is too short to clip.
    """
    file_name = f"{data.label}_{data.youtube_id}.mp4"
    source_path = PATH + "download/" + file_name
    target_path = PATH + "upload/" + file_name

    # Nothing was downloaded, so nothing can be clipped: report failure instead
    # of letting the caller count a clip that does not exist.
    if not os.path.isfile(source_path):
        return False

    video_clip = VideoFileClip(source_path, verbose = False)
    try:
        duration = video_clip.duration

        if int(duration) == 10:
            start, end = 0, 10
        elif duration < data.time_end:
            # The annotated window runs past the end of the video: fall back to
            # the final 10 seconds, if the video is long enough to have them.
            if duration <= 10:
                return False
            start, end = duration - 10, duration
        else:
            start, end = data.time_start, data.time_end

        video_clip.subclip(start, end).write_videofile(target_path, fps = fps, logger = None)
        return True
    finally:
        # Release the reader before deleting the file it was reading from, and
        # delete the source even if closing fails.
        try:
            video_clip.close()
        finally:
            os.remove(source_path)

In [6]:
# Initialize the Google Cloud Storage client and the target bucket.
# Both names are module-level globals; the upload code in this notebook reads `bucket` directly.

# NOTE(review): the service-account key is referenced by an absolute path on one machine;
# that file has to exist locally for this cell to run.
client = storage.Client.from_service_account_json("/home/lockke/code/Koprivnica/gcp/peppy-webbing-332911-743c3173bc03.json")
bucket = client.get_bucket('737-human-action-recognition-bucket')

In [7]:
# code validation required

def uploadCloud(data):
    """Upload the clipped video for ``data`` from ``upload/`` to the GCP bucket.

    The local file ``PATH + "upload/<label>_<youtube_id>.mp4"`` is uploaded to
    ``train_data/<label>/<label>_<youtube_id>.mp4`` in the module-level ``bucket``.

    Args:
        data: a row-like object exposing ``label`` and ``youtube_id`` attributes
            (the callers pass ``df.loc[i]``).

    Returns:
        True when the file existed and was uploaded, False when there is no
        local file to upload.
    """
    file_name = f"{data.label}_{data.youtube_id}.mp4"
    local_path = PATH + "upload/" + file_name

    if not os.path.isfile(local_path):
        return False

    blob = bucket.blob(f"train_data/{data.label}/{file_name}")
    blob.upload_from_filename(local_path)

    # The local copy is kept after the upload; removing it is currently disabled:
    # os.remove(local_path)
    return True

# Real Work

## Preparing my part of dataset

In [8]:
df = pd.read_csv("/home/lockke/code/Koprivnica/737-human-action-recognition/raw_data/train_reduced.csv")

In [9]:
df.shape

(77000, 5)

In [10]:
df.head()

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,clay pottery making,LHtsiogBKvM,34,44,train
1,clay pottery making,K4O9S_gpgbY,64,74,train
2,clay pottery making,kVAScTNRnwA,72,82,train
3,clay pottery making,RE9bEWMzVtg,0,10,train
4,clay pottery making,4mbdJHOnPuA,1,11,train


In [11]:
df.tail()

Unnamed: 0,label,youtube_id,time_start,time_end,split
76995,breaking glass,bwEJAObcdxc,24,34,train
76996,breaking glass,iUZnFrbDX9U,13,23,train
76997,breaking glass,QesfpvuqDOk,7,17,train
76998,breaking glass,wJOzuYXBaO8,0,10,train
76999,breaking glass,GjOO4z2Phd4,0,10,train


In [12]:
df.loc[52502]

label         mopping floor
youtube_id      kFVPkQKat94
time_start               66
time_end                 76
split                 train
Name: 52502, dtype: object

In [13]:
df["label"].value_counts()

clay pottery making             110
laughing                        110
wading through mud              110
changing wheel (not on bike)    110
getting a piercing              110
                               ... 
golf driving                    110
casting fishing line            110
punching bag                    110
sleeping                        110
breaking glass                  110
Name: label, Length: 700, dtype: int64

In [14]:
# Keep only this notebook's share of the dataset: the labels from position 525 onward,
# in order of first appearance (175 of the 700 labels, as the value_counts outputs show).
# NOTE: this cell overwrites `df`, so it is not idempotent - running it a second time
# takes [525:] of the already-reduced 175 labels and leaves an empty frame.
# reset_index() without drop=True keeps the old row labels in a new "index" column.
labels = df["label"].unique()[525:]
df = df[df["label"].isin(labels)].reset_index()

In [15]:
df

Unnamed: 0,index,label,youtube_id,time_start,time_end,split
0,57750,chiseling wood,vglFM8FPf68,45,55,train
1,57751,chiseling wood,VNORa36t2FU,24,34,train
2,57752,chiseling wood,fy7o8xDBsms,12,22,train
3,57753,chiseling wood,7-QF8ltNa04,27,37,train
4,57754,chiseling wood,YhZe_Vq2ZpM,4,14,train
...,...,...,...,...,...,...
19245,76995,breaking glass,bwEJAObcdxc,24,34,train
19246,76996,breaking glass,iUZnFrbDX9U,13,23,train
19247,76997,breaking glass,QesfpvuqDOk,7,17,train
19248,76998,breaking glass,wJOzuYXBaO8,0,10,train


In [16]:
df.shape

(19250, 6)

In [17]:
len(df)

19250

In [18]:
df["label"].value_counts()

chiseling wood           110
air drumming             110
picking blueberries      110
being in zero gravity    110
dumpster diving          110
                        ... 
playing basketball       110
needle felting           110
carving marble           110
laying tiles             110
breaking glass           110
Name: label, Length: 175, dtype: int64

In [19]:
df.loc[2529]

index                 60279
label         yarn spinning
youtube_id      47y24Dh1Xjw
time_start               54
time_end                 64
split                 train
Name: 2529, dtype: object

## Downloading, clipping and uploading

In [20]:
%%time

# Range of row labels in `df` to process in this run (n_stop is exclusive).
n_start = 0
n_stop = 1

# Number of videos that were both downloaded and clipped.
counter = 0

# Any of these errors means "give up on this video and move on to the next one".
skippable_errors = (TimeoutError, KeyError, VideoUnavailable, VideoPrivate, OSError, IndexError, AttributeError,
                    AgeRestrictedError, LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked)
separator = "------------------------------"

for i in range(n_start, n_stop):
    print(separator)
    row = df.loc[i]
    print(f"{i} ({row.label}_{row.youtube_id})")
    try:
        downloadVideo(row)
        print("----- downloaded! -----")

        clipped = clipVideo(row, fps = 10)
        if not clipped:
            print("----- NOT CLIPPED! -----")
        else:
            print("----- clipped! -----")
            counter += 1

        # uploadCloud() checks for the clipped file itself, so it is called either way.
        if uploadCloud(row):
            print("----- uploaded! -----")

    except skippable_errors:
        print("--- NOT DOWNLOADED! ---")
        # Do not leave a partial or unprocessed download behind.
        leftover = PATH + "download/" + f"{row.label}_{row.youtube_id}.mp4"
        if os.path.isfile(leftover):
            os.remove(leftover)

    print(separator)

print(f"Number of successfully downloaded and clipped videos: {counter} / {n_stop - n_start}")

------------------------------
0 (chiseling wood_vglFM8FPf68)
----- downloaded! -----
----- clipped! -----
----- uploaded! -----
------------------------------
Number of successfully downloaded and clipped videos: 1 / 1
CPU times: user 301 ms, sys: 200 ms, total: 501 ms
Wall time: 6 s


# Testing and stuff

In [21]:
# getting the list of file names

file_names = os.listdir(PATH + "upload/")

len(file_names)

1

In [22]:
len(file_names)   

1

In [23]:
# Upload every file listed in `file_names` from the local upload/ folder to GCP.
# `counter` tallies the files that could not be found at upload time.
counter = 0

for elem in file_names:
    # The text before the first underscore is used as the bucket sub-folder
    # (presumably the label, given the "<label>_<youtube_id>.mp4" naming - this
    # assumes labels contain no underscore).
    gcp_folder = elem.partition("_")[0]
    local_file = PATH + "upload/" + f"{elem}"

    try:
        blob = bucket.blob(f"train_data/{gcp_folder}/{elem}")
        blob.upload_from_filename(local_file)
        print(f"{elem} - uploaded!")
    except FileNotFoundError:
        print(f"{elem} - NOT FOUND!")
        counter += 1

chiseling wood_vglFM8FPf68.mp4 - uploaded!


In [129]:
f"train_data/{gcp_folder}/{elem}"

'train_data/yoga/yoga_2WnSJQvhV0E.mp4'

In [130]:
PATH + "upload/" + f"{elem}"

'/home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/yoga_2WnSJQvhV0E.mp4'

In [176]:
%%time

# Debugging run of the download -> clip -> upload pipeline for a single row (8765).
# This is a near-copy of the main loop with different log messages.
n_start = 8765
n_stop = 8766

# Number of videos that were both downloaded and clipped.
counter = 0

for i in range(n_start, n_stop):
    try:
        downloadVideo(df.loc[i])
        print("---------------")
        print(f"{i} ({df.youtube_id[i]}) - downloaded!")
        
        if clipVideo(df.loc[i], fps = 10):
            print(f"{i} ({df.youtube_id[i]}) - clipped!")
            print("---------------")
            counter += 1
        else:
            print(f"{i} ({df.youtube_id[i]}) - NOT CLIPPED!")
            print("---------------")
        
        # uploadCloud() is called whether or not clipping succeeded.
        if uploadCloud(df.loc[i]):
            print("---------------")
            print(f"{i} ({df.youtube_id[i]}) - uploaded!")

    # NOTE: unlike the main loop, IndexError and AttributeError are not caught here,
    # so they propagate out of this cell. "FAILED!" covers any caught error in the try
    # body, not only download errors (the recorded output shows "downloaded!" followed
    # by "FAILED!" for this row).
    except (TimeoutError, KeyError, VideoUnavailable, VideoPrivate, OSError,\
            AgeRestrictedError, LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked):
        print("---------------")
        print(f"{i} ({df.youtube_id[i]}) - FAILED!")
        print("---------------")
        # Remove the downloaded file, if any, so a failed row leaves nothing in download/.
        if os.path.isfile(PATH + "download/" + f"{df.loc[i].label}_{df.loc[i].youtube_id}.mp4"):
            os.remove(PATH + "download/" + f"{df.loc[i].label}_{df.loc[i].youtube_id}.mp4")
        pass

print(f"Number of successfully downloaded and clipped videos: {counter} / {n_stop - n_start}")

---------------
8765 (yicaWdZhcfw) - downloaded!
---------------
8765 (yicaWdZhcfw) - FAILED!
---------------
Number of successfully downloaded and clipped videos: 0 / 1
CPU times: user 21 ms, sys: 70.6 ms, total: 91.7 ms
Wall time: 1.57 s


In [174]:
df.loc[8765]

index                  66515
label         herding cattle
youtube_id       yicaWdZhcfw
time_start                31
time_end                  41
split                  train
Name: 8765, dtype: object