# Imports

In [163]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import os

from moviepy.editor import *

import pytube
from pytube import YouTube
from pytube.exceptions import VideoUnavailable
from pytube.exceptions import VideoPrivate, AgeRestrictedError, \
LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked

from google.cloud import storage
from googleapiclient.discovery import build

import time
import timeout_decorator
from timeout_decorator import TimeoutError

# Global Variables

In [164]:
# Prefix of a YouTube watch link; the code below appends a video id to it.
URL = "https://www.youtube.com/watch?v="

# Root data directory. The code below reads "train.csv" from it and uses its
# "download/" and "upload/" sub-directories.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
PATH = "/home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/"

# Functions

In [3]:
# Load the CSV file and drop the videos whose time_end is greater than 300

def get_data(x):
    """Read ``train.csv`` from ``PATH`` and keep the usable part of its first ``x`` rows.

    Args:
        x: Number of leading CSV rows to consider (passed to ``DataFrame.head``).

    Returns:
        DataFrame of the considered rows without those whose ``time_end`` is
        greater than 300, re-indexed from 0.
    """
    leading_rows = pd.read_csv(PATH + "train.csv").head(x)
    # Negating the "too late" mask (instead of testing <= 300) keeps rows whose
    # time_end is missing, exactly as dropping the matching index labels does.
    ends_too_late = leading_rows['time_end'] > 300
    return leading_rows.loc[~ends_too_late].reset_index(drop=True)

In [4]:
# Drop videos longer than 300 seconds and add the "lenghts" column (video length in seconds)

def remove_long_videos(df):
    """Drop rows whose video is unavailable, too long, or shorter than its labelled clip.

    Every row's ``youtube_id`` is looked up through pytube. A row is kept when
    the video can be queried, is at most 300 seconds long and is long enough to
    contain ``time_end``. A video that is exactly 10 seconds long is treated as
    already clipped: it is kept and its ``time_start``/``time_end`` are reset to
    ``0`` and the video length.

    Args:
        df: DataFrame with ``youtube_id``, ``time_start`` and ``time_end``
            columns. It is not modified and may carry any index.

    Returns:
        A new DataFrame holding the kept rows, re-indexed from 0, with an extra
        ``lenghts`` column (sic - the misspelled name is kept so that existing
        consumers of the column keep working) containing each video's length.
    """
    # The result is re-indexed anyway, so normalising the index first lets the
    # loop address rows by position whatever index the caller passed in.
    test = df.copy().reset_index(drop=True)
    kept_rows = []
    # lengths of the kept videos, parallel to kept_rows
    lengths = []
    # count is just a progress indicator
    count = 0
    for video in range(len(test)):
        url = "https://www.youtube.com/watch?v=" + test.loc[video, "youtube_id"]
        try:
            yt = YouTube(url)
            # Accessing the streams is what surfaces the availability errors
            # handled below; the value itself is not needed.
            yt.streams
            video_length = yt.length
        except (KeyError, VideoPrivate, VideoUnavailable, AgeRestrictedError,
                LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked):
            continue
        count = count + 1
        print(count)
        # drop the video if it is longer than 5 min
        if video_length > 300:
            continue
        if video_length == 10:
            # Already clipped upload: the clip is the whole video. Writing through
            # .loc updates the frame itself instead of a temporary (chained
            # assignment is not guaranteed to do so).
            test.loc[video, "time_start"] = 0
            test.loc[video, "time_end"] = video_length
        elif video_length < test.loc[video, "time_end"]:
            # the labelled clip ends after the end of the video
            continue
        kept_rows.append(video)
        lengths.append(video_length)
    # Select the survivors once, instead of dropping rows while iterating.
    return test.iloc[kept_rows].assign(lenghts=lengths).reset_index(drop=True)

In [5]:
# getting clean data

def get_clean_data(x):
    """Return the result of ``remove_long_videos`` applied to ``get_data(x)``.

    Args:
        x: Forwarded unchanged to ``get_data``.
    """
    return remove_long_videos(get_data(x))

## Functions for Local Testing - IGNORE!

In [6]:
# Frame rate that videoClip() passes to write_videofile() for the clipped output.
fps = 1

In [36]:
# downloading video

def videoDownload(filename, label=None):
    """Download one YouTube video (stream itag 18) into ``PATH + "download/"``.

    The file is saved as ``"<label>_<youtube id>.mp4"``. The label used to be
    read from the notebook-global ``df`` at a notebook-global loop index ``i``,
    so the name depended on whatever value ``i`` happened to hold. It is now
    derived from the id that was actually passed in.

    Args:
        filename: YouTube id of the video (despite the parameter name).
        label: Label to use in the file name. When omitted, the label of the
            first row of the notebook-global ``df`` whose ``youtube_id`` equals
            ``filename`` is used.

    Raises:
        KeyError: ``label`` was omitted and ``df`` has no row for ``filename``.
    """
    if label is None:
        matching_labels = df.loc[df.youtube_id == filename, "label"]
        if matching_labels.empty:
            raise KeyError(f"no row with youtube_id {filename!r} in df")
        label = matching_labels.iloc[0]
    video = YouTube(URL + filename)
    video.streams.get_by_itag(18).download(PATH + "download/", filename = f"{label}_{filename}.mp4")

In [8]:
# Clip the video and remove the unclipped version

def videoClip(filename, time_start, time_stop):
    """Cut one downloaded video to ``[time_start, time_stop]`` and delete the download.

    Reads ``PATH + "download/<label>_<youtube_id>.mp4"``, writes the sub-clip to
    ``PATH + "upload/"`` under the same file name at the notebook-global ``fps``,
    then removes the downloaded file.

    Args:
        filename: Object exposing ``label`` and ``youtube_id`` attributes
            (despite the parameter name, not a plain string).
        time_start: Start of the sub-clip, passed to ``subclip``.
        time_stop: End of the sub-clip, passed to ``subclip``.
    """
    file_name = f"{filename.label}_{filename.youtube_id}.mp4"
    source_path = PATH + "download/" + file_name
    video_clip = VideoFileClip(source_path)
    try:
        trimmed_clip = video_clip.subclip(time_start, time_stop)
        trimmed_clip.write_videofile(PATH + "upload/" + file_name, fps)
    finally:
        # Release the reader on the source file before the file is deleted,
        # also when writing fails.
        video_clip.close()
    os.remove(source_path)

In [9]:
# processing video

def clip_upload(df):
    """Download and clip every row of ``df``, in order.

    Args:
        df: DataFrame whose rows provide ``label``, ``youtube_id``,
            ``time_start`` and ``time_end``.
    """
    for i in range(len(df)):
        # Positional access, so the frame does not need a 0..n-1 index.
        row = df.iloc[i]
        print(f"---------- Video {i} ({row.label}) processing ---------- ")
        videoDownload(row.youtube_id)
        # videoClip reads .label and .youtube_id from its first argument, so it
        # needs the whole row rather than the bare id string.
        videoClip(row, row.time_start, row.time_end)

In [10]:
# return YouTube API json

def videoDuration(video_id):
    """Return the parsed JSON answer of the YouTube Data API ``videos`` endpoint.

    Requests ``part=contentDetails`` for ``video_id``.

    Args:
        video_id: YouTube id of the video to look up.

    Returns:
        The decoded JSON document of the response.

    NOTE(review): relies on a notebook-global ``API_KEY`` that is not defined in
    the visible cells; it has to be set before this function is called.
    """
    # Imported here because the notebook's import cell provides neither module.
    import json
    import urllib.parse
    import urllib.request

    # urlencode escapes the values, so an id or key containing reserved
    # characters cannot break the query string.
    query = urllib.parse.urlencode({"id": video_id, "key": API_KEY, "part": "contentDetails"})
    searchUrl = "https://www.googleapis.com/youtube/v3/videos?" + query
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(searchUrl) as response:
        data = json.loads(response.read())

    return data

# Testing

In [23]:
# Consider at most the first 500000 rows of train.csv; get_data() also removes
# the rows whose time_end is greater than 300.
df = get_data(500000)

In [24]:
df.shape

(467068, 5)

In [153]:
df.head(10)

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,clay pottery making,---0dWlqevI,19,29,train
1,news anchoring,---aQ-tA5_A,9,19,train
2,using bagging machine,---j12rm3WI,14,24,train
3,javelin throw,--07WQ2iBlw,1,11,train
4,climbing a rope,--0NTAs-fA0,29,39,train
5,sipping cup,--0l35AkU34,68,78,train
6,flipping pancake,--33Lscn6sk,4,14,train
7,tickling,--3OAstUWtU,45,55,train
8,watering plants,--3lTx87ebQ,23,33,train
9,eating spaghetti,--3ouPhoy2A,20,30,train


In [26]:
df["label"].value_counts()

somersaulting                954
front raises                 947
eating ice cream             945
slapping                     943
crossing river               942
                            ... 
decoupage                    398
doing sudoku                 388
carving wood with a knife    356
flint knapping               335
crocheting                   331
Name: label, Length: 700, dtype: int64

In [27]:
# Shuffle all rows, then keep the first 50 rows met for each label, i.e. a random
# selection of at most 50 clips per label.
# NOTE(review): sample() is called without random_state, so every run selects
# different rows - pass a fixed seed if the selection has to be repeatable.
df_test = df.sample(frac = 1).groupby("label", sort = False).head(50)

In [28]:
df_test["label"].value_counts()

karaoke               50
brushing hair         50
bodysurfing           50
tying bow tie         50
shopping              50
                      ..
tap dancing           50
parasailing           50
bench pressing        50
pouring milk          50
shining flashlight    50
Name: label, Length: 700, dtype: int64

In [149]:
df.loc[30]

label         doing jigsaw puzzle
youtube_id            --GEr5-PyTI
time_start                      0
time_end                       10
split                       train
Name: 30, dtype: object

In [154]:
df.loc[31]

label         playing with trains
youtube_id            --GaSxELz-8
time_start                     39
time_end                       49
split                       train
Name: 31, dtype: object

In [157]:
df.loc[df["youtube_id"] == "--4NLFGNfAs"]

Unnamed: 0,label,youtube_id,time_start,time_end,split
11,calligraphy,--4NLFGNfAs,11,21,train


## New upload possibility

In [161]:
# In order to properly work, functions need following imports:
# import time
# import timeout_decorator
# from timeout_decorator import TimeoutError

# The @timeout_decorator.timeout decorator around the downloadVideo() function interrupts the download if it lasts more than 5 seconds
# by raising a TimeoutError, which is then caught by the try/except block in the main loop

# tests:
#    - 1. attempt - 100 datapoints, 55/100 successfully downloaded and clipped, 5min:20s
#    - 2. attempt - 100 datapoints, 61/100 successfully downloaded and clipped, 3min:52s
#    - 3. attempt - 100 datapoints, 58/100 successfully downloaded and clipped, 3min:40s
#    - 4. attempt - 1000 datapoints, 571/1000 successfully downloaded and clipped, 56min:49s

# results would probably be better with a wired connection instead of Wi-Fi (I will test it tomorrow)

In [135]:
@timeout_decorator.timeout(5, use_signals=False)
def downloadVideo(data):
    """Download the video of one row (stream itag 18) into ``PATH + "download/"``.

    The file is saved as ``"<label>_<youtube_id>.mp4"``. The decorator limits the
    call to 5 seconds.

    Args:
        data: Object exposing ``label`` and ``youtube_id`` attributes.
    """
    selected_stream = YouTube(URL + data.youtube_id).streams.get_by_itag(18)
    selected_stream.download(
        PATH + "download/",
        filename = f"{data.label}_{data.youtube_id}.mp4",
    )

In [171]:
def clipVideo(data, fps = 1):
    """Turn one downloaded video into its labelled clip and delete the download.

    The source is ``PATH + "download/<label>_<youtube_id>.mp4"`` and the result
    is written under the same file name to ``PATH + "upload/"``.

    * A video whose duration truncates to 10 seconds is written out as a whole.
    * A video that ends less than one second after ``data.time_end`` is discarded.
    * Any other video is cut to ``[data.time_start, data.time_end]``.

    Args:
        data: Object exposing ``label``, ``youtube_id``, ``time_start`` and
            ``time_end`` attributes.
        fps: Frame rate of the written clip. It now applies to every written
            clip; the whole-video case used to ignore it.

    Returns:
        True when a clip was written. False when the video was too short, or
        when no downloaded file exists - that case used to return True, which
        made the caller count a missing download as a success.
    """
    file_name = f"{data.label}_{data.youtube_id}.mp4"
    source_path = PATH + "download/" + file_name
    target_path = PATH + "upload/" + file_name

    if not os.path.isfile(source_path):
        return False

    video_clip = VideoFileClip(source_path)
    try:
        if int(video_clip.duration) == 10:
            # already a 10 second video: keep it whole
            video_clip.write_videofile(target_path, fps = fps)
            clipped = True
        elif video_clip.duration - 1 < data.time_end:
            # the labelled clip would run past the end of the video
            clipped = False
        else:
            video_clip.subclip(data.time_start, data.time_end).write_videofile(target_path, fps = fps)
            clipped = True
    finally:
        # Release the reader on the source file before the file is deleted,
        # also when writing fails.
        video_clip.close()
    os.remove(source_path)
    return clipped

In [126]:
# cannot use it, not sufficient access rights

# Cloud Storage client and bucket handle, created when this cell runs;
# uploadCloud() below uploads through `bucket`.
client = storage.Client()
bucket = client.get_bucket('737-human-action-recognition-bucket')

# code validation required

def uploadCloud(data):
    """Upload one clip from ``PATH + "upload/"`` to the bucket, then delete it locally.

    The object is stored as ``train_data/<label>/<label>_<youtube_id>.mp4``.

    Args:
        data: Object exposing ``label`` and ``youtube_id`` attributes.
    """
    local_clip = PATH + "upload/" + f"{data.label}_{data.youtube_id}.mp4"
    target_blob = bucket.blob(f"train_data/{data.label}/{data.label}_{data.youtube_id}.mp4")
    target_blob.upload_from_filename(local_clip)
    # the local copy is only removed once the upload call has returned
    os.remove(local_clip)

In [166]:
%%time

# Download and clip rows 0 .. n-1 of df, counting the clips that were written.
n = 1000
counter = 0

for i in range(n):
    try:
        downloadVideo(df.loc[i])
        print("---------------")
        print(f"{i} ({df.youtube_id[i]}) - downloaded!")

        if not clipVideo(df.loc[i], fps = 10):
            print(f"{i} ({df.youtube_id[i]}) - NOT CLIPPED!")
            print("---------------")
        else:
            print(f"{i} ({df.youtube_id[i]}) - clipped!")
            print("---------------")
            counter += 1

        #uploadCloud(df.loc[i])

    except (
        TimeoutError, KeyError, VideoUnavailable, VideoPrivate,
        AgeRestrictedError, LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked,
    ):
        # A timed-out or unavailable video is reported and skipped.
        print("---------------")
        print(f"{i} ({df.youtube_id[i]}) - FAILED!")
        print("---------------")
        # Remove whatever the interrupted download left behind.
        if os.path.isfile(PATH + "download/" + f"{df.loc[i].label}_{df.loc[i].youtube_id}.mp4"):
            os.remove(PATH + "download/" + f"{df.loc[i].label}_{df.loc[i].youtube_id}.mp4")

print(f"Number of successfully downloaded and clipped videos: {counter} / {n}")

---------------
0 (---0dWlqevI) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/clay pottery making_---0dWlqevI.mp4.
MoviePy - Writing audio in clay pottery making_---0dWlqevITEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/clay pottery making_---0dWlqevI.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/clay pottery making_---0dWlqevI.mp4
0 (---0dWlqevI) - clipped!
---------------
---------------
1 (---aQ-tA5_A) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/news anchoring_---aQ-tA5_A.mp4.
MoviePy - Writing audio in news anchoring_---aQ-tA5_ATEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/news anchoring_---aQ-tA5_A.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/news anchoring_---aQ-tA5_A.mp4
1 (---aQ-tA5_A) - clipped!
---------------


KeyboardInterrupt: 

# Real Work

In [141]:
# Load the reduced training list.
# NOTE(review): absolute, machine-specific path that does not go through PATH -
# adjust before running elsewhere.
df_reduced = pd.read_csv("/home/lockke/code/Koprivnica/737-human-action-recognition/raw_data/train_reduced.csv")

In [142]:
df_reduced.shape

(77000, 5)

In [143]:
df_reduced.head()

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,clay pottery making,LHtsiogBKvM,34,44,train
1,clay pottery making,K4O9S_gpgbY,64,74,train
2,clay pottery making,kVAScTNRnwA,72,82,train
3,clay pottery making,RE9bEWMzVtg,0,10,train
4,clay pottery making,4mbdJHOnPuA,1,11,train


In [145]:
df_reduced.tail()

Unnamed: 0,label,youtube_id,time_start,time_end,split
76995,breaking glass,bwEJAObcdxc,24,34,train
76996,breaking glass,iUZnFrbDX9U,13,23,train
76997,breaking glass,QesfpvuqDOk,7,17,train
76998,breaking glass,wJOzuYXBaO8,0,10,train
76999,breaking glass,GjOO4z2Phd4,0,10,train


In [150]:
df_reduced.loc[52502]

label         mopping floor
youtube_id      kFVPkQKat94
time_start               66
time_end                 76
split                 train
Name: 52502, dtype: object

In [156]:
df_reduced["label"].value_counts()

clay pottery making             110
laughing                        110
wading through mud              110
changing wheel (not on bike)    110
getting a piercing              110
                               ... 
golf driving                    110
casting fishing line            110
punching bag                    110
sleeping                        110
breaking glass                  110
Name: label, Length: 700, dtype: int64

In [167]:
# Labels from position 525 onwards, in order of first appearance in df_reduced.
labels = df_reduced["label"].unique()[525:]
# Rows carrying one of those labels; reset_index() without drop=True keeps the
# original row numbers in a new "index" column.
my_df = df_reduced[df_reduced["label"].isin(labels)].reset_index()

In [168]:
my_df

Unnamed: 0,index,label,youtube_id,time_start,time_end,split
0,57750,chiseling wood,vglFM8FPf68,45,55,train
1,57751,chiseling wood,VNORa36t2FU,24,34,train
2,57752,chiseling wood,fy7o8xDBsms,12,22,train
3,57753,chiseling wood,7-QF8ltNa04,27,37,train
4,57754,chiseling wood,YhZe_Vq2ZpM,4,14,train
...,...,...,...,...,...,...
19245,76995,breaking glass,bwEJAObcdxc,24,34,train
19246,76996,breaking glass,iUZnFrbDX9U,13,23,train
19247,76997,breaking glass,QesfpvuqDOk,7,17,train
19248,76998,breaking glass,wJOzuYXBaO8,0,10,train


In [169]:
my_df.shape

(19250, 6)

In [127]:
%%time

#my_df_croped = remove_long_videos(my_df)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [172]:
%%time

# Download and clip rows 0 .. n-1 of df, counting the clips that were written.
n = 10
counter = 0

for i in range(n):
    try:
        downloadVideo(df.loc[i])
        print("---------------")
        print(f"{i} ({df.youtube_id[i]}) - downloaded!")

        if not clipVideo(df.loc[i], fps = 10):
            print(f"{i} ({df.youtube_id[i]}) - NOT CLIPPED!")
            print("---------------")
        else:
            print(f"{i} ({df.youtube_id[i]}) - clipped!")
            print("---------------")
            counter += 1

        #uploadCloud(df.loc[i])

    except (
        TimeoutError, KeyError, VideoUnavailable, VideoPrivate,
        AgeRestrictedError, LiveStreamError, RecordingUnavailable, MembersOnly, VideoRegionBlocked,
    ):
        # A timed-out or unavailable video is reported and skipped.
        print("---------------")
        print(f"{i} ({df.youtube_id[i]}) - FAILED!")
        print("---------------")
        # Remove whatever the interrupted download left behind.
        if os.path.isfile(PATH + "download/" + f"{df.loc[i].label}_{df.loc[i].youtube_id}.mp4"):
            os.remove(PATH + "download/" + f"{df.loc[i].label}_{df.loc[i].youtube_id}.mp4")

print(f"Number of successfully downloaded and clipped videos: {counter} / {n}")

---------------
0 (---0dWlqevI) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/clay pottery making_---0dWlqevI.mp4.
MoviePy - Writing audio in clay pottery making_---0dWlqevITEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/clay pottery making_---0dWlqevI.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/clay pottery making_---0dWlqevI.mp4
0 (---0dWlqevI) - clipped!
---------------
---------------
1 (---aQ-tA5_A) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/news anchoring_---aQ-tA5_A.mp4.
MoviePy - Writing audio in news anchoring_---aQ-tA5_ATEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/news anchoring_---aQ-tA5_A.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/news anchoring_---aQ-tA5_A.mp4
1 (---aQ-tA5_A) - clipped!
---------------
---------------
2 (---j12rm3WI) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/using bagging machine_---j12rm3WI.mp4.
MoviePy - Writing audio in using bagging machine_---j12rm3WITEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/using bagging machine_---j12rm3WI.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/using bagging machine_---j12rm3WI.mp4
2 (---j12rm3WI) - clipped!
---------------
---------------
3 (--07WQ2iBlw) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/javelin throw_--07WQ2iBlw.mp4.
MoviePy - Writing audio in javelin throw_--07WQ2iBlwTEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/javelin throw_--07WQ2iBlw.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/javelin throw_--07WQ2iBlw.mp4
3 (--07WQ2iBlw) - clipped!
---------------
---------------
4 (--0NTAs-fA0) - downloaded!
4 (--0NTAs-fA0) - NOT CLIPPED!
---------------
---------------
5 (--0l35AkU34) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/sipping cup_--0l35AkU34.mp4.
MoviePy - Writing audio in sipping cup_--0l35AkU34TEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/sipping cup_--0l35AkU34.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/sipping cup_--0l35AkU34.mp4
5 (--0l35AkU34) - clipped!
---------------
---------------
6 (--33Lscn6sk) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/flipping pancake_--33Lscn6sk.mp4.
MoviePy - Writing audio in flipping pancake_--33Lscn6skTEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/flipping pancake_--33Lscn6sk.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/flipping pancake_--33Lscn6sk.mp4
6 (--33Lscn6sk) - clipped!
---------------
---------------
7 (--3OAstUWtU) - FAILED!
---------------
---------------
8 (--3lTx87ebQ) - downloaded!
8 (--3lTx87ebQ) - NOT CLIPPED!
---------------
---------------
9 (--3ouPhoy2A) - downloaded!
Moviepy - Building video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/eating spaghetti_--3ouPhoy2A.mp4.
MoviePy - Writing audio in eating spaghetti_--3ouPhoy2ATEMP_MPY_wvf_snd.mp3


                                                                                                                        

MoviePy - Done.
Moviepy - Writing video /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/eating spaghetti_--3ouPhoy2A.mp4



                                                                                                                        

Moviepy - Done !
Moviepy - video ready /home/lockke/code/Koprivnica/737-human-action-recognition/737-human-action-recognition/data/upload/eating spaghetti_--3ouPhoy2A.mp4
9 (--3ouPhoy2A) - clipped!
---------------
Number of successfully downloaded and clipped videos: 7 / 10
CPU times: user 1.54 s, sys: 1.28 s, total: 2.82 s
Wall time: 27.1 s
