In [1]:
from pytube import YouTube
import cv2
import os
from PIL import ImageStat, Image
import numpy as np
import pandas as pd
import operator
from functools import reduce
import shutil
from tqdm import tqdm
from imdb import IMDb

In [2]:
#index of the first df_tube row handled by this run; also used to name the output csv
OFFSET = 45

In [3]:
#genre names looked up in each movie's IMDb genres by imdb_info; the same
#names are used as the genre columns of df_result
lst_genres = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
]
#shared IMDb accessor used by imdb_info
ia = IMDb()

In [4]:
def get_trailer_info(cam, limit):
    """Collect per-frame colour statistics from the middle part of a video.

    The first and last ``limit`` fraction of the frames (measured against the
    frame count reported by the capture) are ignored. Frames whose three
    channel means are nearly identical (variance below 10) are skipped too;
    presumably this drops black / grey title cards.

    cam(cv2.VideoCapture): opened capture; it is read here but not released
    limit(float): fraction of frames to skip at each end, e.g. 0.05
    return(list): [lst_mean, lst_median, lst_var]; each one is a list of three
        per-channel lists in R, G, B order with one value per kept frame
    """
    lst_mean = [[] for _ in range(3)]
    lst_median = [[] for _ in range(3)]
    lst_var = [[] for _ in range(3)]
    total = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))
    up = int(total*limit)
    down = int(total*(1-limit))
    count = 0
    while True:
        ret, frame = cam.read()
        count += 1
        # Frames after `down` are never used, so stop decoding once past it.
        if not ret or count > down:
            break
        if count < up:
            continue
        # OpenCV decodes frames as BGR; convert so channel 0/1/2 really is
        # R/G/B, which is what the r_/g_/b_ feature columns assume.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = ImageStat.Stat(Image.fromarray(rgb_frame))
        if np.var(result.mean) >= 10:
            for i in range(3):
                lst_mean[i].append(result.mean[i])
                lst_median[i].append(result.median[i])
                lst_var[i].append(result.var[i])
    return [lst_mean, lst_median, lst_var]

In [5]:
def color_preprocessing(lst_mean,lst_median,lst_var):
    """Reduce per-frame channel statistics to 27 summary features.

    Each input holds three per-channel sequences. For every input, the mean,
    median and variance are taken along axis 1 (across frames), giving nine
    arrays of three values each.

    lst_mean(list): per-channel lists of ImageStat means
    lst_median(list): per-channel lists of ImageStat medians
    lst_var(list): per-channel lists of ImageStat variances
    return(list): nine arrays ordered mean/median/var of lst_mean, then of
        lst_median, then of lst_var
    """
    reducers = (np.mean, np.median, np.var)
    features = []
    for per_frame_stats in (lst_mean, lst_median, lst_var):
        for reducer in reducers:
            features.append(reducer(per_frame_stats, axis=1))
    return features

In [6]:
def trailer_to_dataframe(path, title, limit=0.05):
    """Build the colour-feature row for the mp4 file found in a directory.

    Every ``*.mp4`` entry in ``path`` is processed; if there are several, the
    row of the last one listed is returned.

    path(str): directory that contains the downloaded trailer
    title(str): movie title appended as the last element of the row
    limit(float): fraction of frames skipped at each end of the video
    return(list): [name, year, 27 colour features..., title], or [] when the
        directory holds no mp4 file
    """
    temp = []
    for i in os.listdir(path):
        if i.split('.')[-1] != 'mp4':
            continue
        # Open via the full path so this also works when path is not the cwd.
        cam = cv2.VideoCapture(os.path.join(path, i))
        try:
            lst = get_trailer_info(cam, limit)
        finally:
            # Release the handle so the caller can delete the file afterwards.
            cam.release()
        result = color_preprocessing(lst[0], lst[1], lst[2])
        # File names look like '<name> (<year>).mp4': strip ' (yyyy).mp4' for
        # the name and take the four characters before ').mp4' as the year.
        temp = [i[:-11], i[-9:-5]]
        temp.extend(reduce(operator.concat, [list(j) for j in result]))
        temp.append(title)
    return temp

In [7]:
#list the names of the mp4 files currently present in a directory
def _mp4_files(directory):
    return {f for f in os.listdir(directory) if f.split('.')[-1] == 'mp4'}


def download_trailer(offset):
    """Download one trailer, extract its features and add IMDb details.

    The trailer is downloaded into the current working directory, processed
    with trailer_to_dataframe and then deleted again. Download or processing
    failures are reported and result in an empty list (best effort).

    offset(int): row label in df_tube (loaded from ml-youtube.csv)
    return(list): colour-feature row extended with imdb_info(title), or []
        when the trailer could not be downloaded or processed
    """
    path = os.getcwd()
    base_link = 'https://www.youtube.com/watch?v='
    link = base_link + df_tube['youtubeId'][offset]
    name = df_tube['title'][offset]
    lst = []
    # Remember what was there before so only files created here get deleted.
    existing = _mp4_files(path)
    try:
        stream = YouTube(link).streams.filter(progressive=True, file_extension='mp4')\
            .order_by('resolution').last()
        if stream is None:
            raise LookupError('no progressive mp4 stream available')
        stream.download(path, filename=name)
        lst = trailer_to_dataframe(path, name)
    except Exception as exc:
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        tqdm.write('skipped %s: %r' % (name, exc))
        lst = []
    finally:
        # Always clean up, otherwise a leftover mp4 would be picked up by
        # trailer_to_dataframe on a later call.
        for leftover in _mp4_files(path) - existing:
            try:
                os.remove(os.path.join(path, leftover))
            except OSError as exc:
                tqdm.write('could not remove %s: %r' % (leftover, exc))
    if lst != []:
        lst.extend(imdb_info(name))
    return lst

In [8]:
def imdb_info(title):
    """Look up a movie on IMDb and return its metadata as a flat list.

    The first search hit for ``title`` is used. Any piece of information that
    IMDb does not provide is returned as '' instead of raising.

    title(str): title of movie
    return(list): one 0/1 flag per entry of lst_genres, then director,
        runtime, production country, three actors and the plot text; a list
        of '' of the same length when the search finds nothing
    """
    result = ia.search_movie(title)
    if not result:
        return [''] * (len(lst_genres) + 7)

    movie = ia.get_movie(result[0].getID())

    # IMDb spells some genres with hyphens ('Sci-Fi', 'Film-Noir') while
    # lst_genres uses underscores, so normalise before comparing.
    # NOTE(review): 'Children' may correspond to IMDb's 'Family' - confirm.
    temp_genres = {str(g).replace('-', '_') for g in (movie.get('genres') or [])}
    lst = [1 if genre in temp_genres else 0 for genre in lst_genres]

    directors = movie.get('directors') or []
    lst.append(str(directors[0]) if directors else '')

    runtime = movie.get('runtime') or []
    lst.append(int(runtime[0]) if runtime else '')

    countries = movie.get('production country') or []
    lst.append(str(countries[0]) if countries else '')

    # Always emit exactly three actor slots, padding with '' when the cast
    # list is shorter.
    actors = [str(a) for a in list(movie.get('actors') or [])[:3]]
    lst.extend(actors + [''] * (3 - len(actors)))

    # Each plot entry may carry an '::author' suffix; keep only the text.
    plot = ' '.join(p.split('::')[0] for p in (movie.get('plot') or []))
    lst.append(plot)
    return lst

In [9]:
#read in the movie trailer youtube csv file (download_trailer reads its youtubeId and title columns)
df_tube = pd.read_csv('ml-youtube.csv')
#preview the first rows
df_tube.head()

Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)


In [10]:
#number of rows in df_tube
len(df_tube)

25623

In [None]:
#get the info from movie trailers
#number of df_tube rows handled per run, starting at OFFSET
BATCH_SIZE = 500
df_result = pd.DataFrame(columns=['name','year','r_mm','g_mm','b_mm','r_mme','g_mme','b_mme','r_mv','g_mv','b_mv',
                                  'r_mem','g_mem','b_mem','r_meme','g_meme','b_meme','r_mev','g_mev','b_mev',
                                  'r_vm','g_vm','b_vm','r_vme','g_vme','b_vme','r_vv','g_vv','b_vv', 'title',
                                  'Action','Adventure','Animation','Children','Comedy','Crime','Documentary',
                                  'Drama','Fantasy','Film_Noir','Horror','Musical','Mystery','Romance','Sci_Fi',
                                  'Thriller','War','Western','director','runtime','production_country','actor_1',
                                  'actor_2','actor_3','plot'])
#clamp the end so the final batch does not index past the last row of df_tube
stop = min(OFFSET + BATCH_SIZE, len(df_tube))
count = 0
for idx in tqdm(range(OFFSET, stop)):
    lst = download_trailer(idx)
    if lst != []:
        #rows are written one at a time so an interrupted run keeps what it already collected
        df_result.loc[count] = lst
        count += 1

  0%|          | 2/500 [00:39<2:15:18, 16.30s/it]

In [None]:
#write this batch's rows to a csv file named after OFFSET
name = '{}.csv'.format(OFFSET)
df_result.to_csv(name)