In [57]:
'''
This script produces the initial cleaned version of the
movie_metadata CSV retrieved from kaggle.com
'''

import pandas as pd 
import numpy as np 

movie_df = pd.read_csv('movie_metadata.csv')

# Drop the columns that are not used by the rest of the analysis.
delete = ['color', 'num_critic_for_reviews', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'movie_imdb_link', 'country', 'language',
          'aspect_ratio', 'num_user_for_reviews', 'num_voted_users', 'actor_2_facebook_likes', 'content_rating', 'director_facebook_likes',
          'cast_total_facebook_likes', 'movie_facebook_likes']
movie_df = movie_df.drop(columns=delete)

# Treat empty strings as missing values, then drop every row that has a
# missing cell in any column. This is done on the whole frame rather than via
# `movie_df[c].replace(..., inplace=True)`, which modifies a temporary Series
# (and therefore does nothing) under pandas copy-on-write.
movie_df = movie_df.replace('', np.nan).dropna().reset_index(drop=True)

# Parse the pipe-separated genre string of every row once; the parsed lists
# are reused for both counting and filtering below.
genre_lists = [value.split('|') for value in movie_df['genres']]

# Count number of movies that belong to each genre
genres_ct = {}
for g in genre_lists:
    for item in g:
        genres_ct[item] = genres_ct.get(item, 0) + 1

# Get the top 5 genres. Sorting (count, name) tuples in descending order
# breaks ties between equal counts by genre name, descending.
sortedCounts = sorted(((count, name) for name, count in genres_ct.items()), reverse=True)
top_genres = [name for _, name in sortedCounts[:5]]

# Keep only the rows that belong to at least one of the top 5 genres.
topg_set = set(top_genres)
keep_mask = np.array([not topg_set.isdisjoint(g) for g in genre_lists], dtype=bool)
movie_df = movie_df.loc[keep_mask].reset_index(drop=True)

# The index is written as the first (unnamed) column; the merge step later in
# this file removes that 'Unnamed: 0' column after reading the file back.
movie_df.to_csv('movie_clean_metadata.csv')

In [31]:
'''
This script is responsible for scraping 
additional movie data from imdb.com
'''

from bs4 import BeautifulSoup
import urllib.request
import csv
import re
import datetime
import cognitive_face as CF
import time

# Create or overwrite the 'movie_scrape_2012.csv' output file and write the
# header row. The file is left open because scrapeData() appends one row per
# movie through the module-level `csv_writer`.
# newline='' lets the csv module control line endings (otherwise blank rows
# appear between records on Windows); the explicit UTF-8 encoding keeps
# non-ASCII titles and names writable regardless of the platform default.
columns = ['director_name', 'duration', 'actor_2_name', 'gross', 'genres', 'actor_1_name', 'movie_title', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'budget', 'title_year', 'imdb_score']
csv_file = open('movie_scrape_2012.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(columns)

# Helper function to remove all non-numeric characters from a string
def removeNonNumeric(str):
    """Return `str` with every character other than the digits 0-9 removed."""
    non_digits = re.compile("[^0-9]")
    return non_digits.sub("", str)

# Function to scrape data from IMDB box office results
def scrapeData(soup):
    """Scrape every movie listed on one IMDb search-results page.

    For each 'lister-item-header' entry in `soup`, the movie's own page is
    downloaded and parsed, and one row (in the order of the module-level
    `columns` header) is written through the module-level `csv_writer`.
    The function sleeps two seconds after each movie.

    Parameters:
        soup: parsed results page (a BeautifulSoup object).

    Returns:
        The 'next page' anchor tag of the results page, or None when the
        page has no such link.
    """
    # Finds the link for next page (None when the pagination block or the
    # link itself is absent).
    pagination = soup.find('div', {'class': 'desc'})
    link_tag = None
    if pagination is not None:
        link_tag = pagination.find('a', {'class': 'lister-page-next next-page'})

    # NOTE(security): this API key is hard-coded in the source; it should be
    # loaded from configuration/environment and rotated. Kept unchanged here.
    # Setting it once per page is enough; it does not vary per movie.
    KEY = '44053727cda344da8fbf58088dce0044'
    CF.Key.set(KEY)

    for title in soup.find_all('h3', class_='lister-item-header'):

        anchor = title.find('a')
        movie_title = anchor.text
        movie_url = anchor['href']
        # Strip the leading slash so the joined URL has no double slash, and
        # close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen('https://www.imdb.com/' + movie_url.lstrip('/')) as response:
            msoup = BeautifulSoup(response.read(), 'html.parser')

        rating_spans = msoup.find_all('span', attrs={'itemprop': 'ratingValue'})
        imdb_score = rating_spans[0].text if rating_spans else ''

        # Genres are joined with a bare '|' (names stripped) so the value can
        # be split back with split('|') like the 'genres' column of the
        # existing data set.
        genre_divs = msoup.find_all('div', itemprop='genre')
        if genre_divs:
            genres = '|'.join(a.text.strip() for a in genre_divs[0].find_all('a'))
        else:
            genres = ''

        director_tags = msoup.find_all(attrs={'itemprop': 'director'})
        director_span = director_tags[0].find('span') if director_tags else None
        director_name = director_span.text if director_span is not None else ''

        # Only the first three listed actors are recorded; any further
        # entries are ignored instead of being appended to actor_3_name.
        actor_names = []
        for s in msoup.find_all(attrs={'itemprop': 'actors'}):
            actor_span = s.find('span')
            if actor_span is not None:
                actor_names.append(actor_span.text)
        actor_1_name, actor_2_name, actor_3_name = (actor_names + ['', '', ''])[:3]

        budget, gross, title_year, duration = '', '', '', ''
        details = msoup.find_all('div', id='titleDetails')
        details_divs = details[0].find_all('div') if details else []
        for d in details_divs:
            heading = d.find('h4')
            if heading is None:
                continue
            label = heading.text
            if label == 'Budget:':
                budget = removeNonNumeric(d.text.replace('\n', ''))
            elif label == 'Gross:':
                gross = removeNonNumeric(d.text.replace('\n', ''))
            elif label == 'Release Date:':
                # The date is looked up on the second text line of the div
                # when there is one. A missing line or missing year leaves
                # title_year empty without discarding the other details.
                text_lines = d.text.split('\n')
                date = text_lines[1] if len(text_lines) > 1 else text_lines[0]
                year_match = re.search(r'\d{4}', date)
                if year_match is not None:
                    title_year = year_match.group(0)
            elif label == 'Runtime:':
                duration = removeNonNumeric(d.text.replace('\n', ''))

        keywords_span = msoup.find_all('span', attrs={'itemprop': 'keywords'})
        plot_keywords = '|'.join(key.text for key in keywords_span)

        # Number of faces detected on the poster image; 0 when the page has
        # no poster image.
        poster_imgs = msoup.find_all('img', attrs={'itemprop': 'image'})
        if poster_imgs:
            poster_img_url = poster_imgs[0]['src']
            result = CF.face.detect(poster_img_url)
            facenumber_in_poster = len(result)
        else:
            facenumber_in_poster = 0

        new_entry = [director_name, duration, actor_2_name, gross, genres, actor_1_name, movie_title, actor_3_name, facenumber_in_poster, plot_keywords, budget, title_year, imdb_score]
        csv_writer.writerow(new_entry)

        # Pause between movies to avoid hammering the remote services.
        time.sleep(2)

    return link_tag

# url_base = 'http://www.imdb.com/search/title' 
# next_page = '?year=2015,2015&title_type=feature&sort=boxoffice_gross_us,desc'
# stop_ct = 0
# while True: 
#     r = urllib.request.urlopen(url_base + next_page).read()
#     soup = BeautifulSoup(r, 'html.parser')
#     tag = scrapeData(soup)
#     stop_ct += 1
#     print('Retrieving next page of movies')
#     if tag is None and stop_ct <= 20: 
#         print('Done')
#         break
#     else:
#         next_page = tag['href']

# url_base = 'http://www.imdb.com/search/title' 
# next_page = '?year=2014,2014&title_type=feature&sort=boxoffice_gross_us,desc'
# stop_ct = 0
# while True: 
#     r = urllib.request.urlopen(url_base + next_page).read()
#     soup = BeautifulSoup(r, 'html.parser')
#     tag = scrapeData(soup)
#     stop_ct += 1
#     print('Retrieving next page of movies')
#     if tag is None and stop_ct <= 20: 
#         print('Done')
#         break
#     else:
#         next_page = tag['href']

# url_base = 'http://www.imdb.com/search/title' 
# next_page = '?year=2013,2013&title_type=feature&sort=boxoffice_gross_us,desc'
# stop_ct = 0
# while True: 
#     r = urllib.request.urlopen(url_base + next_page).read()
#     soup = BeautifulSoup(r, 'html.parser')
#     tag = scrapeData(soup)
#     stop_ct += 1
#     print('Retrieving next page of movies')
#     if tag is None and stop_ct <= 20: 
#         print('Done')
#         break
#     else:
#         next_page = tag['href']

# Scrape the 2012 feature films, following the 'next page' link of each
# results page until a page has no such link.
url_base = 'http://www.imdb.com/search/title'
next_page = '?year=2012,2012&title_type=feature&sort=boxoffice_gross_us,desc'
stop_ct = 0  # number of result pages processed so far
try:
    while True:
        with urllib.request.urlopen(url_base + next_page) as response:
            r = response.read()
        soup = BeautifulSoup(r, 'html.parser')
        tag = scrapeData(soup)
        stop_ct += 1
        # Stop whenever there is no next-page link. The previous condition
        # (`tag is None and stop_ct <= 20`) fell through to `tag['href']`
        # with tag == None once more than 20 pages had been processed.
        if tag is None:
            print('Done')
            break
        print('Retrieving next page of movies')
        next_page = tag['href']
finally:
    # Flush and close the output CSV even if the run fails or is interrupted,
    # so that the rows written so far are not lost.
    csv_file.close()

Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies
Retrieving next page of movies


KeyboardInterrupt: 

In [82]:
'''
This script is responsible for cleaning and merging the 
scraped movie data with the existing movie data set
'''
# Helper function to delete rows with empty cells for any column in movie dataframe    
def deleteEmptyCol(movie_df):
    """Remove, in place, every row of `movie_df` that has an empty cell.

    Empty strings are treated as missing values. After the rows are removed
    the index is renumbered 0..n-1.

    Parameters:
        movie_df: the DataFrame to clean; it is modified in place.

    Returns:
        The same (modified) DataFrame, for convenience. Callers that ignore
        the return value still see the cleaned frame.
    """
    # Frame-level in-place operations are used on purpose: the chained form
    # `movie_df[c].replace(..., inplace=True)` acts on a temporary Series
    # under pandas copy-on-write and would leave the frame unchanged.
    movie_df.replace('', np.nan, inplace=True)
    movie_df.dropna(inplace=True)
    # Reset in place; rebinding the local name would not reach the caller.
    movie_df.reset_index(drop=True, inplace=True)
    return movie_df

# Helper function to remove movies that are not in the top 5 genres
def topGenres(movie_df):
    """Remove, in place, the movies that are in none of the top 5 genres.

    The 'genres' column holds '|'-separated genre names. Each name is
    stripped of surrounding whitespace (the scraper in this file emits
    ' |' separators), empty names are ignored, and non-string cells are
    treated as having no genre. The five genres that occur most often are
    kept; ties between equal counts are broken by genre name, descending.

    Parameters:
        movie_df: DataFrame with a 'genres' column; it is modified in place
            and its index is renumbered 0..n-1.

    Returns:
        The same (modified) DataFrame. Callers that ignore the return value
        still see the filtered frame.
    """
    # Renumber first so the rows can be dropped by label even when the
    # incoming index has duplicate labels (as pd.concat output does).
    movie_df.reset_index(drop=True, inplace=True)

    genre_lists = []
    for value in movie_df['genres']:
        if isinstance(value, str):
            names = [name.strip() for name in value.split('|')]
            genre_lists.append([name for name in names if name])
        else:
            genre_lists.append([])

    # Count number of movies that belong to each genre
    genres_ct = {}
    for g in genre_lists:
        for item in g:
            genres_ct[item] = genres_ct.get(item, 0) + 1

    # Get the top 5 genres
    sortedCounts = sorted(((count, name) for name, count in genres_ct.items()), reverse=True)
    topg_set = {name for _, name in sortedCounts[:5]}

    # Row labels that share no genre with the top 5 and must be dropped.
    drop = [pos for pos, g in enumerate(genre_lists) if topg_set.isdisjoint(g)]
    movie_df.drop(index=drop, inplace=True)
    movie_df.reset_index(drop=True, inplace=True)
    return movie_df
    
# Load each yearly scrape (newest first) and remove its incomplete rows.
frames = []
for scrape_path in ('movie_scrape_2015.csv', 'movie_scrape_2014.csv',
                    'movie_scrape_2013.csv', 'movie_scrape_2012.csv'):
    yearly_df = pd.read_csv(scrape_path)
    deleteEmptyCol(yearly_df)
    frames.append(yearly_df)

# The cleaned Kaggle data set goes last in the merge order.
kaggle_movie_df = pd.read_csv('movie_clean_metadata.csv')
frames.append(kaggle_movie_df)

scraped_merge = pd.concat(frames)
print(len(scraped_merge))
# 'Unnamed: 0' is the index column that was saved with the Kaggle CSV.
scraped_merge = scraped_merge.drop(columns='Unnamed: 0')

topGenres(scraped_merge)
scraped_merge = scraped_merge.reset_index(drop=True)

scraped_merge.to_csv('movie_master_dataset')

4487
