# Imports

In [2]:
from __future__ import print_function, division
%autosave 120

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import requests
from bs4 import BeautifulSoup
import re

Autosaving every 120 seconds


# Scrape IMDB search of movies 'based on novels'
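I searched for the most popular "based-on-novel" feature films,   
released 1920-01-01 to 2017-04-01, with 1-9999999 votes,   
user ratings between 1.0 - 10.0, running time of 1 - 999999 minutes.   
**Note:** the URL requested by `imdb_index` below filters on `release_date=1955,2017-04-01` and `boxoffice_gross_us=1,9999999999`, and carries no vote, rating, or running-time filter, so it does not match the search described above.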

In [None]:
def imdb_index(a, b):
    """Download IMDB search-result pages and pickle each page's markup.

    For every page number in ``range(a, b)`` the search URL is requested,
    parsed with BeautifulSoup (lxml) and the resulting markup is pickled as a
    string to ``scrape_<num>.p`` in the current working directory.

    Parameters
    ----------
    a : int
        First page number to fetch (inclusive).
    b : int
        Page number to stop at (exclusive).

    Returns
    -------
    None
    """
    website = 'http://www.imdb.com/search/title?boxoffice_gross_us=1,9999999999&keywords=based-on-novel&release_date=1955,2017-04-01&title_type=feature&page=%d&ref_=adv_nxt'
    for num in range(a, b):
        url = website % num
        soup = BeautifulSoup(requests.get(url).text, "lxml")
        # Store the document markup itself. The earlier `str(soup.prettify)`
        # never called the method, so it pickled the bound-method repr
        # (the markup wrapped in "<bound method ... of ...>").
        # str(soup) is used rather than soup.prettify() so that no extra
        # whitespace is injected into the text the later parsing cells read.
        page_markup = str(soup)
        # Context manager guarantees the file is flushed and closed per page.
        with open("scrape_%s.p" % num, "wb") as handle:
            pickle.dump(page_markup, handle)

In [None]:
# Fetch and pickle search-result pages 1 through 200 (the end of the range is exclusive).
imdb_index(1, 201)

# 'Search' function: From files, scrape IMDBID, Ranks, Titles, Links, & Release

In [None]:
#when everything on the page works.

def one_frame(n, per_page=50):
    """Build a DataFrame of film listings from one pickled search-result page.

    Loads ``scrape_<n>.p``, parses the stored markup with BeautifulSoup (lxml)
    and reads the first ``per_page`` listings from it.

    Parameters
    ----------
    n : int or int-like
        Page number; passed through ``int()`` to build the file name.
    per_page : int, optional
        How many listings to read from the page. Defaults to 50, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per listing with columns ``ranks``, ``title``, ``imdbid``,
        ``links`` and ``release`` (in that order).

    Raises
    ------
    IndexError
        If any of the three span lists holds fewer than ``per_page`` entries.
    """
    number = int(n)
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this notebook.
    with open("scrape_%d.p" % number, "rb") as handle:
        soup = BeautifulSoup(pickle.load(handle), "lxml")

    # Run each document-wide query once instead of once per row.
    rank_spans = soup.find_all('span', {'class' : 'lister-item-index unbold text-primary'})
    year_spans = soup.find_all('span', {'class' : 'lister-item-year text-muted unbold'})
    rating_spans = soup.find_all('span', {'class' : 'userRatingValue'})

    df = pd.DataFrame()
    ranks = []
    title = []
    links = []
    imdbid = []
    release = []

    for num in range(per_page):
        rank_span = rank_spans[num]
        ranks.append(str(rank_span.text))
        release.append(str(year_spans[num].text))

        # Calling a tag is shorthand for find_all(); element [1] of the rank
        # span's parent supplies both the href and the title text.
        anchor = rank_span.parent()[1]
        links.append(anchor['href'])
        title.append(str(anchor.text))

        # NOTE(review): ratings are matched to listings purely by position;
        # a listing without a userRatingValue span would shift later ids.
        imdbid.append(rating_spans[num].get('data-tconst'))

    df['ranks'] = ranks
    df['title'] = title
    df['imdbid'] = imdbid
    df['links'] = links
    df['release'] = release
    return df

In [None]:
# Parse pickled pages 1-200 with one_frame and collect the per-page DataFrames in a list.
imdb_search = [] 
for number in range (1, 201):
    data_imdb = one_frame(int(number))
    imdb_search.append(data_imdb)

In [None]:
# Stack the per-page frames into one table. No ignore_index is passed, so each
# page keeps its own row labels and index values repeat across pages.
super_imdb = pd.concat(imdb_search)
# pickle.dump(super_imdb, open("super_imdb.p", "wb"))

# 'Metascore' function: From files, scrape IMDBID & Metascore

In [None]:
def mscore_frame(a, b):
    """Collect IMDB ids and metascores from a range of pickled result pages.

    For each page number in ``range(a, b)`` the file ``scrape_<number>.p`` is
    loaded and parsed, and every ``div`` with class
    ``inline-block ratings-metascore`` contributes one row.

    Parameters
    ----------
    a : int
        First page number to read (inclusive).
    b : int
        Page number to stop at (exclusive).

    Returns
    -------
    list of pandas.DataFrame
        One frame per page with columns ``imdbid`` and ``metascore``; a page
        with no metascore blocks yields an empty frame.
    """
    metacritic_list = []
    for number in range(a, b):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files produced by this notebook.
        with open("scrape_%d.p" % number, "rb") as handle:
            soup = BeautifulSoup(pickle.load(handle), "lxml")

        df_mscore = pd.DataFrame()
        imdbid = []
        metascore = []

        # Query the document once and reuse the result for every row.
        ratings = soup.find_all('div', {'class' : "inline-block ratings-metascore"})

        for rating in ratings:
            # Calling a tag is shorthand for find_all(): this lists every tag
            # under the metascore block's parent.
            siblings = rating.parent()
            # The last tag's text is taken as the score.
            metascore.append(str(siblings[-1].text))
            # The tag at position 4 is read for the title id.
            imdbid.append(siblings[4].get('data-tconst'))

        df_mscore['imdbid'] = imdbid
        df_mscore['metascore'] = metascore
        metacritic_list.append(df_mscore)

    return metacritic_list

In [None]:
# Extract metascores from pickled pages 1 through 200; the result is a list of per-page DataFrames.
total_score = mscore_frame(1, 201)

In [None]:
# Stack the per-page metascore frames into a single table.
super_meta = pd.concat(total_score)
# pickle.dump(super_meta, open("super_meta.p", "wb"))

##  Merge 'Search' df & 'Metascore' df, & clean.

In [1]:
# On a fresh kernel, super_imdb and super_meta only exist if the cells above
# were run or the two pickle loads below are re-enabled.
# super_imdb = pickle.load(open("super_imdb.p", "rb"))
# super_meta = pickle.load(open("super_meta.p", "rb"))
# Left join keeps every metascore row and attaches the search columns by imdbid.
lean_results = super_meta.merge(super_imdb, on='imdbid', how='left')
# pickle.dump(lean_results, open("lean_results.p", "wb"))

# 'AmazonID' Function: Using IMDBID, scrape AmazonID & Amazon link from IMDB literature pages

In [68]:
def amazon_frame(data, n):
    """Scrape the first Amazon link from the IMDB literature page of one row.

    The row at position ``n`` of ``data`` is selected with a one-row slice,
    its ``imdbid`` is inserted into the literature-page URL, and the first
    anchor whose ``href`` starts with ``http://www.amazon.com/exec/`` is
    recorded.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``imdbid`` column.
    n : int
        Position of the row to scrape; a position past the end yields an
        empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``imdbid``, ``amazon`` (the matched anchor tag object, not
        its href) and ``amazonid`` (the anchor's text). Both Amazon columns
        hold ``np.nan`` when the page has no matching link.
    """
    amz_df = pd.DataFrame()
    imdbid = []
    amazon = []
    amazonid = []

    website = 'http://www.imdb.com/title/%s/literature?ref_=tt_ql_dt_8'
    for ttid in data[int(n):int(n+1)]['imdbid']:
        url = website % ttid
        soup = BeautifulSoup(requests.get(url).text, "lxml")

        # Test for "no link" explicitly rather than with a bare except, so
        # unrelated failures (and Ctrl-C) are no longer silently turned into
        # NaN, and both lists always grow by exactly one entry per id.
        matches = soup.select('a[href^="http://www.amazon.com/exec/"]')
        if matches:
            first_link = matches[0]
            amazon.append(first_link)
            amazonid.append(first_link.text)
        else:
            amazon.append(np.nan)
            amazonid.append(np.nan)

        imdbid.append(ttid)

    amz_df['imdbid'] = imdbid
    amz_df['amazon'] = amazon
    amz_df['amazonid'] = amazonid
    return amz_df

In [None]:
# Accumulator for the per-row frames returned by amazon_frame; re-running this cell discards anything collected so far.
amazon_list = [] # empty list

In [69]:
# Scrape the literature page for every row of lean_results, one row per call.
# Positions start at 0: the earlier range(1, 1232) never visited the first row
# and relied on a hard-coded row count.
total_rows = len(lean_results)
for number in range(total_rows):
    print('Scraping page %d out of %d' % (number + 1, total_rows))
    amz_imdb = amazon_frame(lean_results, int(number))
    amazon_list.append(amz_imdb)
# If a page with an unexpected layout makes the function fail, skip that row and continue.

In [85]:
# Stack the per-row results, then drop every row containing NaN
# (amazon_frame writes NaN when no Amazon link was found).
data_amazon = pd.concat(amazon_list)
final_amazon = data_amazon.dropna()

##  Merge 'AmazonID' df with 'Search+Metascore' df

In [96]:
# Left join: keep every row that has an Amazon link and attach the search + metascore columns by imdbid.
almost_final = final_amazon.merge(lean_results, on='imdbid', how='left')

# 'Gross' function: Using IMDBID, scrape Gross Box Office from IMDB business pages

In [2]:
def _extract_gross(soup):
    """Return the text following the first "Gross" <h5>, or NaN if unavailable."""
    container = soup.find('div', {'id' : 'tn15content'})
    if container is None:
        return np.nan
    for heading in container.find_all('h5'):
        # `in` on a tag tests its direct children, so this matches an <h5>
        # that has the string "Gross" as one of its children.
        if "Gross" in heading:
            try:
                return str(heading.next_sibling.strip())
            except (AttributeError, TypeError, ValueError):
                # The sibling is missing, is a tag rather than text, or
                # cannot be converted to str.
                return np.nan
    return np.nan


def gross_frame(data, n):
    """Scrape the gross box-office text from the IMDB business page of one row.

    The row at position ``n`` of ``data`` is selected with a one-row slice and
    its ``imdbid`` is inserted into the business-page URL.

    Exactly one gross value is recorded per id. Previously a page with no
    "Gross" heading (or with several) left the ``gross`` list a different
    length from ``imdbid`` and the column assignment raised.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``imdbid`` column.
    n : int
        Position of the row to scrape; a position past the end yields an
        empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``imdbid`` and ``gross``; ``gross`` is the stripped text that
        follows the first "Gross" heading, or ``np.nan`` when none is found.
    """
    money_df = pd.DataFrame()
    imdbid = []
    gross = []

    website = 'http://www.imdb.com/title/%s/business?ref_=tt_dt_bus'
    for ttid in data[int(n):int(n+1)]['imdbid']:
        url = website % ttid
        soup = BeautifulSoup(requests.get(url).text, "lxml")

        gross.append(_extract_gross(soup))
        imdbid.append(ttid)

    money_df['imdbid'] = imdbid
    money_df['gross'] = gross
    return money_df

In [102]:
# Accumulator for the per-row frames returned by gross_frame; re-running this cell discards anything collected so far.
gross_list = [] # empty list

In [153]:
# Scrape the business page for every row of almost_final, one row per call.
# Positions start at 0: the earlier range(1, 646) never visited the first row
# and relied on a hard-coded row count.
total_rows = len(almost_final)
for number in range(total_rows):
    print('Scraping page %d out of %d' % (number + 1, total_rows))
    df = gross_frame(almost_final, int(number))
    gross_list.append(df)

Scraping page 645 out of 645


In [158]:
# Stack the per-row gross frames and drop rows where no gross figure was scraped (NaN).
gross_df = pd.concat(gross_list).dropna()

## Merge 'Gross' df with 'Search+Metascore+AmazonID' df

In [161]:
# Left join: keep every row with a gross figure and attach all previously merged columns by imdbid.
final_imdb = gross_df.merge(almost_final, on='imdbid', how='left')
# Remove any row that still has a missing value in any column.
final_imdb = final_imdb.dropna()
# final_imdb.to_pickle('final_imdb.p')

# Clean 

In [None]:
def feature_cleaner(feature):
    """Strip every non-digit character from one column of ``final_imdb``.

    Reads the notebook-level ``final_imdb`` table rather than taking it as an
    argument.

    Parameters
    ----------
    feature : str
        Name of the column whose string values are cleaned.

    Returns
    -------
    list of str
        One digits-only string per value, in the original order; a value
        with no digits becomes an empty string.
    """
    return [re.sub("[^0-9]", "", raw_value) for raw_value in final_imdb[feature]]

## Box Office & Metascore

In [12]:
# final_imdb = pickle.load(open("final_imdb.p", 'rb'))

In [15]:
# Derive a digits-only box_office column from the scraped gross text (still strings at this point).
final_imdb['box_office'] = feature_cleaner('gross')

In [16]:
# Cast the cleaned strings to 64-bit integers.
# NOTE(review): both casts raise if any value is not a valid integer string
# (e.g. an empty string left by feature_cleaner) -- confirm the data is clean.
final_imdb['box_office'] = final_imdb['box_office'].astype(np.int64)
final_imdb['metascore'] = final_imdb['metascore'].astype(np.int64)
# The amazon column holds scraped anchor tag objects; store their string form instead.
final_imdb['amazon'] = final_imdb['amazon'].astype(str)

## Clean release

In [None]:
# Keep only the digits of the release text; the result is still a string column.
final_imdb['release'] = feature_cleaner('release')

In [None]:
# Persist the cleaned table. The context manager closes the file so the pickle
# is fully flushed to disk before any later cell reads it.
with open("final_imdb.p", "wb") as handle:
    pickle.dump(final_imdb, handle)