In [1]:
from imdb import IMDb
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Create the IMDb access object: 'http' access system, adult-title search disabled.
ia = IMDb(
    accessSystem='http',
    adultSearch=False,
)

In [3]:
# URL of the Box Office Mojo franchise index page.
# NOTE: despite its name, `html` holds the URL, not page markup; the name is
# kept unchanged in case other cells reference it.
html = 'http://www.boxofficemojo.com/franchises/?view=Franchise&p=.htm'

# requests has no default timeout, so without one this cell can hang forever on
# an unresponsive server.
r = requests.get(html, timeout=30)
# Fail loudly on a 4xx/5xx response instead of letting later cells parse an
# error page as if it were the franchise table.
r.raise_for_status()


In [4]:
# Raw response body (bytes) and its parsed tree, built with the lxml parser.
page_source = r.content
page_soup = bs(markup=page_source, features="lxml")

In [5]:
# Collect every <table> element in the page and report how many there are.
all_tables = page_soup.find_all('table')
print("The number of tables in the document is ", len(all_tables))

The number of tables in the document is  4


In [6]:
# The table of interest is declared as:
#   <table border="0" cellspacing="1" cellpadding="5" width="95%">
# Count the tables matching each attribute value to see which attribute
# singles it out.
no_cell_space = len(page_soup.find_all('table', cellspacing='1'))
no_cell_pad = len(page_soup.find_all('table', cellpadding='5'))
no_border = len(page_soup.find_all('table', border='0'))

print("The number of tables in the document with this cellspacing is ", no_cell_space)
print( "The number of tables in the document with this cellpadding is ", no_cell_pad)
print( "The number of tables in the document with this border is ", no_border)

The number of tables in the document with this cellspacing is  1
The number of tables in the document with this cellpadding is  1
The number of tables in the document with this border is  4


In [7]:
# Take the first table whose cellspacing attribute is "1".
candidate_tables = page_soup.find_all('table', attrs={'cellspacing': '1'})
table = candidate_tables[0]

In [8]:
# Sanity check: print the text of every cell in the table's first <tr>
# (the header row).
data_row = table.find_all('tr')[0]
data_cols = data_row.find_all('td')  # all <td> cells of that row
for cell in data_cols:
    print(cell.get_text())

Franchise (click to view chart)
Total Gross
# Movies / Average
#1 Picture
Gross


In [9]:
# Sanity check: print the text of every cell in the <tr> at index 5
# (index 0 is the header row).
table_rows = table.find_all('tr')
data_row = table_rows[5]
data_cols = data_row.find_all('td')  # all <td> cells of that row
for cell in data_cols:
    print(cell.get_text())

Alice in Wonderland
$411.2
2
$205.6
Alice in Wonderland
$334.2


In [10]:
# Column names for the scraped data, in the order the cells appear in a row.
output_columns = ['Franchise', 'Total Gross','Total Movies', 'Average Revenue', 'Top Movie', 'Top Gross']
# One empty list per column; cell texts are appended to these row by row.
output = {name: [] for name in output_columns}

# Skip the first <tr> because it is the header row.
all_rows = table.find_all('tr')[1:]
for tr in all_rows:
    cells = tr.find_all('td')
    # Pair each column name with the cell in the same position; zip stops at
    # the shorter of the two sequences.
    for name, cell in zip(output_columns, cells):
        output[name].append(cell.text)

# Build the DataFrame with its columns in the order given by output_columns.
output_pd = pd.DataFrame(output, columns=output_columns)

In [11]:
# Collect the link targets (href values) found in each data row.
# NOTE: this rebinds output_columns to the two ID column names.
output_columns = ['FranchiseID','MovieID']
IDoutput = {name: [] for name in output_columns}

# Skip the first <tr> because it is the header row.
all_rows = table.find_all('tr')[1:]
for tr in all_rows:
    links = tr.find_all('a', href=True)  # only <a> tags that carry an href
    # The first two links of a row are stored as FranchiseID and MovieID,
    # in that order; zip ignores any further links.
    for name, link in zip(output_columns, links):
        IDoutput[name].append(link['href'])

# Build the DataFrame with its columns in the order given by output_columns.
IDoutput_pd = pd.DataFrame(IDoutput, columns=output_columns)

In [12]:
# Attach the scraped link lists to the main frame as two new columns.
for id_col in ('FranchiseID', 'MovieID'):
    output_pd[id_col] = IDoutput[id_col]

In [13]:
# Convert the scraped text columns into numeric dtypes.
def _money_to_float(column):
    """Convert a Series of scraped money strings to a float Series.

    Each value has '$' stripped from both ends and ',' separators removed.
    A 'k' is rewritten as 'e-3', so e.g. '500k' parses as 0.5 (presumably
    because the other values are quoted in millions; TODO confirm).

    Values are cast to str first, so the conversion also accepts a column
    that is already numeric. That makes this cell safe to re-run.
    """
    return (
        column.astype(str)
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .str.replace('k', 'e-3', regex=False)
        .astype(float)
    )


# NumPy arrays of the cleaned values, under the same names as before.
totalmovies = np.array(output_pd['Total Movies'], dtype=int)
totalgross = _money_to_float(output_pd['Total Gross']).to_numpy()
avgrev = _money_to_float(output_pd['Average Revenue']).to_numpy()
topgross = _money_to_float(output_pd['Top Gross']).to_numpy()

# Remove '.' characters from both ends of each franchise link.
output_pd['FranchiseID'] = output_pd['FranchiseID'].str.strip('.')

# Write the numeric columns back with whole-column assignments rather than
# element-wise writes through a column view (chained assignment).
output_pd['Total Movies'] = totalmovies
output_pd['Total Gross'] = totalgross
output_pd['Average Revenue'] = avgrev
output_pd['Top Gross'] = topgross

In [17]:
# Keep only franchises whose average revenue exceeds 100 and that contain
# more than one movie.
high_average = output_pd['Average Revenue'] > 100
multiple_movies = output_pd['Total Movies'] > 1
df_pop = output_pd[high_average & multiple_movies]

In [18]:
# Show the filtered franchises; as the last expression of the cell, the frame
# is rendered as the cell's output.
df_pop

Unnamed: 0,Franchise,Total Gross,Total Movies,Average Revenue,Top Movie,Top Gross,FranchiseID,MovieID
1,300,317.2,2,158.6,300,210.6,/chart/?id=300.htm,/movies/?id=300.htm
4,Alice in Wonderland,411.2,2,205.6,Alice in Wonderland,334.2,/chart/?id=aliceinwonderland.htm,/movies/?id=aliceinwonderland10.htm
6,Alvin and the Chipmunks,662.7,5,132.5,Alvin and the Chipmunks 2,219.6,/chart/?id=alvinseries.htm,/movies/?id=alvinandthechipmunksii.htm
8,American Pie,409.2,4,102.3,American Pie 2,145.1,/chart/?id=americanpie.htm,/movies/?id=americanpie2.htm
15,Austin Powers,473.2,3,157.7,Goldmember,213.3,/chart/?id=austinpowers.htm,/movies/?id=austinpowers3.htm
17,Avengers,1755.3,3,585.1,Avengers: Infinity War,672.9,/chart/?id=avengersfranchise.htm,/movies/?id=marvel0518.htm
18,Back to the Future,416.8,3,138.9,Back to the Future,210.6,/chart/?id=backtothefuture.htm,/movies/?id=backtothefuture.htm
19,Bad Boys,204.4,2,102.2,Bad Boys II,138.6,/chart/?id=badboys.htm,/movies/?id=badboys2.htm
23,Batman,2407.7,11,218.9,The Dark Knight,533.3,/chart/?id=batman.htm,/movies/?id=darkknight.htm
27,Beverly Hills Cop,431.0,3,143.7,Beverly Hills Cop,234.8,/chart/?id=beverlyhillscop.htm,/movies/?id=beverlyhillscop.htm


In [19]:
# Save the filtered franchises to CSV in the working directory. The row index
# is written too (pandas default) and becomes the first, unnamed column.
df_pop.to_csv(path_or_buf='Franchise_dataset.csv', index=True)