# Project Luther

*Project 2 of Metis Data Science Bootcamp.* Problem statement below:  

Using information we scrape from the web, can we build linear regression models from which we can learn about the movie industry?

**I chose to focus on movies "Based on a true story" to determine which features predict success, as measured by Worldwide Gross.**

*This notebook shows the execution of web scraping for IMDB.com, Boxofficemojo.com, and The-Numbers.com with Beautiful Soup.*

### Web Scraping Movie Data From IMDB.com:

In [10]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import numpy as np

In [11]:
def read_url(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'lxml' parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within *timeout* seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error page and
    # letting the scrapers silently collect nothing from it.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

Acquire all page URLs for top 800 movies in "Based on a True Story" category:

In [43]:
# One search-results URL per page number, for pages 1 through 12 of the
# "based-on-true-story" keyword listing (sorted by moviemeter, ascending).
url_list = [
    'http://www.imdb.com/search/keyword?keywords=based-on-true-story&mode=advanced&page={}&ref_=kw_nxt&sort=moviemeter,asc'.format(page)
    for page in range(1, 13)
]

Collect IMDB Scores:

In [44]:
# IMDB score for every listed movie, NaN where no score can be read.
# Exactly one entry is appended per "lister-item-content" div so this list
# stays the same length as the other per-movie lists built from the same pages.
i_scores = []

for url in url_list:
    for item in read_url(url).find_all("div", {"class": "lister-item-content"}):
        score = np.nan
        first_div = item.find('div')
        # The score is only read when the item's first <div> is the ratings bar.
        if first_div is not None and first_div.get('class') == ['ratings-bar']:
            strong = first_div.find('strong')
            if strong is not None:
                try:
                    score = float(strong.get_text())
                except ValueError:
                    # Text that is not a number: keep the NaN placeholder.
                    pass
        i_scores.append(score)

Collecting Run Times:

In [45]:
# Runtime (digits of the runtime span, as an int) for every "text-muted"
# paragraph that contains at least one <span>; NaN when no runtime span is
# found in the expected position.
runtimes = []

for url in url_list:
    for para in read_url(url).find_all("p", {"class": "text-muted"}):
        spans = para.find_all('span')
        if not spans:
            # Paragraphs without any <span> contribute no entry at all.
            continue

        first_class = spans[0].get('class')
        if (first_class == ['certificate'] and len(spans) > 2
                and spans[2].get('class') != ['genre']):
            # Certificate first: the runtime is read from the third span,
            # unless that span is the genre (i.e. no runtime is listed).
            runtime_span = spans[2]
        elif first_class == ['runtime']:
            runtime_span = spans[0]
        else:
            runtime_span = None

        if runtime_span is None:
            runtimes.append(np.nan)
            continue

        # ''.join(...) makes the digit filter work on Python 3 (where filter
        # returns an iterator) as well as on Python 2 (where it returns a str).
        digits = ''.join(filter(str.isdigit, str(runtime_span.get_text())))
        runtimes.append(int(digits) if digits else np.nan)

Collect ratings:

In [46]:
# Certificate text for every "text-muted" paragraph that contains a <span>;
# NaN when the paragraph's first span is not the certificate.
ratings = []

for page_url in url_list:
    for para in read_url(page_url).find_all("p", {"class": "text-muted"}):
        lead_span = para.find('span')
        if lead_span is None:
            # Paragraphs without any <span> contribute no entry.
            continue
        if lead_span['class'] == ['certificate']:
            certificate = para.find('span', {'class': 'certificate'})
            ratings.append(str(certificate.get_text()))
        else:
            ratings.append(np.nan)

Collecting genres:

In [47]:
# Text of every <span class="genre"> on every results page, with newlines
# removed and surrounding whitespace stripped.
genres = []
for page_url in url_list:
    genre_spans = read_url(page_url).find_all('span', {"class": "genre"})
    genres.extend(
        str(span.get_text()).replace('\n', '').strip() for span in genre_spans
    )

Create categorical variable for each genre where 1 = Movie contained this genre, 0 = Movie did not contain this genre

In [48]:
# Genres that get their own 0/1 indicator column.
gset=['Action','Adventure','Biography','Comedy','Crime','Documentary','Drama','Family','History','Horror',
      'Music','Mystery','Romance','Sport','Thriller','War','Western']
# One independent, initially empty list per genre; each grows by one entry
# for every string in `genres`.
Action,Adventure,Biography,Comedy,Crime,Documentary,Drama,Family,History,Horror,Music,Mystery,Romance,Sport,Thriller,War,Western=([] for _ in range(len(gset)))

glist=[Action,Adventure,Biography,Comedy,Crime,Documentary,Drama,Family,History,Horror,Music,Mystery,Romance,Sport,Thriller,War,Western]
for g in genres:
    # Break the genre string into whole words (commas and whitespace are
    # separators) and compare names exactly. A plain substring test would
    # flag 'Music' for a movie whose genre is 'Musical'.
    present = set(g.replace(',', ' ').split())
    # gset and glist are parallel, so zip pairs each name with its list.
    for name, flags in zip(gset, glist):
        flags.append(1 if name in present else 0)

Collecting movie titles:

In [49]:
# Link text of every <a> inside each "lister-item-header" <h3>, encoded to
# UTF-8 and passed through str().
# NOTE(review): str() of encoded bytes is a plain string only on Python 2; on
# Python 3 it yields the b'...' repr - confirm the target interpreter.
titles = []
for page_url in url_list:
    for header in read_url(page_url).find_all('h3', {"class": "lister-item-header"}):
        titles.extend(
            str(anchor.get_text().encode('utf-8')) for anchor in header.find_all('a')
        )

Combine into dataframe:

In [50]:
# Column order of the final frame: descriptive fields first, then one 0/1
# indicator column per genre.
imdb_columns = ['Title', 'Rating', 'Runtime', 'IMDB_Score',
                'Action', 'Adventure', 'Biography', 'Comedy', 'Crime',
                'Documentary', 'Drama', 'Family', 'History', 'Horror',
                'Music', 'Mystery', 'Romance', 'Sport', 'Thriller',
                'War', 'Western']
imdb_data = {
    "Title": titles, 'Rating': ratings, 'Runtime': runtimes, 'IMDB_Score': i_scores,
    'Action': Action, 'Adventure': Adventure, 'Biography': Biography,
    'Comedy': Comedy, 'Crime': Crime, 'Documentary': Documentary,
    'Drama': Drama, 'Family': Family, 'History': History, 'Horror': Horror,
    'Music': Music, 'Mystery': Mystery, 'Romance': Romance, 'Sport': Sport,
    'Thriller': Thriller, 'War': War, 'Western': Western,
}
imdb_movies = pd.DataFrame(imdb_data)
imdb_movies = imdb_movies[imdb_columns]

# Replace movie rating with value 1-4 based on typical box office performance.
raw_ratings = imdb_movies['Rating']
# Everything that is not one of the four coded certificates becomes NaN.
uncoded = [r for r in raw_ratings if r not in ['G', 'PG', 'PG-13', 'R']]
coded = raw_ratings
for certificate, code in [('G', 1), ('PG', 2), ('PG-13', 4), ('R', 3)]:
    coded = coded.replace(certificate, code)
imdb_movies['Rating'] = coded.replace(uncoded, np.nan)

imdb_movies.head()

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Biography,Comedy,Crime,Documentary,...,Family,History,Horror,Music,Mystery,Romance,Sport,Thriller,War,Western
0,Spotlight,3,128,8.2,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,The Revenant,3,156,8.2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,The Danish Girl,3,119,7.0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,The Big Short,3,130,7.9,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Titanic,4,194,7.7,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Collecting Movies A-Z From BoxOfficeMojo.com

Generate all URLs:

In [52]:
# Number of alphabetical-index pages to request for each starting letter.
# Letters without an entry here (Q and X) get no URLs.
abc_dict={'A':9,'B':8,'C':7,'D':6,'E':7,'F':6,'G':7,'H':6,'I':6,'J':4,'K':4,'L':5,'M':7,'N':5,'O':5,'P':7,'R':5,'S':13,'T':8,'U':2,'V':3,'W':6,'Y':2,'Z':2}
# One URL per (letter, page) pair, pages numbered from 1.
abc_list = [
    'http://www.boxofficemojo.com/movies/alphabetical.htm?letter={}&page={}&p=.htm'.format(letter, page)
    for letter, page_count in abc_dict.items()
    for page in range(1, page_count + 1)
]

Collect Data for: Title, Total Gross, Total Theaters, Opening Gross, Opening Theaters, Date

In [53]:
# Parallel per-movie lists filled from the alphabetical index pages.
title,tot_gross,t_theaters,opening,o_theaters,date= ([] for _ in range(6))


def _theater_count(cell):
    """Return the digits of *cell*'s text as an int, or NaN if there are none."""
    try:
        # ''.join(...) makes the digit filter work on Python 3 (where filter
        # returns an iterator) as well as on Python 2 (where it returns a str).
        return int(''.join(filter(str.isdigit, str(cell.get_text()))))
    except ValueError:
        # No digits in the cell (int('') fails) or text str() cannot convert.
        return np.nan


for url in abc_list:
    # The listing is read from the fourth <table> on the page.
    table1=read_url(url).find_all('table')[3]
    rows=table1.find_all('tr')

    # rows[0] is skipped; every later row is read as one movie.
    for row in rows[1:]:
        cells = row.find_all('td')
        title.append(str(cells[0].find('a').get_text().encode('utf-8')))
        # Dollar amounts: strip '$', ',' (and '*' for the total); 'n/a' becomes 0.
        tot_gross.append(int(str(cells[2].get_text()).replace('$','').replace(',','').replace('*','').replace('n/a','0')))
        opening.append(int(str(cells[4].get_text()).replace('$','').replace(',','').replace('n/a','0')))
        date.append(str(cells[6].get_text()))
        t_theaters.append(_theater_count(cells[3]))
        o_theaters.append(_theater_count(cells[5]))

Create new variable "Primetime", indicating if a movie was released during a typically successful time of year (summer or holiday season).

In [54]:
# date2: each entry of `date` parsed as month/day/year, or '' when the text
# does not match that format.
# primetime: 1 if the parsed month is May, June, July, November or December,
# 0 for any other month, NaN when the date could not be parsed.
date2=[]
primetime=[]
for d in date:
    try:
        date2.append(datetime.strptime(d,'%m/%d/%Y'))
    except ValueError:
        # strptime raises ValueError for text that does not fit the format.
        date2.append('')
for e in date2:
    if isinstance(e, datetime):
        primetime.append(1 if e.month in (5,6,7,11,12) else 0)
    else:
        # The '' placeholder marks an unparseable date.
        primetime.append(np.nan)

Combine in dataframe:

In [55]:
# Assemble the BoxOfficeMojo lists into one frame, then put TotalGross last.
abc_columns = ['Title', 'TotalTheaters', 'OpeningGross', 'OpeningTheaters',
               'Primetime', 'TotalGross']
abc_data = {
    'Title': title,
    'TotalGross': tot_gross,
    'TotalTheaters': t_theaters,
    'OpeningGross': opening,
    'OpeningTheaters': o_theaters,
    'Primetime': primetime,
}
abc_movies = pd.DataFrame(abc_data)
abc_movies = abc_movies[abc_columns]
abc_movies.head()

Unnamed: 0,Title,TotalTheaters,OpeningGross,OpeningTheaters,Primetime,TotalGross
0,The A-Team,3544,25669455,3535,1,77222099
1,A.C.O.D.,42,19001,3,0,175705
2,A.I. Artificial Intelligence,3242,29352630,3242,1,78616689
3,Aaja Nachle,66,257500,66,1,484108
4,Aarakshan (Reservation),91,344661,91,0,651096


### Collecting data for Title, Budget, and Worldwide Gross from The-numbers.com

In [56]:
url = 'http://www.the-numbers.com/movie/budgets/all'

# Only the first <table> on the page is read. Its <td> cells are taken in
# groups of six: offset 2 is used for the title, offset 3 for the budget and
# offset 5 for the worldwide gross.
table1 = read_url(url).find_all('table')[0]
all_cells = table1.find_all('td')
m_list = all_cells[2::6]
budget = all_cells[3::6]
w_gross = all_cells[5::6]

# Title: text of the <a> nested inside the cell's <b>, encoded to UTF-8.
titles = [str(m.find('b').find('a').get_text().encode('utf-8')) for m in m_list]
# Dollar amounts: drop '$' and thousands separators before converting to int.
budgets = [int(str(b.get_text()).replace('$', '').replace(',', '')) for b in budget]
w_grosses = [int(str(w.get_text()).replace('$', '').replace(',', '')) for w in w_gross]

In [57]:
# One row per scraped The-Numbers entry: title, budget and worldwide gross.
budget_idx = pd.DataFrame({
    'Title': titles,
    'Budget': budgets,
    'WorldwideGross': w_grosses,
})

**Merge A-Z Movies from BoxOfficeMojo.com, IMDB.com "Based on a True Story" movies, and The-numbers.com movies by Title:**

In [58]:
# Inner joins on Title: keep only movies present in all three sources.
movies = (
    imdb_movies
    .merge(abc_movies, how='inner', on=['Title'])
    .merge(budget_idx, how='inner', on=['Title'])
)
# Discard any row that still has a missing value in any column.
movies.dropna(inplace=True)
movies.head()

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Biography,Comedy,Crime,Documentary,...,Thriller,War,Western,TotalTheaters,OpeningGross,OpeningTheaters,Primetime,TotalGross,Budget,WorldwideGross
0,The Revenant,3,156,8.2,0,1,0,0,0,0,...,1,0,0,3711,474560,4,1,177143137,135000000,430643137
1,The Big Short,3,130,7.9,0,0,1,1,0,0,...,0,0,0,2529,705527,8,1,69521861,28000000,127221861
2,Titanic,4,194,7.7,0,0,0,0,0,0,...,0,0,0,3265,28638131,2674,1,600788188,200000000,2207615668
3,Eddie the Eagle,4,106,7.6,0,0,1,1,0,0,...,0,0,0,2044,6084682,2042,0,11917804,23000000,11917804
4,The Wolf of Wall Street,3,180,8.2,0,0,1,1,1,0,...,0,0,0,2557,18361578,2537,1,116900694,100000000,391976723


In [64]:
# Save the merged frame to movies.csv in the working directory; with the
# default settings the row index is written as the first column.
movies.to_csv(path_or_buf='movies.csv')