# An Analysis of the IMDB Top 50 Best films from 2015 - 2019 (A 2 Part Jupyter Notebook)
## PART I - This Jupyter notebook contains:
##### * IMDB list Data Scraper built using beautifulsoup.
##### * Scraping of movie budgets from individual links
##### * Converting Scraped Data into a pandas Dataframe
##### * Data Exported to csv

## - Hasan Zafar

In [80]:
import pandas as pd
import numpy as np
import re
import lxml
import csv
import matplotlib

from bs4 import BeautifulSoup
from requests import get
%matplotlib inline

### List of URL's for "Top 50 Best Films of 20xx"

In [3]:
url= "https://www.imdb.com/list/ls041125816/"
url18= "https://www.imdb.com/list/ls021105452/"
url20= "https://www.imdb.com/list/ls093274866/"
url17= "https://www.imdb.com/list/ls062002489/"
url16= "https://www.imdb.com/list/ls031269618/"
url15= "https://www.imdb.com/list/ls071776440/"
suburl="https://www.imdb.com/title/tt6751668/"

In [4]:
#brings URL content
page = get(url)
soup = BeautifulSoup(page.content, 'lxml')

### Class IMDB that scrapes relevant data from the URL's

In [82]:
class IMDB(object):
    """docstring for IMDB"""
    def __init__(self, url):
        super(IMDB, self).__init__()
        page = get(url)
        
        self.soup = BeautifulSoup(page.content, 'lxml')

    def articleTitle(self):
        return self.soup.find("h1", class_="header").text.replace("\n","")

    def bodyContent(self):
        content = self.soup.find(id="main")
        return content.find_all("div", class_="lister-item mode-detail")        
        
    def movieData(self):
        movieFrame = self.bodyContent()
        movieTitle = []
        movieDate = []
        movieRunTime = []
        movieGenre = []
        movieRating = []
        movieDescription = []
        movieDirector = []
        movieStars = []
        movieVotes = []
        movieGross = []
        movieUrls = []
        for movie in movieFrame:
            movieFirstLine = movie.find("h3", class_="lister-item-header")
            movieUrl = movieFirstLine.find("a")
            sburl = "https://www.imdb.com"+movieUrl['href']
            movieUrls.append(sburl)
            movieTitle.append(movieFirstLine.find("a").text)
            movieDate.append(re.sub(r"[()]","", movieFirstLine.find_all("span")[-1].text))
            try:
                movieRunTime.append(movie.find("span", class_="runtime").text[:-4])
            except:
                movieRunTime.append(np.nan)
            movieGenre.append(movie.find("span", class_="genre").text.rstrip().replace("\n","").split(","))
            try:
                movieRating.append(movie.find("span", class_="ipl-rating-star__rating").text)
            except:
                movieRating.append(np.nan)

            movieDescription.append(movie.find_all("p",class_="")[0].text.rstrip().replace("\n",""))
            movieCast = movie.find_all("p", class_="text-muted")[1]

            try:
                cast = movieCast.text.replace("\n","").split('|')
                cast = [x.strip() for x in cast]
                cast = [cast[i].replace(j, "") for i,j in enumerate(["Director:", "Stars:"])]
                movieDirector.append(cast[0])
                movieStars.append([x.strip() for x in cast[1].split(",")])
            except:
                casts = movieCast.text.replace("\n","").strip()
                movieDirector.append(np.nan)
                movieStars.append([x.strip() for x in casts.split(",")])

            movieNumbers = movie.find_all("span", attrs={"name": "nv"})
            
            if len(movieNumbers) == 2:
                movieVotes.append(movieNumbers[0].text)
                movieGross.append(movieNumbers[1].text)
            else:
                movieVotes.append(movieNumbers[0].text)
                movieGross.append(np.nan)
                
        movieData = {'title': movieTitle, 'date': movieDate, 'runtime': movieRunTime, 'genre': movieGenre, 'rating': movieRating, 'description': movieDescription,
                            'director': movieDirector, 'stars': movieStars, 'votes': movieVotes, 'gross': movieGross, 'urls': movieUrls}
       # imdb = pd.DataFrame(movieData)
        return movieData

In [125]:
#Instance of a class with URL as parameter
id1 = IMDB(url16)

In [126]:
#Get Article Title
print(id1.articleTitle())

Top 50 Best Films of 2016


### Here we convert the data dictionary returned into a Pandas DataFrame

In [127]:
imdbdata = pd.DataFrame(id1.movieData())
imdbdata.head()

Unnamed: 0,title,date,runtime,genre,rating,description,director,stars,votes,gross,urls
0,Moonlight,I 2016,111,[Drama],7.4,A young African-American man grapples with...,Barry Jenkins,"[Mahershala Ali, Naomie Harris, Trevante Rhode...",253408,$27.85M,https://www.imdb.com/title/tt4975722/
1,Kimi no na wa.,2016,106,"[Animation, Drama, Fantasy]",8.4,Two strangers find themselves linked in a ...,Makoto Shinkai,"[Ryûnosuke Kamiki, Mone Kamishiraishi, Ryô Nar...",165482,$5.02M,https://www.imdb.com/title/tt5311514/
2,Julieta,2016,99,"[Drama, Mystery, Romance]",7.1,"After a casual encounter, a brokenhearted ...",Pedro Almodóvar,"[Emma Suárez, Adriana Ugarte, Daniel Grao, Inm...",27077,$1.49M,https://www.imdb.com/title/tt4326444/
3,A Monster Calls,2016,108,"[Adventure, Drama, Family]",7.5,A boy seeks the help of a tree monster to ...,J.A. Bayona,"[Lewis MacDougall, Sigourney Weaver, Felicity ...",76737,$3.73M,https://www.imdb.com/title/tt3416532/
4,Hell or High Water,II 2016,102,"[Action, Crime, Drama]",7.6,A divorced father and his ex-con older bro...,David Mackenzie,"[Chris Pine, Ben Foster, Jeff Bridges, Gil Bir...",191998,$26.86M,https://www.imdb.com/title/tt2582782/


### To extract budget data, each link had to be opened, and the following method allows us to do that

In [96]:
#Since we added budget to our data later, we are executing a separate method. Also because the rest of the data is scraped from TOP 50 list, but this one is from individual link pages.
def movieBudge(suburl):
    try:
        subpage = get(suburl)
        subsoup = BeautifulSoup(subpage.content, 'lxml')
        content = subsoup.find(id="main_bottom")
        moviedets =content.find("div", class_="article",id="titleDetails")
        budgetblock = moviedets.find_all("div", class_="txt-block")[6]
        budgetblk = budgetblock.text.split("\n")
        budget = budgetblk[1].split(":")
        moviebudget=budget[1]
        return moviebudget
    except:
        return None

#### Store budget data into a list, and then add that list to the dataframe

In [128]:
budget=[]
for url in imdbdata['urls']:
    budget.append(movieBudge(url))

In [129]:
imdbdata['budget'] = budget

In [130]:
imdbdata.head()

Unnamed: 0,title,date,runtime,genre,rating,description,director,stars,votes,gross,urls,budget
0,Moonlight,I 2016,111,[Drama],7.4,A young African-American man grapples with...,Barry Jenkins,"[Mahershala Ali, Naomie Harris, Trevante Rhode...",253408,$27.85M,https://www.imdb.com/title/tt4975722/,"$4,000,000"
1,Kimi no na wa.,2016,106,"[Animation, Drama, Fantasy]",8.4,Two strangers find themselves linked in a ...,Makoto Shinkai,"[Ryûnosuke Kamiki, Mone Kamishiraishi, Ryô Nar...",165482,$5.02M,https://www.imdb.com/title/tt5311514/,"$1,813,781,"
2,Julieta,2016,99,"[Drama, Mystery, Romance]",7.1,"After a casual encounter, a brokenhearted ...",Pedro Almodóvar,"[Emma Suárez, Adriana Ugarte, Daniel Grao, Inm...",27077,$1.49M,https://www.imdb.com/title/tt4326444/,"EUR1,350,000"
3,A Monster Calls,2016,108,"[Adventure, Drama, Family]",7.5,A boy seeks the help of a tree monster to ...,J.A. Bayona,"[Lewis MacDougall, Sigourney Weaver, Felicity ...",76737,$3.73M,https://www.imdb.com/title/tt3416532/,"$43,000,000"
4,Hell or High Water,II 2016,102,"[Action, Crime, Drama]",7.6,A divorced father and his ex-con older bro...,David Mackenzie,"[Chris Pine, Ben Foster, Jeff Bridges, Gil Bir...",191998,$26.86M,https://www.imdb.com/title/tt2582782/,"$12,000,000"


### Finally, to store the retrieved data into a csv file for further analysis, we use:

In [131]:
imdbdata.to_csv('d2016.csv',index=False)

### In this manner, will get csv files from 2015-19 and merge/clean/combine on excel