In [91]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [144]:
#turns an encoded title into readable english
def decode(title_encoded):
    """Replace the known garbled character sequences in a scraped title.

    Args:
        title_encoded: Title text as pulled from the page, which may contain
            the garbled sequences listed in ``attr``.

    Returns:
        The title with every sequence in ``attr`` swapped for its plain
        replacement; all other text is returned unchanged.
    """
    #encoded meanings - the single place where substitutions are listed
    attr = {"â\x99": "'", "â\x80\x94": "-"}
    #replace the decoded portions, one known sequence at a time
    title = title_encoded
    for garbled, plain in attr.items():
        title = title.replace(garbled, plain)
    return title


In [145]:
#returns the full title of the movie
def get_movie_title(soup):
    """Extract the movie title from a parsed movie page.

    Args:
        soup: Parsed page exposing ``find`` (e.g. a BeautifulSoup object).

    Returns:
        The decoded text of the page's first ``<h1>`` with its last seven
        characters removed, or ``None`` when the page has no ``<h1>``.
    """
    #go to title section in html
    title_info = soup.find("h1")
    #no heading on the page - report a missing title instead of crashing,
    #the same way get_movie_budget reports a missing budget
    if title_info is None:
        return None
    #get the title - weird characters, encoded
    #the last 7 characters are dropped - presumably a " (YYYY)" year suffix; confirm on a live page
    title_encoded = title_info.get_text()[:-7]
    #decode the title to english
    return decode(title_encoded)
    
# title = get_movie_title(soup)
# title


In [146]:
#goes to movie page - returns the movie budget
def get_movie_budget(soup):
    """Look up the budget text on a parsed movie page.

    Args:
        soup: Parsed movie page exposing ``find_all`` (e.g. BeautifulSoup).

    Returns:
        The text of the node that follows the first ``<td>`` of the fourth
        table on the page, or ``None`` when that structure is not present.
    """
    #go to tables with info
    tables = soup.find_all('table')
    #there are multiple tables on page - some movies dont have a budget listed
    #go to table which has budget - get to html line with that info - get it
    try:
        movie_budget = tables[3].find("td").next_sibling.get_text()
    except (IndexError, AttributeError):
        #IndexError: fewer than four tables on the page
        #AttributeError: no <td> in that table, or nothing usable after it
        #anything else is a real problem and is allowed to surface
        movie_budget = None
    return movie_budget



# bud = get_movie_budget(soup)
# print(bud)

In [150]:
#takes in section of one movie at a time
#returns a tuple with the info of the movie (rank_in_year, movie_name, year, Genre, Gross, Budget)
def movie_info(web, movie_stats, year, timeout=30):
    """Collect the details of one movie from its row in the yearly table.

    Args:
        web: Base URL of the site; the row's link is appended to it.
        movie_stats: Parsed table row describing one movie.
        year: Year of the listing; copied unchanged into the result.
        timeout: Seconds to wait for the movie page. Without it a stalled
            connection would block the scrape indefinitely; with it
            requests raises a timeout error instead.

    Returns:
        Tuple ``(rank, title, year, genre, gross, budget)``. Rank, gross and
        genre are the strings found in the row; title and budget are read
        from the movie's own page (budget is ``None`` when not listed).
    """
    #go to section where necessary information is - take it
    data = movie_stats.find_all('td', class_='data')
    #get the rank and gross of movie
    movie_rank = data[0].get_text()
    movie_gross = data[1].get_text()
    #go to section where the links are
    link_data = movie_stats.find_all('a')
    #the last link in the row carries the genre
    movie_genre = link_data[-1].get_text()
    #the first link holds the suffix for the page where the movie budget is
    movie_info_link_suf = link_data[0]['href']
    #create the link to go into the movie page to grab the budget
    movie_url = "{}{}".format(web, movie_info_link_suf)

    #get the page with the full name and budget
    r = requests.get(movie_url, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')
    #get movie budget and title
    movie_budget = get_movie_budget(soup)
    movie_title = get_movie_title(soup)

    #return the tuple with movie information
    return (movie_rank, movie_title, year, movie_genre, movie_gross, movie_budget)

    
#movie_info(web, movie_stats_lst[1], 2018)
    

In [157]:
#get the info of each top-grossing movie listed for that year
def movies_of_year(web, year, timeout=30):
    """Scrape the top-grossing movies listed for one year.

    Args:
        web: Base URL of the site.
        year: Year whose ``/market/<year>/top-grossing-movies`` page is read.
        timeout: Seconds to wait for the listing page before requests gives
            up, so a stalled connection cannot hang the scrape.

    Returns:
        List of the tuples produced by ``movie_info``, in table order. At
        most 500 movies are taken (190 for 2019), fewer if the table is
        shorter than that.
    """
    #create url for year
    url = "{}/market/{}/top-grossing-movies".format(web, year)
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')
    #get to where the table information is
    movie_table = soup.find("table")
    movie_stats_lst = movie_table.find_all("tr")
    #2019 year doesn't have many movies listed yet
    max_movies = 190 if year == 2019 else 500
    #ignore the first row, then take up to max_movies rows
    #slicing (not indexing) means a shorter table yields what it has
    #instead of raising IndexError after every page was already fetched
    movie_rows = movie_stats_lst[1:max_movies + 1]
    #return a list of tuples with each tuple containing a movie info
    return [movie_info(web, movie_row, year) for movie_row in movie_rows]

# web = "https://www.the-numbers.com"
# test = movies_of_year(web, 2019)


In [158]:
#returns dictionary with key=year, value=list of movies info
def movies_of_years():
    """Scrape every year from 2019 back to 2015.

    Returns:
        Dict mapping each year to the list returned by ``movies_of_year``
        for that year; keys are inserted newest year first.
    """
    web = "https://www.the-numbers.com"
    #count down through the years, building one dict entry per year
    return {year: movies_of_year(web, year) for year in range(2019, 2014, -1)}
        
#run the full scrape - one request per listing page plus one per movie, so this cell is slow
year_movies = movies_of_years()

In [159]:
#inspect the list of movie tuples collected for 2019
year_movies[2019]

[('1', 'Captain Marvel', 2019, 'Action', '$323,646,033', '$152,000,000'),
 ('2',
  'How to Train Your Dragon: The Hidden World',
  2019,
  'Adventure',
  '$147,460,290',
  '$129,000,000'),
 ('3', 'Glass', 2019, 'Thriller/Suspense', '$110,861,630', '$20,000,000'),
 ('4', 'The Upside', 2019, 'Comedy', '$106,984,792', '$37,500,000'),
 ('5',
  'The LEGO Movie 2: The Second Part',
  2019,
  'Adventure',
  '$103,762,494',
  '$99,000,000'),
 ('6', 'Us', 2019, 'Thriller/Suspense', '$85,106,185', '$24,600,000'),
 ('7', 'Alita: Battle Angel', 2019, 'Action', '$83,956,461', None),
 ('8', 'Aquaman', 2019, 'Action', '$74,802,530', '$160,000,000'),
 ('9',
  "Tyler Perry's A Madea Family Funeral",
  2019,
  'Comedy',
  '$66,688,573',
  None),
 ('10',
  'Spider-Man: Into The Spider-Verse 3D',
  2019,
  'Adventure',
  '$55,923,345',
  '$90,000,000'),
 ('11', 'Bumblebee', 2019, 'Adventure', '$55,769,470', '$102,000,000'),
 ('12', 'What Men Want', 2019, 'Comedy', '$54,085,444', '$20,000,000'),
 ('13', 'G

In [160]:
#check which years ended up in the dictionary
year_movies.keys()

dict_keys([2019, 2018, 2017, 2016, 2015])

In [161]:
#creates a dataframe with the movie dictionary
def create_frame(year_movies):
    """Flatten the per-year movie dictionary into one DataFrame.

    Args:
        year_movies: Dict whose values are lists of movie tuples laid out as
            (rank, title, year, genre, gross, budget).

    Returns:
        DataFrame with one row per movie and the columns ``rank_in_year``,
        ``title``, ``year``, ``genre``, ``gross`` and ``budget``.
    """
    #column names, in the same order as the fields inside each movie tuple
    columns = ('rank_in_year', 'title', 'year', 'genre', 'gross', 'budget')
    #chain the yearly lists into a single list of movie tuples
    movies = [movie for movie_lst in year_movies.values() for movie in movie_lst]
    #pull each tuple position out into its own column list
    column_values = {
        name: [movie[position] for movie in movies]
        for position, name in enumerate(columns)
    }
    #returns movie dataframe
    return pd.DataFrame(column_values)

#build the dataframe from everything that was scraped
df = create_frame(year_movies)


In [162]:
#display the resulting dataframe
df

Unnamed: 0,rank_in_year,title,year,genre,gross,budget
0,1,Captain Marvel,2019,Action,"$323,646,033","$152,000,000"
1,2,How to Train Your Dragon: The Hidden World,2019,Adventure,"$147,460,290","$129,000,000"
2,3,Glass,2019,Thriller/Suspense,"$110,861,630","$20,000,000"
3,4,The Upside,2019,Comedy,"$106,984,792","$37,500,000"
4,5,The LEGO Movie 2: The Second Part,2019,Adventure,"$103,762,494","$99,000,000"
5,6,Us,2019,Thriller/Suspense,"$85,106,185","$24,600,000"
6,7,Alita: Battle Angel,2019,Action,"$83,956,461",
7,8,Aquaman,2019,Action,"$74,802,530","$160,000,000"
8,9,Tyler Perry's A Madea Family Funeral,2019,Comedy,"$66,688,573",
9,10,Spider-Man: Into The Spider-Verse 3D,2019,Adventure,"$55,923,345","$90,000,000"


In [163]:
#Website = the_numbers - turn movie info from dataframe to csv
#df.to_csv("the_numbers_scrap.csv", index=False)