In [172]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [173]:
class BoxOfficeMojo:
  """Scraper for the weekly charts on boxofficemojo.com.

  The instance keeps a cursor (``actual_year``, ``actual_week``) that starts
  at week 1 of ``since_year``. Every call to ``get_next_url`` builds the URL
  for the cursor position and then moves the cursor one week forward.

  ``extract_and_export_data`` fetches ``(to_year - since_year) * weeks_per_year``
  pages, so ``to_year`` itself is NOT scraped (the range is half-open).

  NOTE(review): every year is assumed to have exactly ``weeks_per_year`` (52)
  weeks; if the site publishes a week 53 for some years it is skipped - confirm.
  """

  url_base = "https://www.boxofficemojo.com"
  url = ""
  # frequency of time [daily, weekend, weekly, monthly, quarterly, yearly]
  # NOTE(review): built_url always formats "<year>W<week>", which presumably
  # only matches the "weekly" pages - confirm before using another frequency.
  frequency = "weekly"
  since_year = 0
  to_year = 0
  actual_year = 0
  actual_week = 1 # number of weeks from 01 to 52
  weeks_per_year = 52
  request_timeout = 30 # seconds; keeps one stalled request from hanging the whole run
  query_order = "?sort=numWeeksInRelease&sortDir=asc&ref_=bo_wl__resort#table"
  # Column names of the exported CSV (kept verbatim: downstream readers may rely on them).
  labels = ['rank', 'rank last week', 'release', 'gross per week', 'gross change/week', 'theaters', 'number of theaters change', 'per theater average gross', 'total gross', 'number ow weeks in release', 'distributor', 'actual year', 'actual week', 'origin']
  # Empty record template: one key per label, with 'origin' pre-filled.
  json_schema = {**dict.fromkeys(labels, ''), 'origin': 'box-office-mojo'}

  def __init__(self, since_year, to_year):
    """Store the year range and place the cursor at ``since_year``.

    Args:
      since_year: first year to scrape.
      to_year: upper bound of the range (exclusive for extract_and_export_data).
    """
    self.since_year = since_year
    self.to_year = to_year
    self.actual_year = since_year

  def built_url(self):
    """Set ``self.url`` to the page for the current cursor (week zero-padded to 2 digits)."""
    self.url = f"{self.url_base}/{self.frequency}/{self.actual_year}W{self.actual_week:02d}/{self.query_order}"

  def get_next_url(self):
    """Return the URL for the current cursor position, then advance the cursor.

    Returns:
      The URL string, or "" when the cursor sits on the last week of ``to_year``
      (nothing left to visit). In that case the cursor is not moved.
    """
    if self.actual_week < self.weeks_per_year:
      self.built_url()
      self.actual_week += 1
    elif self.actual_year < self.to_year:
      # Last week of a year: emit it, then roll over to week 1 of the next year.
      self.built_url()
      self.actual_year += 1
      self.actual_week = 1
    else:
      self.url = ""

    return self.url

  @staticmethod
  def _parse_row(tr, year, week):
    """Turn one table row into a flat record tagged with the scraped year/week."""
    movie = []
    for td in tr:
      text = td.get_text().strip()
      # Whitespace-only nodes and the "true"/"false" cells carry no data.
      if text in ("", "true", "false"):
        continue
      # "-" is the site's placeholder for "no value"; store it as "0".
      movie.append("0" if text == "-" else text)

    movie.extend([year, week, 'box-office-mojo'])
    return movie

  def get_page_content(self):
    """Download the page at the cursor and return its "new this week" rows.

    Returns:
      A list of records (lists). Each record holds the non-empty cell texts of a
      row followed by the year, the week and the origin tag. Returns [] when the
      range is exhausted or the page contains no matching rows.
    """
    # Capture the period BEFORE get_next_url() advances the cursor; otherwise the
    # rows would be labelled with the following week (or year).
    year, week = self.actual_year, self.actual_week

    url = self.get_next_url()
    if not url:
      return []

    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=self.request_timeout)
    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    content = BeautifulSoup(page.content, "html.parser")
    tr_list = content.find_all("tr", {'class': 'mojo-annotation-isNewThisWeek'})
    # Drop the first matched row (the original author treats it as the label row).
    # Slicing instead of pop(0) keeps an empty page from raising IndexError.
    # NOTE(review): confirm the first match really is a header and not a movie.
    return [self._parse_row(tr, year, week) for tr in tr_list[1:]]

  def extract_and_export_data(self):
    """Scrape every week in [since_year, to_year) and export the result as CSV."""
    print('Initializing process')
    total_weeks = (self.to_year - self.since_year) * self.weeks_per_year

    movies_list = []
    for _ in range(total_weeks):
      print(f'Extracting data... Year:{self.actual_year} Week:{self.actual_week}')
      movies_list.extend(self.get_page_content())

    print('Data extracted!')
    self.export_to_csv_file(movies_list)

  def export_to_csv_file(self, movies_list):
    """Write ``movies_list`` to ``movies_<since_year>-<to_year>.csv`` in the working directory.

    Args:
      movies_list: list of records as produced by ``get_page_content``; each
        record is expected to have one value per entry in ``labels``.
    """
    print('Exporting data... ')
    movies_list_df = pd.DataFrame(movies_list, columns=self.labels)
    file_name = 'movies_' + str(self.since_year) + '-' + str(self.to_year) + '.csv'
    movies_list_df.to_csv(file_name, index=False, columns=self.labels)
    print('Data exported! \nProcess finalized!')

    



In [174]:
# Scraper for the 2012-2019 weekly charts: extract_and_export_data() fetches
# (to_year - since_year) * 52 pages, so to_year itself is not scraped.
mojo = BoxOfficeMojo(since_year=2012, to_year=2020)

In [175]:
# Runs the full scrape (one HTTP request per week, progress printed per week)
# and writes the result to movies_2012-2020.csv in the working directory.
mojo.extract_and_export_data()

Initializing process
Extracting data... Year:2012 Week:1
Extracting data... Year:2012 Week:2
Extracting data... Year:2012 Week:3
Extracting data... Year:2012 Week:4
Extracting data... Year:2012 Week:5
Extracting data... Year:2012 Week:6
Extracting data... Year:2012 Week:7
Extracting data... Year:2012 Week:8
Extracting data... Year:2012 Week:9
Extracting data... Year:2012 Week:10
Extracting data... Year:2012 Week:11
Extracting data... Year:2012 Week:12
Extracting data... Year:2012 Week:13
Extracting data... Year:2012 Week:14
Extracting data... Year:2012 Week:15
Extracting data... Year:2012 Week:16
Extracting data... Year:2012 Week:17
Extracting data... Year:2012 Week:18
Extracting data... Year:2012 Week:19
Extracting data... Year:2012 Week:20
Extracting data... Year:2012 Week:21
Extracting data... Year:2012 Week:22
Extracting data... Year:2012 Week:23
Extracting data... Year:2012 Week:24
Extracting data... Year:2012 Week:25
Extracting data... Year:2012 Week:26
Extracting data... Year:20