In [59]:
import itertools
import numpy as np
import pandas as pd 
from numbers import Number
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
import zipfile

In [60]:
# 1. Unzip the .db.zip file
with zipfile.ZipFile('zippedData/im.db.zip', 'r') as zip_ref:
    zip_ref.extractall('temp_folder')  
    # Extract the database to a temporary folder

# 2. Connect to the SQLite database
db_file = 'temp_folder/im.db'  
# 3. Path to the extracted SQLite database
conn = sqlite3.connect(db_file)


In [61]:
# Query to fetch table information from the schema
pd.read_sql('''SELECT name FROM sqlite_master WHERE type='table';''', conn)
    

Unnamed: 0,name
0,movie_basics
1,directors
2,known_for
3,movie_akas
4,movie_ratings
5,persons
6,principals
7,writers


In [62]:
df_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
df_info = pd.read_csv('zippedData/rt.movie_info.tsv.gz', sep='\t', encoding='latin')
df_reviews = pd.read_csv('zippedData/rt.reviews.tsv.gz', sep='\t', encoding='latin')
df_tmdb = pd.read_csv('zippedData/tmdb.movies.csv.gz')
df_budgets = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

In [63]:
df_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [64]:
df_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [65]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


In [66]:
df_gross.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [67]:
df_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [68]:
df_reviews.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [69]:
pd.read_sql('''
    SELECT *
    FROM movie_basics as mb
    INNER JOIN movie_ratings as mr
    ON mb.movie_id = mr.movie_id
    LIMIT 10;''', conn)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,movie_id.1,averagerating,numvotes
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",tt0063540,7.0,77
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",tt0066787,7.2,43
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,tt0069049,6.9,4517
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",tt0069204,6.1,13
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",tt0100275,6.5,119
5,tt0112502,Bigfoot,Bigfoot,2017,,"Horror,Thriller",tt0112502,4.1,32
6,tt0137204,Joe Finds Grace,Joe Finds Grace,2017,83.0,"Adventure,Animation,Comedy",tt0137204,8.1,263
7,tt0146592,Pál Adrienn,Pál Adrienn,2010,136.0,Drama,tt0146592,6.8,451
8,tt0154039,So Much for Justice!,Oda az igazság,2010,100.0,History,tt0154039,4.6,64
9,tt0159369,Cooper and Hemingway: The True Gen,Cooper and Hemingway: The True Gen,2013,180.0,Documentary,tt0159369,7.6,53


In [70]:
basics_ratings = pd.read_sql('''
    SELECT *
    FROM movie_basics as mb
    LEFT JOIN movie_ratings as mr
    ON mb.movie_id = mr.movie_id;''', conn).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
 6   movie_id         73856 non-null   object 
 7   averagerating    73856 non-null   float64
 8   numvotes         73856 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 10.0+ MB


In [71]:
from sqlalchemy import create_engine

# Step 1: Load data from SQL table into a Pandas DataFrame
engine = create_engine('sqlite:///temp_folder/im.db')  
sql_query = 'SELECT * FROM movie_basics as mb LEFT JOIN movie_ratings as mr ON mb.movie_id = mr.movie_id'
df_sql = pd.read_sql(sql_query, engine)

# Step 3: Combine data with different column names
# Specify the column names explicitly using left_on and right_on parameters
result_df_left = pd.merge(df_sql, df_gross, left_on='primary_title', right_on='title', how='left')
result_df_left.info()
result_df_left.to_csv('result_left.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146146 entries, 0 to 146145
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146146 non-null  object 
 1   primary_title    146146 non-null  object 
 2   original_title   146125 non-null  object 
 3   start_year       146146 non-null  int64  
 4   runtime_minutes  114407 non-null  float64
 5   genres           140738 non-null  object 
 6   movie_id         73858 non-null   object 
 7   averagerating    73858 non-null   float64
 8   numvotes         73858 non-null   float64
 9   title            3366 non-null    object 
 10  studio           3363 non-null    object 
 11  domestic_gross   3342 non-null    float64
 12  foreign_gross    2043 non-null    object 
 13  year             3366 non-null    float64
dtypes: float64(5), int64(1), object(8)
memory usage: 16.7+ MB


In [72]:
from sqlalchemy import create_engine

# Step 1: Load data from SQL table into a Pandas DataFrame
engine = create_engine('sqlite:///temp_folder/im.db')  # Replace with your database connection details
sql_query = 'SELECT * FROM movie_basics as mb LEFT JOIN movie_ratings as mr ON mb.movie_id = mr.movie_id'
df_sql = pd.read_sql(sql_query, engine)

# Step 3: Combine data with different column names
# Specify the column names explicitly using left_on and right_on parameters
result_df_inner = pd.merge(df_sql, df_gross, left_on='primary_title', right_on='title', how='inner')
result_df_inner.info()
result_df_inner.to_csv('result_inner.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3366 entries, 0 to 3365
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   movie_id         3366 non-null   object 
 1   primary_title    3366 non-null   object 
 2   original_title   3366 non-null   object 
 3   start_year       3366 non-null   int64  
 4   runtime_minutes  3198 non-null   float64
 5   genres           3326 non-null   object 
 6   movie_id         3027 non-null   object 
 7   averagerating    3027 non-null   float64
 8   numvotes         3027 non-null   float64
 9   title            3366 non-null   object 
 10  studio           3363 non-null   object 
 11  domestic_gross   3342 non-null   float64
 12  foreign_gross    2043 non-null   object 
 13  year             3366 non-null   int64  
dtypes: float64(4), int64(2), object(8)
memory usage: 394.5+ KB


In [73]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2023/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2023 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2023)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)


     Rank                   Release Group       Worldwide      Domestic  \
0       1                          Barbie  $1,440,380,000  $635,680,000   
1       2     The Super Mario Bros. Movie  $1,362,659,200  $574,934,330   
2       3                     Oppenheimer    $945,362,510  $324,042,510   
3       4  Guardians of the Galaxy Vol. 3    $845,555,777  $358,995,815   
4       5                          Fast X    $704,709,660  $145,960,660   
..    ...                             ...             ...           ...   
195   196                            2018      $4,547,765             -   
196   197               IM HERO THE FINAL      $4,514,656             -   
197   198                       Mari(dos)      $4,483,495             -   
198   199                  Weekend Rebels      $4,477,262             -   
199   200                       Maamannan      $4,463,992             -   

         %       Foreign    %.1  
0    44.1%  $804,700,000  55.9%  
1    42.2%  $787,724,870  57.8%

In [75]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2022/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2022 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2022)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                                Release Group       Worldwide  \
0       1                     Avatar: The Way of Water  $2,320,250,281   
1       2                            Top Gun: Maverick  $1,495,696,292   
2       3                      Jurassic World Dominion  $1,001,978,080   
3       4  Doctor Strange in the Multiverse of Madness    $955,775,804   
4       5                     Minions: The Rise of Gru    $939,628,210   
..    ...                                          ...             ...   
195   196                             Lesson in Murder      $7,348,964   
196   197                               Family Affairs      $7,330,755   
197   198                                Listy do M. 5      $7,328,061   
198   199                         Laid-Back Camp Movie      $7,317,913   
199   200        Osomatsusan the Movie 2022 Re-release      $7,297,522   

         Domestic      %         Foreign    %.1  
0    $684,075,767  29.5%  $1,636,174,514  70.5%  
1    $718,7

In [76]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2021/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2021 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2021)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                Release Group       Worldwide      Domestic      %  \
0       1      Spider-Man: No Way Home  $1,912,233,593  $804,793,477  42.1%   
1       2  The Battle at Lake Changjin    $902,548,476      $342,411  <0.1%   
2       3                      Hi, Mom    $822,009,764             -      -   
3       4               No Time to Die    $774,153,007  $160,891,007  20.8%   
4       5            F9: The Fast Saga    $726,229,501  $173,005,945  23.8%   
..    ...                          ...             ...           ...    ...   
195   196              The Mauritanian      $7,527,030      $836,536  11.1%   
196   197                 The Ice Road      $7,502,846             -      -   
197   198  Judas and the Black Messiah      $7,478,009    $5,478,009  73.3%   
198   199  Love Letter 2021 Re-release      $7,400,000             -      -   
199   200            Signal: The Movie      $7,388,603             -      -   

            Foreign    %.1  
0    $1,107,440,116  5

In [77]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2020/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2020 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2020)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                                      Release Group     Worldwide  \
0       1                                  The Eight Hundred  $461,421,559   
1       2  Demon Slayer: Kimetsu no Yaiba - The Movie: Mu...  $453,210,959   
2       3                                  Bad Boys for Life  $426,505,244   
3       4                             My People, My Homeland  $422,390,820   
4       5                                              Tenet  $365,304,105   
..    ...                                                ...           ...   
195   196                                              Panga    $4,886,124   
196   197                        The Tales for Old and Young    $4,853,143   
197   198                                          Streltsov    $4,850,073   
198   199                                    Persian Lessons    $4,849,240   
199   200             The SpongeBob Movie: Sponge on the Run    $4,810,790   

         Domestic      %       Foreign    %.1  
0        $372,7

In [78]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2019/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2019 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2019)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                                      Release Group       Worldwide  \
0       1                                  Avengers: Endgame  $2,799,439,100   
1       2                                      The Lion King  $1,656,943,394   
2       3                                          Frozen II  $1,450,026,933   
3       4                          Spider-Man: Far from Home  $1,131,927,996   
4       5                                     Captain Marvel  $1,128,274,794   
..    ...                                                ...             ...   
195   196                                       The Specials     $19,363,826   
196   197                                    Always Miss You     $19,015,465   
197   198                               An Officer and a Spy     $18,899,214   
198   199  Crayon Shin-chan: Honeymoon Hurricane - The Lo...     $18,738,951   
199   200                                              Greta     $18,653,107   

         Domestic      %         Foreig

In [79]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2018/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2018 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2018)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                   Release Group       Worldwide      Domestic  \
0       1          Avengers: Infinity War  $2,048,359,754  $678,815,482   
1       2                   Black Panther  $1,346,913,161  $700,059,566   
2       3  Jurassic World: Fallen Kingdom  $1,308,467,944  $417,719,760   
3       4                   Incredibles 2  $1,242,805,359  $608,581,744   
4       5                         Aquaman  $1,151,961,807  $335,061,807   
..    ...                             ...             ...           ...   
195   196                       Padmaavat     $22,991,060   $11,846,060   
196   197       Won't You Be My Neighbor?     $22,844,741   $22,835,787   
197   198                  Europe Raiders     $22,435,156             -   
198   199          Mojin: The Worm Valley     $22,381,583      $101,516   
199   200                           Stree     $22,075,730             -   

         %         Foreign    %.1  
0    33.1%  $1,369,544,272  66.9%  
1      52%    $646,853,595 

In [80]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2017/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2017 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2017)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                            Release Group       Worldwide  \
0       1  Star Wars: Episode VIII - The Last Jedi  $1,332,539,889   
1       2                     Beauty and the Beast  $1,263,521,126   
2       3                  The Fate of the Furious  $1,236,005,118   
3       4                          Despicable Me 3  $1,034,799,409   
4       5           Jumanji: Welcome to the Jungle    $962,077,546   
..    ...                                      ...             ...   
195   196                    Extraordinary Mission     $22,757,764   
196   197                         The Glass Castle     $22,088,533   
197   198                      Épouse-moi mon pote     $21,571,464   
198   199                               The Prison     $21,205,329   
199   200         One Hundred Thousand Bad Jokes 2     $20,460,352   

         Domestic      %         Foreign    %.1  
0    $620,181,382  46.5%    $712,358,507  53.5%  
1    $504,014,165  39.9%    $759,506,961  60.1%  
2    $226

In [81]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2016/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2016 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2016)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                              Release Group       Worldwide  \
0       1                 Captain America: Civil War  $1,153,296,293   
1       2               Rogue One: A Star Wars Story  $1,056,057,273   
2       3                               Finding Dory  $1,028,570,889   
3       4                                   Zootopia  $1,023,784,195   
4       5                            The Jungle Book    $966,550,600   
..    ...                                        ...             ...   
195   196  Middle School: The Worst Years of My Life     $23,316,139   
196   197                                   Triple 9     $23,177,948   
197   198                                   Rock Dog     $23,139,802   
198   199                                 The Choice     $23,064,015   
199   200                             Penny Pincher!     $22,955,486   

         Domestic      %       Foreign    %.1  
0    $408,084,349  35.4%  $745,211,944  64.6%  
1    $532,177,324  50.4%  $523,879,949 

In [82]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2015/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2015 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2015)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                                   Release Group       Worldwide  \
0       1      Star Wars: Episode VII - The Force Awakens  $2,068,223,624   
1       2                                  Jurassic World  $1,670,400,637   
2       3                                       Furious 7  $1,515,047,671   
3       4                         Avengers: Age of Ultron  $1,402,805,868   
4       5                                         Minions  $1,159,398,397   
..    ...                                             ...             ...   
195   196  Huevos: Little Rooster's Egg-cellent Adventure     $25,892,561   
196   197                          Dügün Dernek 2: Sünnet     $25,836,668   
197   198                                 Look Who's Back     $25,513,752   
198   199                               Serial Teachers 2     $25,364,150   
199   200                                       Wild City     $24,817,852   

         Domestic      %         Foreign    %.1  
0    $936,662,225  45.3% 

In [83]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2014/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2014 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2014)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                              Release Group       Worldwide  \
0       1            Transformers: Age of Extinction  $1,104,054,072   
1       2  The Hobbit: The Battle of the Five Armies    $956,019,788   
2       3                    Guardians of the Galaxy    $772,776,600   
3       4                                 Maleficent    $758,410,378   
4       5      The Hunger Games: Mockingjay - Part 1    $755,356,711   
..    ...                                        ...             ...   
195   196                          A Haunted House 2     $25,358,716   
196   197                             And So It Goes     $25,312,387   
197   198                             Kung Fu Jungle     $24,070,765   
198   199                                   Hot Road     $22,916,313   
199   200                              Fading Gigolo     $22,706,304   

         Domestic      %       Foreign    %.1  
0    $245,439,076  22.2%  $858,614,996  77.8%  
1    $255,119,788  26.7%  $700,900,000 

In [84]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2013/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2013 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2013)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                        Release Group       Worldwide      Domestic  \
0       1                               Frozen  $1,280,802,282  $400,738,009   
1       2                           Iron Man 3  $1,214,811,252  $409,013,994   
2       3                      Despicable Me 2    $970,766,005  $368,065,385   
3       4  The Hobbit: The Desolation of Smaug    $958,366,855  $258,366,855   
4       5      The Hunger Games: Catching Fire    $865,011,746  $424,668,047   
..    ...                                  ...             ...           ...   
195   196                     The Great Beauty     $24,350,615    $2,852,400   
196   197                               Trance     $24,261,569    $2,328,743   
197   198                      The Railway Man     $24,174,885    $4,438,438   
198   199                       Out of Inferno     $24,109,886             -   
199   200                 Bring Happiness Home     $23,980,000             -   

         %       Foreign    %.1  
0    

In [85]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2012/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2012 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2012)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                      Release Group       Worldwide      Domestic  \
0       1                       The Avengers  $1,518,812,988  $623,357,910   
1       2                            Skyfall  $1,108,561,013  $304,360,277   
2       3              The Dark Knight Rises  $1,084,939,099  $448,139,099   
3       4  The Hobbit: An Unexpected Journey  $1,017,003,568  $303,003,568   
4       5         Ice Age: Continental Drift    $877,244,782  $161,321,843   
..    ...                                ...             ...           ...   
195   196                            Unbowed     $22,132,903             -   
196   197               Confession of Murder     $21,701,525             -   
197   198      StreetDance 2 2012 Re-release     $21,638,853             -   
198   199               2012 2012 Re-release     $21,538,353             -   
199   200        Beasts of the Southern Wild     $21,107,746   $12,795,746   

         %       Foreign    %.1  
0      41%  $895,455,078    5

In [86]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2011/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2011 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2011)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                                 Release Group       Worldwide  \
0       1  Harry Potter and the Deathly Hallows: Part 2  $1,341,511,219   
1       2                Transformers: Dark of the Moon  $1,123,794,079   
2       3   Pirates of the Caribbean: On Stranger Tides  $1,045,713,802   
3       4     The Twilight Saga: Breaking Dawn - Part 1    $712,205,856   
4       5          Mission: Impossible - Ghost Protocol    $694,713,380   
..    ...                                           ...             ...   
195   196                                The Front Line     $20,629,645   
196   197                                       Polisse     $20,590,872   
197   198                                      Hollywoo     $20,587,850   
198   199                              Scabbard Samurai     $20,251,745   
199   200                 Kaiji 2: The Ultimate Gambler     $19,971,259   

         Domestic      %       Foreign    %.1  
0    $381,011,219  28.4%  $960,500,000  71.6%  
1  

In [87]:
# Define the URL of the website you want to scrape
url = "https://www.boxofficemojo.com/year/world/2010/"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the data you want to extract
    table = soup.find('table', {'class': 'a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated'})

    # Use Pandas to read the table into a DataFrame
    df_bom_webscraping_2010 = pd.read_html(str(table))[0]

    # Display the DataFrame
    print(df_bom_webscraping_2010)

    # Now you can work with the DataFrame as needed
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

     Rank                                 Release Group       Worldwide  \
0       1                                   Toy Story 3  $1,066,969,703   
1       2                           Alice in Wonderland  $1,025,467,110   
2       3  Harry Potter and the Deathly Hallows: Part 1    $976,536,918   
3       4                                     Inception    $828,258,695   
4       5                           Shrek Forever After    $752,600,867   
..    ...                                           ...             ...   
195   196                                   The Servant     $19,259,164   
196   197                                Norwegian Wood     $19,115,721   
197   198                               Head Over Heels     $18,876,082   
198   199                                  13 Assassins     $18,689,058   
199   200                    Green Zone 2010 Re-release     $18,495,045   

         Domestic      %       Foreign    %.1  
0    $415,004,880  38.9%  $651,964,823  61.1%  
1  

In [88]:
# Create an empty DataFrame to store the combined data
df_bom_webscraping = pd.DataFrame(columns=['Rank', 'Release Group', 'Worldwide', 'Domestic', '%', 'Foreign', '%.1'])

# List of DataFrames
dataframes = [df_bom_webscraping_2010, df_bom_webscraping_2011,
              df_bom_webscraping_2012, df_bom_webscraping_2013, 
              df_bom_webscraping_2014, df_bom_webscraping_2015, 
              df_bom_webscraping_2016, df_bom_webscraping_2017, 
              df_bom_webscraping_2018, df_bom_webscraping_2019, 
              df_bom_webscraping_2020, df_bom_webscraping_2021, 
              df_bom_webscraping_2022, df_bom_webscraping_2023]

# Loop through the list of DataFrames and concatenate them
df_bom_webscraping = pd.concat([df for df in dataframes], ignore_index=True)


df_bom_webscraping


Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1
0,1,Toy Story 3,"$1,066,969,703","$415,004,880",38.9%,"$651,964,823",61.1%
1,2,Alice in Wonderland,"$1,025,467,110","$334,191,110",32.6%,"$691,276,000",67.4%
2,3,Harry Potter and the Deathly Hallows: Part 1,"$976,536,918","$295,983,305",30.3%,"$680,553,613",69.7%
3,4,Inception,"$828,258,695","$292,576,195",35.3%,"$535,682,500",64.7%
4,5,Shrek Forever After,"$752,600,867","$238,736,787",31.7%,"$513,864,080",68.3%
...,...,...,...,...,...,...,...
2795,196,2018,"$4,547,765",-,-,"$4,547,765",100%
2796,197,IM HERO THE FINAL,"$4,514,656",-,-,"$4,514,656",100%
2797,198,Mari(dos),"$4,483,495",-,-,"$4,483,495",100%
2798,199,Weekend Rebels,"$4,477,262",-,-,"$4,477,262",100%


In [89]:
csv_file_path = 'df_boms_webscraping.csv'

# Save the DataFrame as a CSV file
df_bom_webscraping.to_csv(csv_file_path, index=False)

In [90]:
df_bom_webscraping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           2800 non-null   int64 
 1   Release Group  2800 non-null   object
 2   Worldwide      2800 non-null   object
 3   Domestic       2800 non-null   object
 4   %              2800 non-null   object
 5   Foreign        2800 non-null   object
 6   %.1            2800 non-null   object
dtypes: int64(1), object(6)
memory usage: 153.2+ KB


In [91]:
pip install html5lib

Note: you may need to restart the kernel to use updated packages.


In [92]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [93]:
pip install --upgrade selenium

Requirement already up-to-date: selenium in /Users/kariprimiano/anaconda3/envs/learn-env/lib/python3.8/site-packages (4.14.0)
Note: you may need to restart the kernel to use updated packages.


In [94]:
pip install webdriver_manager

Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Using cached urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
Collecting chardet<4,>=3.0.2
  Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Installing collected packages: urllib3, chardet
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.0.7
    Uninstalling urllib3-2.0.7:
      Successfully uninstalled urllib3-2.0.7
  Attempting uninstall: chardet
    Found existing installation: chardet 5.2.0
    Uninstalling chardet-5.2.0:
      Successfully uninstalled chardet-5.2.0
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

selenium 4.14.0 requires urllib3[socks]<3,>=1.26, but you'll have urllib3 1.25.11 which is incompatible.[0m
Successfully installed chardet-

In [95]:
pip install selenium webdriver_manager

Collecting urllib3[socks]<3,>=1.26
  Using cached urllib3-2.0.7-py3-none-any.whl (124 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.11
    Uninstalling urllib3-1.25.11:
      Successfully uninstalled urllib3-1.25.11
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

requests 2.24.0 requires urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1, but you'll have urllib3 2.0.7 which is incompatible.
botocore 1.18.16 requires urllib3<1.26,>=1.20, but you'll have urllib3 2.0.7 which is incompatible.[0m
Successfully installed urllib3-2.0.7
Note: you may need to restart the kernel to use updated packages.


In [96]:
pip install --upgrade urllib3 chardet

Requirement already up-to-date: urllib3 in /Users/kariprimiano/anaconda3/envs/learn-env/lib/python3.8/site-packages (2.0.7)
Collecting chardet
  Using cached chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
  Attempting uninstall: chardet
    Found existing installation: chardet 3.0.4
    Uninstalling chardet-3.0.4:
      Successfully uninstalled chardet-3.0.4
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

requests 2.24.0 requires chardet<4,>=3.0.2, but you'll have chardet 5.2.0 which is incompatible.
requests 2.24.0 requires urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1, but you'll have urllib3 2.0.7 which is incompatible.
aiohttp 3.6.2 requires chardet<4.0,>=2.0, but you'll have chardet 5.2.0 which is incompatib

In [97]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Initialize the Chrome WebDriver using the manager
driver = webdriver.Chrome(ChromeDriverManager().install())

# Specify the URL of the webpage
url = "https://www.the-numbers.com/movie/budgets/all"

# Load the webpage
driver.get(url)

# Get the page source after waiting for some time (adjust the delay as needed)
time.sleep(5)  # Wait for 5 seconds (you can adjust this)

# Get the page source
page_source = driver.page_source

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Find the table element
table = soup.find('table')

# Read the table into a DataFrame using Pandas
df = pd.read_html(str(table))[0]  # Adjust if there are multiple tables on the page

# Close the WebDriver
driver.quit()


AttributeError: 'str' object has no attribute 'capabilities'

In [98]:
df_budgets

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [99]:
df_budgets['year'] = df_budgets['release_date'].str[-4:]

sorted_df_budgets = df_budgets.sort_values(by=['year'], ascending=False)

sorted_df_budgets

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year
194,95,"Dec 31, 2020",Moonfall,"$150,000,000",$0,$0,2020
535,36,"Feb 21, 2020",Call of the Wild,"$82,000,000",$0,$0,2020
1205,6,"Dec 31, 2020",Hannibal the Conqueror,"$50,000,000",$0,$0,2020
2029,30,"Sep 30, 2019",Unhinged,"$29,000,000",$0,$0,2019
670,71,"Aug 30, 2019",PLAYMOBIL,"$75,000,000",$0,$0,2019
...,...,...,...,...,...,...,...
5606,7,"Nov 19, 1925",The Big Parade,"$245,000","$11,000,000","$22,000,000",1925
5683,84,"Sep 17, 1920",Over the Hill to the Poorhouse,"$100,000","$3,000,000","$3,000,000",1920
5614,15,"Dec 24, 1916","20,000 Leagues Under the Sea","$200,000","$8,000,000","$8,000,000",1916
5523,24,"Sep 5, 1916",Intolerance,"$385,907",$0,$0,1916


In [120]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
# Define the URL of the webpage to scrape
url = "https://www.the-numbers.com/movies/report/All/All/All/All/All/All/All/All/All/None/None/None/None/None/None/None/None/None/None?view-order-by=domestic-box-office&show-release-year=On&view-order-direction=desc&show-production-budget=On&show-domestic-box-office=On&show-inflation-adjusted-domestic-box-office=On&show-genre=On&show-source=On&show-production-method=On&show-creative-type=On"

# Send an HTTP GET request to the URL
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table element that contains the data
    table = soup.find("table")

    # Use Pandas to read the HTML table into a DataFrame
    df_tn1 = pd.read_html(str(table))[0]

    # Optionally, clean and preprocess the data as needed

    # Display the DataFrame
    print(df_tn1.head())
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)


   Unnamed: 0  Released                                 Title      Genre  \
0           1      2015  Star Wars Ep. VII: The Force Awakens  Adventure   
1           2      2019                     Avengers: Endgame     Action   
2           3      2021               Spider-Man: No Way Home     Action   
3           4      2009                                Avatar     Action   
4           5      2022                     Top Gun: Maverick     Action   

                         Source       ProductionMethod          CreativeType  \
0           Original Screenplay  Animation/Live Action       Science Fiction   
1  Based on Comic/Graphic Novel  Animation/Live Action            Super Hero   
2  Based on Comic/Graphic Novel            Live Action            Super Hero   
3           Original Screenplay  Animation/Live Action       Science Fiction   
4           Original Screenplay            Live Action  Contemporary Fiction   

  ProductionBudget DomesticBox Office Infl. Adj. Dom.Box Offic

In [122]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# Define a function to scrape data from a single URL
def scrape_data(url):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        df = pd.read_html(str(table))[0]
        return df
    except Exception as e:
        print(f"Error scraping data from {url}: {e}")
        return None

# List of URLs to scrape
urls = [
    "https://www.the-numbers.com/movie/budgets/all/6401",
    "https://www.the-numbers.com/movie/budgets/all/6301",
    'https://www.the-numbers.com/movie/budgets/all/6201',
    'https://www.the-numbers.com/movie/budgets/all/6101',
    'https://www.the-numbers.com/movie/budgets/all/6001',
    'https://www.the-numbers.com/movie/budgets/all/5901',
    'https://www.the-numbers.com/movie/budgets/all/5801',
    'https://www.the-numbers.com/movie/budgets/all/5701',
    'https://www.the-numbers.com/movie/budgets/all/5601',
    'https://www.the-numbers.com/movie/budgets/all/5501',
    'https://www.the-numbers.com/movie/budgets/all/5401',
    'https://www.the-numbers.com/movie/budgets/all/5301',
    'https://www.the-numbers.com/movie/budgets/all/5201',
    'https://www.the-numbers.com/movie/budgets/all/5101',
    'https://www.the-numbers.com/movie/budgets/all/5001',
    'https://www.the-numbers.com/movie/budgets/all/4901',
    'https://www.the-numbers.com/movie/budgets/all/4801',
    'https://www.the-numbers.com/movie/budgets/all/4701',
    'https://www.the-numbers.com/movie/budgets/all/4601',
    'https://www.the-numbers.com/movie/budgets/all/4501',
    'https://www.the-numbers.com/movie/budgets/all/4401',
    'https://www.the-numbers.com/movie/budgets/all/4301',
    'https://www.the-numbers.com/movie/budgets/all/4201',
    'https://www.the-numbers.com/movie/budgets/all/4101',
    'https://www.the-numbers.com/movie/budgets/all/4001',
    'https://www.the-numbers.com/movie/budgets/all/3901',
    'https://www.the-numbers.com/movie/budgets/all/3801',
    'https://www.the-numbers.com/movie/budgets/all/3701',
    'https://www.the-numbers.com/movie/budgets/all/3601',
    'https://www.the-numbers.com/movie/budgets/all/3501',
    'https://www.the-numbers.com/movie/budgets/all/3401',
    'https://www.the-numbers.com/movie/budgets/all/3301',
    'https://www.the-numbers.com/movie/budgets/all/3201',
    'https://www.the-numbers.com/movie/budgets/all/3101',
    'https://www.the-numbers.com/movie/budgets/all/3001',
    'https://www.the-numbers.com/movie/budgets/all/2901',
    'https://www.the-numbers.com/movie/budgets/all/2801',
    'https://www.the-numbers.com/movie/budgets/all/2701',
    'https://www.the-numbers.com/movie/budgets/all/2601',
    'https://www.the-numbers.com/movie/budgets/all/2501',
    'https://www.the-numbers.com/movie/budgets/all/2401',
    'https://www.the-numbers.com/movie/budgets/all/2301',
    'https://www.the-numbers.com/movie/budgets/all/2201',
    'https://www.the-numbers.com/movie/budgets/all/2101',
    'https://www.the-numbers.com/movie/budgets/all/2001',
    'https://www.the-numbers.com/movie/budgets/all/1901',
    'https://www.the-numbers.com/movie/budgets/all/1801',
    'https://www.the-numbers.com/movie/budgets/all/1701',
    'https://www.the-numbers.com/movie/budgets/all/1601',
    'https://www.the-numbers.com/movie/budgets/all/1501',
    'https://www.the-numbers.com/movie/budgets/all/1401',
    'https://www.the-numbers.com/movie/budgets/all/1301',
    'https://www.the-numbers.com/movie/budgets/all/1201',
    'https://www.the-numbers.com/movie/budgets/all/1101',
    'https://www.the-numbers.com/movie/budgets/all/1001',
    'https://www.the-numbers.com/movie/budgets/all/901',
    'https://www.the-numbers.com/movie/budgets/all/801',
    'https://www.the-numbers.com/movie/budgets/all/701',
    'https://www.the-numbers.com/movie/budgets/all/601',
    'https://www.the-numbers.com/movie/budgets/all/501',
    'https://www.the-numbers.com/movie/budgets/all/401',
    'https://www.the-numbers.com/movie/budgets/all/301',
    'https://www.the-numbers.com/movie/budgets/all/201',
    'https://www.the-numbers.com/movie/budgets/all/101',
    'https://www.the-numbers.com/movie/budgets/all'
    
]

# Create a ThreadPoolExecutor to execute scraping tasks concurrently
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit scraping tasks for each URL
    results = [executor.submit(scrape_data, url) for url in urls]

# Collect the results (DataFrames) from the completed tasks
dataframes = [result.result() for result in results if result.result() is not None]

# Concatenate the DataFrames into a single DataFrame
df_tn = pd.concat(dataframes, ignore_index=True)

# Display or further process the combined DataFrame
print(df_tn)

      Unnamed: 0   ReleaseDate                             Movie  \
0           6401   Dec 1, 2015                       Dutch Kills   
1           6402   Aug 1, 1991                           Slacker   
2           6403       Unknown                         Dry Spell   
3           6404  Jan 11, 2002                      Steel Spirit   
4           6405   Aug 9, 2019                          Socrates   
...          ...           ...                               ...   
6429          96   Sep 4, 2020                             Mulan   
6430          97   Jul 2, 2021                  The Tomorrow War   
6431          98  Jul 13, 2022                      The Gray Man   
6432          99  Jun 29, 2011    Transformers: Dark of the Moon   
6433         100   Jun 6, 2023  Transformers: Rise of the Beasts   

     ProductionBudget DomesticGross  WorldwideGross  
0             $25,000            $0              $0  
1             $23,000    $1,227,508      $1,227,508  
2             $22,000

In [127]:
df_tn['ReleaseDate'].value_counts

<bound method IndexOpsMixin.value_counts of 0        Dec 1, 2015
1        Aug 1, 1991
2            Unknown
3       Jan 11, 2002
4        Aug 9, 2019
            ...     
6429     Sep 4, 2020
6430     Jul 2, 2021
6431    Jul 13, 2022
6432    Jun 29, 2011
6433     Jun 6, 2023
Name: ReleaseDate, Length: 6434, dtype: object>

In [131]:
df_tn['Year'] = df_tn['ReleaseDate'].str[-4:]

In [137]:
df_tn = df_tn.loc[(df_tn['Year'] >= '2010') & (df_tn['Year'] <= '2023')]
df_tn['Year'].value_counts()

2015    345
2010    261
2016    261
2012    257
2014    255
2011    248
2013    232
2018    200
2017    189
2019    175
2021     93
2020     78
2022     69
2023     55
Name: Year, dtype: int64

In [138]:
df_tn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2718 entries, 0 to 6433
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        2718 non-null   int64 
 1   ReleaseDate       2718 non-null   object
 2   Movie             2718 non-null   object
 3   ProductionBudget  2718 non-null   object
 4   DomesticGross     2718 non-null   object
 5   WorldwideGross    2718 non-null   object
 6   Year              2718 non-null   object
dtypes: int64(1), object(6)
memory usage: 169.9+ KB


In [139]:
csv_file_path = 'tn_budgets.csv'

# Save the DataFrame as a CSV file
df_tn.to_csv(csv_file_path, index=False)