# Web Scraping Wikipedia

In [1]:
import bs4
import requests
import pandas as pd
import numpy as np
import time
import random
import regex as re

#### Form initial list of movies based on Wikipedia

In [22]:
def wikipedia_scrape(url_lookup, col_lst, n = 1):
    """Scrape one HTML table from a web page into a DataFrame.

    Each ``<tr>`` of the selected table is turned into a small frame in which
    every ``<td>`` is a column and every text node inside a cell is a row, so
    a cell holding several text nodes yields several rows. Rows without a
    usable ``Film`` or ``Year`` value are then discarded, which removes most
    of those extra rows.

    Args:
        url_lookup: URL of the page to download.
        col_lst: Column names assigned, in order, to the scraped table. Its
            length must match the number of columns found.
        n: Zero-based index of the ``<table>`` element on the page to use.

    Returns:
        A DataFrame with columns ``col_lst`` and a fresh 0..N-1 index.

    Raises:
        requests.RequestException: The download failed, timed out or
            returned an HTTP error status.
        IndexError: The page has no table at index ``n``.
        ValueError: ``col_lst`` does not match the scraped column count.
    """
    # Fail fast on network problems instead of hanging or parsing an error page
    response = requests.get(url_lookup, timeout=30)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "lxml")

    tables = soup.find_all('table')
    if not -len(tables) <= n < len(tables):
        raise IndexError(
            f'table index {n} out of range: {len(tables)} table(s) found at {url_lookup}'
        )

    # Only the requested table is parsed; header rows (th only) have no td
    # cells and are skipped.
    row_frames = []
    for tr in tables[n].find_all('tr'):
        cells = [td.find_all(string=True) for td in tr.find_all('td')]
        if cells:
            row_frames.append(pd.DataFrame(cells).transpose())

    # A single concat replaces the removed, quadratic DataFrame.append loop
    df = pd.concat(row_frames) if row_frames else pd.DataFrame()

    df.columns = col_lst
    df = df.reset_index(drop=True)

    # Remove null rows: a Film that is missing OR just a newline, and any
    # row without a Year. Columns that were not requested are left alone.
    if 'Film' in df.columns:
        film = df['Film']
        df = df.loc[~(film.isna() | (film == '\n')), :]
    if 'Year' in df.columns:
        df = df.loc[df['Year'].notna(), :]

    return df.reset_index(drop=True)

**Highest grossing sports films**

In [23]:
# Page listing the highest-grossing sports films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_sports_films'

# Names given, left to right, to the scraped table's columns
cols = ['Film', 'Year', 'Worldwide Gross', 'Ref', 'Sport']

# Table index 1 on this page holds the ranking
df_sport = wikipedia_scrape(url_lookup=url, col_lst=cols, n=1)
df_sport.head()

Unnamed: 0,Film,Year,Worldwide Gross,Ref,Sport
0,The Hunger Games: Catching Fire,2013\n,"$865,011,746\n",[1],Battle royale
1,The Hunger Games: Mockingjay – Part 1,2014\n,"$755,356,711\n",[2],
2,The Hunger Games,2012\n,"$694,394,724\n",[3],
3,Forrest Gump,1994\n,"$678,222,284\n",[4],American football
4,The Hunger Games: Mockingjay – Part 2,2015\n,"$653,428,261\n",[5],Battle royale\n


**Superhero films**

In [24]:
%%time
# Page listing the highest-grossing superhero films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_superhero_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Superheroes', 'Source', 'Ref']

# Table index 0 on this page holds the ranking
df_superhero = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_superhero.head()

Wall time: 950 ms


Unnamed: 0,Rank,Film,Worldwide Gross,Year,Superheroes,Source,Ref
0,1\n,Avengers: Endgame,"$2,797,800,564",2019,Avengers,Marvel,[1]
1,2\n,Avengers: Infinity War,"$2,048,359,754",2018,[2],,
2,3\n,The Avengers,"$1,518,812,988",2012,[3],,
3,4\n,Avengers: Age of Ultron,"$1,405,403,694",2015,[4],,
4,5\n,Black Panther,"$1,346,913,171",2018,Black Panther,[5],


In [25]:
# Correct Year using forward fill
# Keep only the digits of each Year cell; missing values stay missing.
year_digits = df_superhero['Year'].str.replace(r'[^0-9]', '', regex=True)
# Cells left with no digits become NaN and inherit the previous row's year.
df_superhero['Year'] = year_digits.mask(year_digits == '').ffill()

In [26]:
# Preview the first rows of df_superhero after the Year clean-up
df_superhero.head()

Unnamed: 0,Rank,Film,Worldwide Gross,Year,Superheroes,Source,Ref
0,1\n,Avengers: Endgame,"$2,797,800,564",2019,Avengers,Marvel,[1]
1,2\n,Avengers: Infinity War,"$2,048,359,754",2018,[2],,
2,3\n,The Avengers,"$1,518,812,988",2012,[3],,
3,4\n,Avengers: Age of Ultron,"$1,405,403,694",2015,[4],,
4,5\n,Black Panther,"$1,346,913,171",2018,Black Panther,[5],


**Science Fiction films**

In [27]:
%%time
# Page listing the highest-grossing science fiction films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_science_fiction_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref']

# Table index 1 on this page holds the ranking
df_scifi = wikipedia_scrape(url_lookup=url, col_lst=cols, n=1)

df_scifi.head()

Wall time: 886 ms


Unnamed: 0,Rank,Film,Worldwide Gross,Year,Ref
0,1,Avatar,"$2,847,246,203",2009,[1]
1,2,Star Wars: The Force Awakens,"$2,068,223,624",2015,[2]
2,3,Jurassic World,"$1,670,516,444",2015,[3]
3,4,Star Wars: The Last Jedi,"$1,332,539,889",2017,[4]
4,5,Jurassic World: Fallen Kingdom,"$1,308,467,944",2018,[5]


**Musical films**

In [28]:
# Page (and section anchor) listing the highest-grossing musicals
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_musicals#Highest-grossing_musical_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Year', 'Worldwide Gross', 'Ref']

# Table index 1 on this page is used
df_musc = wikipedia_scrape(url_lookup=url, col_lst=cols, n=1)

df_musc.head()

Unnamed: 0,Rank,Film,Year,Worldwide Gross,Ref
0,1\n,The Lion King,1997\n,"$8,251,556,700",[b]
1,2\n,The Phantom of the Opera,1986\n,"$6,060,000,000",[c]
2,3\n,Mamma Mia!,1999\n,"$4,000,000,000\n",[5]
3,4\n,Cats,1981\n,"$3,565,624,091",[e]
4,5\n,Wicked,2003\n,"$3,530,000,000",[g]


**Horror Films**

In [29]:
# Page listing the highest-grossing horror films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_horror_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Franchise', 'Ref']

# Table index 0 on this page holds the ranking
df_horror = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_horror.head()

Unnamed: 0,Rank,Film,Worldwide Gross,Year,Franchise,Ref
0,1,It,"$700,381,759",2017,It,[1]
1,2,The Sixth Sense,"$672,806,292",1999,,[2]
2,3,War of the Worlds,"$603,873,119",2005,The War of the Worlds,[3]
3,4,I Am Legend,"$585,349,010",2007,I Am Legend,[4]
4,5,Kong: Skull Island,"$566,652,812",2017,MonsterVerse,[5]


**Fantasy Films**

In [30]:
# Page listing the highest-grossing fantasy films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_fantasy_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref']

# Table index 0 on this page holds the ranking
df_fantasy = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_fantasy.head()

Unnamed: 0,Rank,Film,Worldwide Gross,Year,Ref
0,1\n,Frozen II,"$1,450,026,933",2019,[2]
1,2\n,Harry Potter and the Deathly Hallows – Part 2,"$1,342,321,665",2011,[2]
2,3\n,Frozen,"$1,290,000,000",2013,F
3,4\n,Beauty and the Beast,"$1,263,521,126",2017,[2]
4,5\n,The Lord of the Rings: The Return of the King,"$1,146,030,912",2003,[2]


**Comedy Films**

In [31]:
# Page listing the highest-grossing comedy films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_comedy_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Type', 'Worldwide Gross', 'Year', 'Ref']

# Table index 0 on this page holds the ranking
df_comd = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_comd.head()

Unnamed: 0,Rank,Film,Type,Worldwide Gross,Year,Ref
0,1,Incredibles 2,A,"$1,242,805,359",2018,[3]
1,2,Minions,A,"$1,159,398,397",2015,[4]
2,3,Toy Story 4,A,"$1,073,394,593",2019,[5]
3,4,Toy Story 3,A,"$1,066,970,811",2010,[6]
4,5,Despicable Me 3,A,"$1,035,799,409",2017,[7]


**Christmas Films**

In [32]:
# Page listing the highest-grossing Christmas films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_Christmas_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref']

# Table index 1 on this page holds the ranking
df_xmas = wikipedia_scrape(url_lookup=url, col_lst=cols, n=1)

df_xmas.head()

Unnamed: 0,Rank,Film,Worldwide Gross,Year,Ref
0,1,The Grinch,"$511,595,957",2018,[1]
1,2,Home Alone,"$476,700,000",1990,[2]
2,3,Dr. Seuss' How the Grinch Stole Christmas!,"$345,141,403",2000,[3]
3,4,A Christmas Carol,"$325,286,646",2009,[4]
4,5,The Polar Express,"$314,215,454",2004,[5]


**highest-grossing openings for films**

In [33]:
# Page listing the highest-grossing film openings
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_openings_for_films'

# Names given, left to right, to the scraped table's columns
# NOTE(review): the page is about opening grosses, yet the value column is
# labelled 'Worldwide Gross' like the full-run lists — confirm this is intended.
cols = ['Rank', 'Film', 'Year', 'Worldwide Gross']

# Table index 0 on this page holds the ranking
df_high = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_high.head()

Unnamed: 0,Rank,Film,Year,Worldwide Gross
0,1,Avengers: Endgame,2019,"$1,223,641,414\n"
1,2,Avengers: Infinity War,2018,"$640,521,291\n"
2,3,The Fate of the Furious,2017,"$541,937,239\n"
3,4,Star Wars: The Force Awakens,2015,"$528,966,675\n"
4,5,Jurassic World,2015,"$525,504,128\n"


**highest-grossing puppet films**

In [34]:
# Page listing the highest-grossing puppet films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_puppet_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref']

# Table index 0 on this page holds the ranking
df_pup = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_pup.head()

Unnamed: 0,Rank,Film,Worldwide Gross,Year,Ref
0,1,The Muppets,"$165,184,237",2011,[2]
1,2,Muppets Most Wanted,"$80,383,290",2014,[3]
2,3,The Muppet Movie,"$65,810,475",1979,[4]
3,4,Team America: World Police,"$50,907,422",2004,[5]
4,5,The Dark Crystal,"$41,613,957",1982,[6]


**highest-grossing anime films**

In [35]:
# Page listing the highest-grossing anime films
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_anime_films'

# Names given, left to right, to the scraped table's columns
cols = ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref']

# Table index 0 on this page holds the ranking
df_ani = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_ani.head()

Unnamed: 0,Rank,Film,Worldwide Gross,Year,Ref
0,1\n,Demon Slayer: Mugen Train,"$503,063,688\n",2020\n,[3]
1,2\n,Spirited Away,"$395,580,000\n",2001\n,[4]
2,3\n,Your Name,"$380,140,500\n",2016\n,[4]
3,4\n,Howl's Moving Castle,"$236,214,446\n",2004\n,[5]
4,5\n,Ponyo,"$204,826,668\n",2008\n,[6]


**Box Office**

In [36]:
# Page listing films by box office admissions
url = 'https://en.wikipedia.org/wiki/List_of_films_by_box_office_admissions'

# Names given, left to right, to the scraped table's columns
# NOTE(review): the third column looks like ticket admissions (the previewed
# values carry no '$'), not revenue, although it is labelled
# 'Worldwide Gross' — confirm before mixing it with the gross figures.
cols = ['Film', 'Year', 'Worldwide Gross', 'Territories', 'Notes']

# Table index 0 on this page is used
df_box = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

df_box.head()

Unnamed: 0,Film,Year,Worldwide Gross,Territories,Notes
0,In-Laws,1981\n,"469,290,000\n",China,[2]
1,Mysterious Buddha,1980\n,"403,210,000\n",China\n,[2]
2,Titanic,1997\n,"362,433,457\n",Worldwide,[b]
3,Avengers: Endgame,2019\n,"349,236,385\n",Worldwide\n,[c]
4,Star Wars,1977\n,"338,400,000\n",Worldwide\n,[19]


**Worst films ever**

In [37]:
# Page listing films with a 0% rating on Rotten Tomatoes
url = 'https://en.wikipedia.org/wiki/List_of_films_with_a_0%25_rating_on_Rotten_Tomatoes'

# Names given, left to right, to the scraped table's columns
cols = ['Film', 'Year', '# reviews', 'Reference']

# Table index 0 on this page is used
df_worst = wikipedia_scrape(url_lookup=url, col_lst=cols, n=0)

# This table has no gross figure; add an empty column so the frame offers
# the same Film / Year / Worldwide Gross columns as the other lists.
df_worst['Worldwide Gross'] = None

df_worst.head()

Unnamed: 0,Film,Year,# reviews,Reference,Worldwide Gross
0,Staying Alive,1983\n,27\n,[5],
1,Bolero,1984\n,23\n,[6],
2,Jaws: The Revenge,1987\n,39\n,[7],
3,Police Academy 4: Citizens on Patrol,1987\n,20\n,[8],
4,Return of the Living Dead Part II,1988\n,20\n,[9],


In [38]:
# Columns every scraped frame has in common
shared_cols = ['Film', 'Year', 'Worldwide Gross']

# Frames stacked into the combined list, in this order.
# NOTE(review): df_xmas is scraped earlier in this file but is not part of
# this list — confirm whether that omission is intended.
genre_frames = [
    df_sport,
    df_horror,
    df_musc,
    df_scifi,
    df_superhero,
    df_fantasy,
    df_comd,
    df_high,
    df_pup,
    df_ani,
    df_box,
    df_worst,
]
df_concat = pd.concat([frame.loc[:, shared_cols] for frame in genre_frames])

# Remove new line
for text_col in ('Year', 'Worldwide Gross', 'Film'):
    df_concat[text_col] = df_concat[text_col].str.strip()

# Rename Film to Title, drop exact duplicate rows and renumber the index
df_concat = (
    df_concat
    .rename(columns={"Film": "Title"})
    .drop_duplicates()
    .reset_index(drop=True)
)
df_concat.head()


Unnamed: 0,Title,Year,Worldwide Gross
0,The Hunger Games: Catching Fire,2013,"$865,011,746"
1,The Hunger Games: Mockingjay – Part 1,2014,"$755,356,711"
2,The Hunger Games,2012,"$694,394,724"
3,Forrest Gump,1994,"$678,222,284"
4,The Hunger Games: Mockingjay – Part 2,2015,"$653,428,261"


In [39]:
# Number of Films
# Counts rows of the combined frame. Duplicates were dropped on Title, Year
# and Worldwide Gross together, so a title can still appear more than once
# when its year or gross differs between lists.
print(f'{len(df_concat)} films extracted from Wikipedia')

566 films extracted from Wikipedia


In [40]:
# Backup list
# Writes the combined frame to bk_wiki.csv (relative path) without the
# index column.
df_concat.to_csv('bk_wiki.csv', index = False)