# Web Scraping Wikipedia

In [1]:
import bs4
import requests
import pandas as pd
import numpy as np
import time
import random
import regex as re

In [2]:
def wikipedia_scrape(url_lookup, col_lst, n = 1):
    """
    Scrape one HTML table of a Wikipedia page into a film DataFrame.

    Every text node found inside a ``<td>`` cell becomes its own value, so a
    single ``<tr>`` can expand into several DataFrame rows (one per text-node
    position within the cells). The clean-up steps below discard the surplus
    rows that this expansion produces.

    :param url_lookup: url string to web scrape
    :param col_lst: list of column names to assign to the scraped table, in
        the order of its ``<td>`` cells; must contain 'Film' and 'Year'
    :param n: zero-based index of the ``<table>`` element on the page to scrape
    :return: Pandas DataFrame with the columns 'Year', 'Film' and
        'Worldwide Gross' (the latter is all null when col_lst has no
        'Worldwide Gross' entry)
    :raises requests.exceptions.RequestException: if the request times out or
        the server answers with an HTTP error status
    :raises IndexError: if the page has no table at index n
    :raises ValueError: if the table has no ``<td>`` text, or col_lst does not
        match the number of scraped columns
    :raises KeyError: if col_lst lacks 'Film' or 'Year'
    """

    # Fetch the page; fail fast on HTTP errors instead of parsing an error page
    response = requests.get(url_lookup, timeout=30)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "lxml")

    # Only the requested table is parsed
    tables = soup.find_all('table')
    try:
        target_table = tables[n]
    except IndexError:
        raise IndexError(
            f'No table at index {n}: the page contains {len(tables)} table(s)'
        ) from None

    # One small frame per <tr>: columns are the <td> cells, rows are the
    # text nodes of each cell (shorter cells are padded with nulls)
    row_frames = []
    for tr in target_table.find_all('tr'):
        cells = [td.find_all(string=True) for td in tr.find_all('td')]
        if any(cells):  # skip rows without <td> text, e.g. header rows made of <th>
            row_frames.append(pd.DataFrame(cells).transpose())
    if not row_frames:
        raise ValueError(f'Table {n} contains no <td> text to scrape')

    # Build the frame once instead of appending row by row
    # (DataFrame.append is quadratic and no longer exists in pandas >= 2.0)
    df = pd.concat(row_frames, ignore_index=True)
    df.columns = col_lst

    # Remove rows if film name is null or a bare new line, or if year is null.
    # These are the surplus rows created by cells with several text nodes.
    film_missing = df['Film'].isna() | (df['Film'] == '\n')
    df = df.loc[~film_missing & df['Year'].notna()].reset_index(drop=True)

    # Keep only the digits of Year; values left empty (e.g. the cell held text
    # rather than a year) are replaced by the last valid year above them
    year_digits = df['Year'].str.replace('[^0-9]', '', regex=True)
    df['Year'] = year_digits.mask(year_digits == '', np.nan).ffill()

    # Return only columns of interest
    if 'Worldwide Gross' not in df.columns:  # e.g. tables without box office data
        df['Worldwide Gross'] = None
    df = df.loc[:, ['Year', 'Film', 'Worldwide Gross']].copy()

    # Remove new lines and surrounding whitespace
    for column in ('Year', 'Film', 'Worldwide Gross'):
        df[column] = df[column].str.strip()

    # Return dataframe
    return df

Dictionary of the Wikipedia URLs to scrape, together with the column names to apply to each scraped table and the index of the table to read on each page

In [3]:
# Wikipedia pages to scrape, keyed by list name.
# Each value is [page URL, column names for the scraped table, table index on the page].
wiki_dict = {
    'Sport': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_sports_films',
        ['Film', 'Year', 'Worldwide Gross', 'Ref', 'Sport'],
        1,
    ],
    'Superhero': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_superhero_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Superheroes', 'Source', 'Ref'],
        0,
    ],
    'Sci-fi': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_science_fiction_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref'],
        1,
    ],
    'Musical': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_musicals#Highest-grossing_musical_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref'],
        1,
    ],
    'Horror': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_horror_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Franchise', 'Ref'],
        0,
    ],
    'Fantasy': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_fantasy_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref'],
        0,
    ],
    'Comedy': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_comedy_films',
        ['Rank', 'Film', 'Type', 'Worldwide Gross', 'Year', 'Ref'],
        0,
    ],
    'Christmas': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_Christmas_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref'],
        1,
    ],
    'Openings': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_openings_for_films',
        ['Rank', 'Film', 'Year', 'Worldwide Gross'],
        0,
    ],
    'Puppet': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_puppet_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref'],
        0,
    ],
    'Anime': [
        'https://en.wikipedia.org/wiki/List_of_highest-grossing_anime_films',
        ['Rank', 'Film', 'Worldwide Gross', 'Year', 'Ref'],
        0,
    ],
    'Box Office': [
        'https://en.wikipedia.org/wiki/List_of_films_by_box_office_admissions',
        ['Film', 'Year', 'Worldwide Gross', 'Territories', 'Notes'],
        0,
    ],
    'Worst': [
        'https://en.wikipedia.org/wiki/List_of_films_with_a_0%25_rating_on_Rotten_Tomatoes',
        ['Film', 'Year', '# reviews', 'Reference'],
        0,
    ],
}

Run the WebCrawler

In [4]:
# Collect one scraped DataFrame per entry of the URL dictionary
lst_wiki = []

for key, value in wiki_dict.items():

    # Each entry holds: page URL, column names, index of the table to scrape
    url, cols, n = value[0], value[1], value[2]

    # Scrape the page and keep the result
    df = wikipedia_scrape(url, cols, n)
    lst_wiki.append(df)

    print(f'Lines fetched for {key}: {len(df)}')

Lines fetched for Sport: 50
Lines fetched for Superhero: 50
Lines fetched for Sci-fi: 49
Lines fetched for Musical: 28
Lines fetched for Horror: 50
Lines fetched for Fantasy: 50
Lines fetched for Comedy: 50
Lines fetched for Christmas: 32
Lines fetched for Openings: 50
Lines fetched for Puppet: 22
Lines fetched for Anime: 50
Lines fetched for Box Office: 110
Lines fetched for Worst: 41


In [5]:
# Stack the scraped tables into a single de-duplicated dataframe for future use
df_concat = (
    pd.concat(lst_wiki)
    .rename(columns={"Film": "Title"})
    .drop_duplicates()
    .reset_index(drop=True)
)

# Keep a backup copy of the list on disk
df_concat.to_csv('bk_wiki.csv', index=False)

# Report the number of films left once duplicates are removed
print(f'{len(df_concat)} films extracted from Wikipedia excluding duplicates')

df_concat.head()

617 films extracted from Wikipedia excluding duplicates


Unnamed: 0,Year,Title,Worldwide Gross
0,2013,The Hunger Games: Catching Fire,"$865,011,746"
1,2014,The Hunger Games: Mockingjay – Part 1,"$755,356,711"
2,2012,The Hunger Games,"$694,394,724"
3,1994,Forrest Gump,"$678,222,284"
4,2015,The Hunger Games: Mockingjay – Part 2,"$653,428,261"
