In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_Super_Bowl_halftime_shows"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.find_all("table", {"class": "wikitable"})
#Here, the code is searching through the parsed HTML to find all <table> elements 
# that have a class attribute of "wikitable". 
# Wikipedia commonly uses this class for its data tables. 
# The find_all() method returns a list of all matching elements, 
# which is stored in the tables variable. Each element in this list 
# represents one HTML table from the page.

all_dfs = [pd.read_html(str(table))[0] for table in tables]
#This line uses a list comprehension to process each table found 
#in the previous step. For each table:

#str(table) converts the BeautifulSoup table object back to an HTML string
#pd.read_html() is a pandas function that converts HTML tables into pandas DataFrames
#[0] extracts the first (and presumably only) DataFrame from the list returned by pd.read_html()

#The result is all_dfs, a list of pandas DataFrames, 
# with each DataFrame containing the data from one of the tables found on the webpage.


df = pd.concat(all_dfs, ignore_index=True)
#The pd.concat() function combines all the separate DataFrames 
# in all_dfs into a single DataFrame, which is stored in df. 
# The ignore_index=True parameter tells pandas to create a new sequential index 
# for the combined DataFrame rather than keeping the original indices 
# from the separate DataFrames.

print(df.columns)

if "Super Bowl" in df.columns:
    df.rename(columns={"Super Bowl": "Super_Bowl"}, inplace=True)

df["Super_Bowl"] = df["Super_Bowl"].str.replace("(show)", "", regex=False).str.strip()

print(df.head())


Index(['Super Bowl', 'Date', 'Location', 'Theme', 'Performer(s)', 'Producer',
       'Setlist', 'Ref.', 'Sponsor', 'Special guest(s)', 'Headliner(s)',
       'Director'],
      dtype='object')
  Super_Bowl              Date  \
0          I  January 15, 1967   
1         II  January 14, 1968   
2        III  January 12, 1969   
3         IV  January 11, 1970   
4          V  January 17, 1971   

                                            Location                  Theme  \
0  Los Angeles Memorial Coliseum (Los Angeles, Ca...                      —   
1                 Miami Orange Bowl (Miami, Florida)                      —   
2                 Miami Orange Bowl (Miami, Florida)         America Thanks   
3            Tulane Stadium (New Orleans, Louisiana)  Tribute to Mardi Gras   
4                 Miami Orange Bowl (Miami, Florida)                      —   

                                        Performer(s)      Producer  \
0  University of Arizona Symphonic Marching Band ...  Tom

  all_dfs = [pd.read_html(str(table))[0] for table in tables]
  all_dfs = [pd.read_html(str(table))[0] for table in tables]
  all_dfs = [pd.read_html(str(table))[0] for table in tables]
  all_dfs = [pd.read_html(str(table))[0] for table in tables]
  all_dfs = [pd.read_html(str(table))[0] for table in tables]
  all_dfs = [pd.read_html(str(table))[0] for table in tables]
  all_dfs = [pd.read_html(str(table))[0] for table in tables]


In [10]:
df = df.drop(index=0).reset_index(drop=True)

In [11]:
df.to_csv("super_bowl_halftime_shows.csv", index=False)
print("Data saved to 'super_bowl_halftime_shows.csv'")

Data saved to 'super_bowl_halftime_shows.csv'
