In [11]:
!pip3 install beautifulsoup4
!pip3 install requests

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [12]:
import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd

In [13]:
# Helper functions

def date_time(table_cells):
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    out = ''.join([booster_version for i, booster_version 
                   in enumerate(table_cells.strings) if i % 2 == 0][0:-1])
    return out

def landing_status(table_cells):
    return [i for i in table_cells.strings][0]

def get_mass(table_cells):
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        new_mass = mass[0:mass.find("kg") + 2]
    else:
        new_mass = 0
    return new_mass

def extract_column_from_header(row):
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    column_name = ' '.join(row.contents)
    
    if not column_name.strip().isdigit():
        return column_name.strip()

In [33]:
# URL snapshot
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36"
}


In [34]:
# Request page
response = requests.get(static_url, headers=headers, timeout=30)
soup = BeautifulSoup(response.text)

In [38]:
# Extract target table
html_tables = soup.find_all("table")
first_launch_table = html_tables[2]

In [39]:
# Extract column names
column_names = []
for th in first_launch_table.find_all("th"):
    name = extract_column_from_header(th)
    if name is not None and len(name) > 0:
        column_names.append(name)

In [40]:
# Initialize dictionary
launch_dict = dict.fromkeys(column_names)
del launch_dict['Date and time ( )']

launch_dict.update({
    'Flight No.': [],
    'Launch site': [],
    'Payload': [],
    'Payload mass': [],
    'Orbit': [],
    'Customer': [],
    'Launch outcome': [],
    'Version Booster': [],
    'Booster landing': [],
    'Date': [],
    'Time': []
})

In [41]:
# Extract table rows
for table in soup.find_all('table', "wikitable plainrowheaders collapsible"):
    for rows in table.find_all("tr"):
        if rows.th and rows.th.string:
            flight_number = rows.th.string.strip()
            flag = flight_number.isdigit()
        else:
            flag = False

        row = rows.find_all('td')

        if flag:
            launch_dict['Flight No.'].append(flight_number)

            datatimelist = date_time(row[0])
            launch_dict['Date'].append(datatimelist[0].strip(','))
            launch_dict['Time'].append(datatimelist[1])

            bv = booster_version(row[1])
            if not bv:
                bv = row[1].a.string if row[1].a else None
            launch_dict['Version Booster'].append(bv)

            launch_dict["Launch site"].append(
                row[2].a.string if row[2].a else None
            )

            launch_dict["Payload"].append(
                row[3].a.string if row[3].a else row[3].get_text(strip=True)
            )

            launch_dict["Payload mass"].append(get_mass(row[4]))

            launch_dict["Orbit"].append(
                row[5].a.string if row[5].a else row[5].get_text(strip=True)
            )

            launch_dict["Customer"].append(
                row[6].a.string if row[6].a else row[6].get_text(strip=True)
            )

            launch_dict['Launch outcome'].append(
                list(row[7].strings)[0].strip()
            )

            launch_dict['Booster landing'].append(
                landing_status(row[8])
            )

In [45]:
# Create DataFrame
df = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})

In [49]:
# See the data frame

df.head()

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success,F9 v1.07B0003.18,Failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,Success,F9 v1.07B0004.18,Failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.07B0005.18,No attempt\n,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success,F9 v1.07B0006.18,No attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success,F9 v1.07B0007.18,No attempt\n,1 March 2013,15:10


In [48]:
# Save dataset
df.to_csv('spacex_web_scraped.csv', index=False)