
# Web Scraping — Wikipedia Falcon 9 / Falcon Heavy Launches

**Author:** _Your Name_  
**Course:** IBM Data Science Professional Certificate — Capstone  
**Objective:** Scrape launch tables from Wikipedia (Falcon 9/Falcon Heavy), normalize to a tidy dataset, perform light cleaning, and persist for EDA/ML.

> Run all cells so that **outputs render on GitHub** for peer review (tables, counts, and saved-file messages).


In [2]:

import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pathlib import Path
import sqlite3

pd.set_option('display.max_columns', 100)
print("Libraries imported.")

Libraries imported.


## 1) Target URL

In [3]:

URL = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"
URL

'https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches'

## 2) Fetch HTML (requests) + basic checks

In [4]:

headers = {'User-Agent': 'Mozilla/5.0 (compatible; CapstoneBot/1.0; +https://example.com/bot)'}
resp = requests.get(URL, headers=headers, timeout=60)
print("Status:", resp.status_code)
resp.raise_for_status()

html = resp.text
len(html)

Status: 200


1525300

## 3) Parse with BeautifulSoup; locate tables

In [5]:

soup = BeautifulSoup(html, "html.parser")
tables = soup.select("table.wikitable")
print("Found wikitable count:", len(tables))
# Keep an HTML snapshot for reproducibility if needed
Path("data").mkdir(exist_ok=True, parents=True)
with open("data/falcon9_wikipedia_snapshot.html", "w", encoding="utf-8") as f:
    f.write(html)
"Saved HTML snapshot to data/falcon9_wikipedia_snapshot.html" 

Found wikitable count: 5


'Saved HTML snapshot to data/falcon9_wikipedia_snapshot.html'

## 4) Parse tables with pandas.read_html

In [6]:

dfs = pd.read_html(html)  # returns a list of DataFrames
len(dfs)

  dfs = pd.read_html(html)  # returns a list of DataFrames


16

## 5) Standardize/clean tables into a unified schema

In [7]:

def norm_col(c):
    c = re.sub(r"\[.*?\]", "", str(c))  # remove footnote markers like [a]
    c = c.lower().strip()
    c = c.replace("\n", " ").replace("\xa0", " ")
    c = re.sub(r"\s+", " ", c)
    c = c.replace(" ", "_")
    return c

# Synonyms (lowercase, underscores) → canonical
synonyms = {
    "flightnumber": ["no.", "flight_no.", "flight", "no", "nr."],
    "date": ["date_and_time_(utc)", "date/time_(utc)", "date", "date_and_time_\(utc\)"],
    "launch_site": ["launch_site", "site", "launch_site[a]", "launch_location"],
    "payload_mass": ["payload_mass_kg", "payload_mass_(kg)", "mass_(kg)", "payload_mass", "payload_mass_(kg)a"],
    "orbit": ["orbit", "orbits"],
    "booster_version": ["booster_version", "booster", "first_stage", "core"],
    "mission_outcome": ["outcome", "mission_outcome", "result"],
}

def choose_col(cols, candidates):
    for cand in candidates:
        for col in cols:
            if col == cand:
                return col
    # allow loose contains
    for cand in candidates:
        for col in cols:
            if cand in col:
                return col
    return None

def extract_numeric_kg(val):
    if pd.isna(val):
        return np.nan
    s = str(val)
    # common formats: "5,200 kg (11,464 lb)" or "5,200" or "5200"
    m = re.search(r"([0-9][0-9,\.]*)(?=\s*kg|$)", s, flags=re.I)
    if not m:
        # last resort: first number-like
        m = re.search(r"([0-9][0-9,\.]*)", s)
    if m:
        num = m.group(1).replace(",", "")
        try:
            return float(num)
        except:
            return np.nan
    return np.nan

records = []
for i, df in enumerate(dfs):
    # Normalize columns
    df2 = df.copy()
    df2.columns = [norm_col(c) for c in df2.columns]
    cols = list(df2.columns)
    
    # Map to canonical names
    col_map = {}
    col_map["flightnumber"] = choose_col(cols, synonyms["flightnumber"])
    col_map["date"] = choose_col(cols, synonyms["date"])
    col_map["launch_site"] = choose_col(cols, synonyms["launch_site"])
    col_map["payload_mass"] = choose_col(cols, synonyms["payload_mass"])
    col_map["orbit"] = choose_col(cols, synonyms["orbit"])
    col_map["booster_version"] = choose_col(cols, synonyms["booster_version"])
    col_map["mission_outcome"] = choose_col(cols, synonyms["mission_outcome"])
    
    # If essential columns missing, skip table
    essential = ["date", "launch_site"]
    if not all(col_map[k] for k in essential):
        continue
    
    # Build partial standardized frame
    tmp = pd.DataFrame()
    for k, v in col_map.items():
        if v in df2.columns:
            tmp[k] = df2[v]
        else:
            tmp[k] = np.nan
    
    # Clean fields
    tmp["payload_mass"] = tmp["payload_mass"].apply(extract_numeric_kg)
    tmp["date"] = pd.to_datetime(tmp["date"], errors="coerce", utc=True)
    # Flight number: coerce to numeric if present
    if "flightnumber" in tmp.columns:
        tmp["flightnumber"] = pd.to_numeric(tmp["flightnumber"], errors="coerce")
    
    tmp["source_table_index"] = i
    records.append(tmp)

len(records)

  "date": ["date_and_time_(utc)", "date/time_(utc)", "date", "date_and_time_\(utc\)"],
  tmp["date"] = pd.to_datetime(tmp["date"], errors="coerce", utc=True)
  tmp["date"] = pd.to_datetime(tmp["date"], errors="coerce", utc=True)
  tmp["date"] = pd.to_datetime(tmp["date"], errors="coerce", utc=True)
  tmp["date"] = pd.to_datetime(tmp["date"], errors="coerce", utc=True)


5

## 6) Concatenate standardized tables

In [8]:

if records:
    data = pd.concat(records, ignore_index=True)
else:
    data = pd.DataFrame(columns=["flightnumber","date","launch_site","payload_mass","orbit","booster_version","mission_outcome"])

# De-duplicate and sort by date/flightnumber where available
data = data.drop_duplicates().sort_values(by=["date","flightnumber"], na_position="last").reset_index(drop=True)
print("Unified dataset shape:", data.shape)
data.head(10)

Unified dataset shape: (594, 8)


Unnamed: 0,flightnumber,date,launch_site,payload_mass,orbit,booster_version,mission_outcome,source_table_index
0,437.0,2025-02-15 06:14:00+00:00,"Cape Canaveral, SLC‑40",16500.0,LEO,F9 B5 B1067‑26,Success,2
1,439.0,2025-02-21 15:19:00+00:00,"Cape Canaveral, SLC‑40",17100.0,LEO,F9 B5 B1076‑21,Success,2
2,440.0,2025-02-23 01:38:00+00:00,"Vandenberg, SLC‑4E",16800.0,LEO,F9 B5 B1082‑11,Success,2
3,442.0,2025-02-27 03:34:00+00:00,"Cape Canaveral, SLC‑40",16500.0,LEO,F9 B5 B1092‑1[296],Success,2
4,,2025-08-17 15:44:00+00:00,"Vandenberg, SLC‑4E",,SSO,F9 B5 B1088‑9,,3
5,,2025-08-17 15:44:00+00:00,Launch of 24 Starlink v2 mini satellites to a ...,,Launch of 24 Starlink v2 mini satellites to a ...,Launch of 24 Starlink v2 mini satellites to a ...,,3
6,,2025-08-21 15:44:00+00:00,"Vandenberg, SLC‑4E",,SSO,F9 B5 B1081‑17,,3
7,,2025-08-21 15:44:00+00:00,Launch of 24 Starlink v2 mini satellites to a ...,,Launch of 24 Starlink v2 mini satellites to a ...,Launch of 24 Starlink v2 mini satellites to a ...,,3
8,,2025-08-24 06:45:00+00:00,"Cape Canaveral, SLC‑40",,LEO (ISS),F9 B5,,3
9,,2025-08-24 06:45:00+00:00,Commercial Resupply Services mission to the IS...,,Commercial Resupply Services mission to the IS...,Commercial Resupply Services mission to the IS...,,3


## 7) Light QA

In [9]:

print("Missing values per column:\n", data.isna().sum(), "\n")
print("Launch sites (top 10):\n", data['launch_site'].value_counts(dropna=False).head(10), "\n")
print("Orbits (top 10):\n", data['orbit'].value_counts(dropna=False).head(10))

Missing values per column:
 flightnumber          132
date                  549
launch_site             2
payload_mass          152
orbit                   3
booster_version         2
mission_outcome       124
source_table_index      0
dtype: int64 

Launch sites (top 10):
 launch_site
Cape Canaveral, SLC‑40                                                                                                                                                            113
Vandenberg, SLC‑4E                                                                                                                                                                 76
Kennedy, LC‑39A                                                                                                                                                                    60
Vandenberg, SLC-4E                                                                                                                                                     

## 8) Persist to CSV & SQLite

In [10]:

out_dir = Path("data"); out_dir.mkdir(parents=True, exist_ok=True)
csv_path = out_dir / "wikipedia_spacex_launches_clean.csv"
data.to_csv(csv_path, index=False)
print("Saved CSV ->", csv_path.resolve())

conn = sqlite3.connect("spacex_wiki.db")
data.to_sql("wiki_launches", conn, if_exists="replace", index=False)
conn.close()
print("Saved table 'wiki_launches' to spacex_wiki.db")

Saved CSV -> C:\Users\USER\Downloads\data\wikipedia_spacex_launches_clean.csv
Saved table 'wiki_launches' to spacex_wiki.db


## 9) Outcome previews

In [11]:

display(data.head(20))
display(data.describe(include="all").T.head(20))

Unnamed: 0,flightnumber,date,launch_site,payload_mass,orbit,booster_version,mission_outcome,source_table_index
0,437.0,2025-02-15 06:14:00+00:00,"Cape Canaveral, SLC‑40",16500.0,LEO,F9 B5 B1067‑26,Success,2
1,439.0,2025-02-21 15:19:00+00:00,"Cape Canaveral, SLC‑40",17100.0,LEO,F9 B5 B1076‑21,Success,2
2,440.0,2025-02-23 01:38:00+00:00,"Vandenberg, SLC‑4E",16800.0,LEO,F9 B5 B1082‑11,Success,2
3,442.0,2025-02-27 03:34:00+00:00,"Cape Canaveral, SLC‑40",16500.0,LEO,F9 B5 B1092‑1[296],Success,2
4,,2025-08-17 15:44:00+00:00,"Vandenberg, SLC‑4E",,SSO,F9 B5 B1088‑9,,3
5,,2025-08-17 15:44:00+00:00,Launch of 24 Starlink v2 mini satellites to a ...,,Launch of 24 Starlink v2 mini satellites to a ...,Launch of 24 Starlink v2 mini satellites to a ...,,3
6,,2025-08-21 15:44:00+00:00,"Vandenberg, SLC‑4E",,SSO,F9 B5 B1081‑17,,3
7,,2025-08-21 15:44:00+00:00,Launch of 24 Starlink v2 mini satellites to a ...,,Launch of 24 Starlink v2 mini satellites to a ...,Launch of 24 Starlink v2 mini satellites to a ...,,3
8,,2025-08-24 06:45:00+00:00,"Cape Canaveral, SLC‑40",,LEO (ISS),F9 B5,,3
9,,2025-08-24 06:45:00+00:00,Commercial Resupply Services mission to the IS...,,Commercial Resupply Services mission to the IS...,Commercial Resupply Services mission to the IS...,,3


Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
flightnumber,462.0,,,,401.0,286.0,343.25,401.0,458.75,516.0,66.755617
date,45.0,,,,2026-07-19 18:23:07.999999744+00:00,2025-02-15 06:14:00+00:00,2025-08-25 02:05:00+00:00,2027-01-01 00:00:00+00:00,2027-01-01 00:00:00+00:00,2027-08-01 00:00:00+00:00,
launch_site,592.0,232.0,"Cape Canaveral, SLC‑40",113.0,,,,,,,
payload_mass,442.0,,,,6827.386199,0.0,23.0,1500.0,16300.0,17500.0,7752.500761
orbit,591.0,240.0,LEO,183.0,,,,,,,
booster_version,592.0,471.0,F9 B5,40.0,,,,,,,
mission_outcome,470.0,175.0,Success,236.0,,,,,,,
source_table_index,594.0,,,,1.956229,1.0,1.0,2.0,2.0,5.0,1.197849
