# 3. Scrape Information

We'll scrape some information about the Falcon 9 launches from Wikipedia and incorporate it into our data.

- extract information from an HTML table on Wikipedia
- parse the table and convert it to a pandas DataFrame
- save the resulting DataFrame to a `03_scrapped_data.csv` file


In [1]:
import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd
from pathlib import Path

import helpers as hlp

## Setup


In [2]:
# Passing None removes pandas' display limits, so every column and the full
# contents of each cell are shown when a DataFrame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# The URL carries an `oldid` query parameter, i.e. it points at one specific
# revision of the Wikipedia page rather than the live article.
STATIC_URL = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

# Destination of the scraped table, inside the data directory exposed by `hlp`.
OUTPUT_FILE = hlp.DATA_DIR / Path("03_scrapped_data.csv")

# Positions of the <td> cells within a launch row; populate_launch_dict() uses
# them to index the list returned by row.find_all("td").
DATE_AND_TIME_COL = 0
VERSION_BOOSTER_COL = 1
LAUNCH_SITE_COL = 2
PAYLOAD_COL = 3
PAYLOAD_MASS_COL = 4
ORBIT_COL = 5
CUSTOMER_COL = 6
LAUNCH_OUTCOME_COL = 7
BOOSTER_LANDING_COL = 8

## Helper functions


In [3]:
def extract_date_and_time(table_cell):
    """Return the launch date and time found in an HTML table cell.

    The first two text fragments of the cell are taken as the date and the
    time. Both are stripped of surrounding whitespace, and leading/trailing
    commas are removed from the date.

    Input: the table data cell element (must expose ``.strings``)
    Output: a ``(date, time)`` tuple of strings
    Raises: IndexError when the cell holds fewer than two text fragments
    """
    leading_fragments = list(table_cell.strings)[:2]
    date_text = leading_fragments[0].strip().strip(",")
    time_text = leading_fragments[1].strip()
    return date_text, time_text


def extract_booster_version(table_cell):
    """Return the booster version text from an HTML table cell.

    Only the first text fragment of the cell is used, and every character
    outside the ASCII range is dropped from it.

    Input: the table data cell element (must expose ``.strings``)
    Raises: IndexError when the cell holds no text fragment
    """
    fragments = list(table_cell.strings)
    ascii_only = fragments[0].encode("ascii", "ignore")
    return ascii_only.decode()


def extract_launch_site(table_cell):
    """Return the ``.string`` of the launch-site cell's ``a`` element.

    Raises AttributeError when ``table_cell.a`` is None (no link in the cell).
    """
    site_link = table_cell.a
    return site_link.string


def extract_payload(table_cell):
    """Return the ``.string`` of the payload cell's ``a`` element.

    Raises AttributeError when ``table_cell.a`` is None (no link in the cell).
    """
    payload_link = table_cell.a
    return payload_link.string


def extract_payload_mass(table_cell):
    """Return the payload mass in kilograms parsed from an HTML table cell.

    The cell text is Unicode-normalized (NFKD), stripped and lower-cased;
    whatever precedes the first "kg" is then parsed as a number once the
    thousands separators (",") have been removed.

    Input: the table data cell element (must expose ``.text``)
    Output: the mass as a float, or 0 when the cell has no "kg" marker or the
    text in front of it is not a plain number
    """
    text = unicodedata.normalize("NFKD", table_cell.text).strip().lower()

    # partition() reports a missing "kg" through an empty separator. Slicing
    # with the -1 that find() returns in that case would instead cut the last
    # character off the text, so a bare "525" would be parsed as 52.0.
    number_text, unit, _ = text.partition("kg")
    if not unit:
        return 0

    try:
        return float(number_text.replace(",", ""))
    except ValueError:
        # Non-numeric text in front of the unit (e.g. "~9,600"): treated as
        # unknown.
        return 0


def extract_orbit(table_cell):
    """Return the ``.string`` of the orbit cell's ``a`` element.

    Raises AttributeError when ``table_cell.a`` is None (no link in the cell).
    """
    orbit_link = table_cell.a
    return orbit_link.string


def extract_customer(table_cell):
    """Return the ``.string`` of the customer cell's ``a`` element.

    Unlike the other link-based extractors, a cell without a link is
    tolerated: an empty string is returned instead of raising AttributeError.
    """
    try:
        customer_link = table_cell.a
        customer = customer_link.string
    except AttributeError:
        customer = ""
    return customer


def extract_launch_outcome(table_cell) -> str:
    """Return the launch outcome: the cell's first text fragment, trimmed and lower-cased.

    Raises IndexError when the cell holds no text fragment.
    """
    first_fragment = list(table_cell.strings)[0]
    trimmed = first_fragment.strip()
    return trimmed.lower()


def extract_booster_landing(table_cell) -> str:
    """Return the booster landing status: the cell's first text fragment, trimmed and lower-cased.

    Raises IndexError when the cell holds no text fragment.
    """
    first_fragment = list(table_cell.strings)[0]
    trimmed = first_fragment.strip()
    return trimmed.lower()


def normalize_column_name(col_name: str) -> str:
    """Turn a raw column header into a snake_case-style identifier.

    The name is normalized by:
    - converting it to lowercase
    - removing the symbols '(', ')' and '.'
    - joining the remaining whitespace-separated words with underscores
    """
    without_symbols = col_name.lower().translate(str.maketrans("", "", "()."))
    # split() with no argument also discards leading and trailing whitespace.
    words = without_symbols.split()
    return "_".join(words)


def extract_column_from_header(row) -> str | None:
    """This function returns the landing status from the HTML table cell
    Input: the  element of a table data cell extracts extra row"""
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    colunm_name = " ".join(row.contents)

    # Filter the digit and empty names
    if not (colunm_name.strip().isdigit()):
        # colunm_name = colunm_name.strip()
        return hlp.normalize_column_name(colunm_name)
    return None


def extract_column_names_from_table(table) -> list[str]:
    """Return the usable column names found in the ``th`` cells of ``table``.

    Every ``th`` element is passed through extract_column_from_header(), in
    document order; results that are None or empty are discarded.
    """
    # The generator keeps the header cells being processed one at a time, in
    # the order find_all() returns them.
    candidates = (extract_column_from_header(th) for th in table.find_all("th"))
    return [name for name in candidates if name is not None and len(name) > 0]

## Request & Parse the HTML


In [4]:
# Download the pinned page revision. Without a timeout, requests waits
# indefinitely on an unresponsive server, which would leave the cell hanging.
response = requests.get(STATIC_URL, timeout=30)
# Displayed as the cell output so the HTTP status can be checked by eye.
response.status_code

200

In [5]:
# Parse the raw response bytes with the "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")
# Displayed as the cell output: the title of the downloaded page.
soup.title.string

'List of Falcon 9 and Falcon Heavy launches - Wikipedia'

In [6]:
# Collect every <table> element of the page and display how many were found.
html_tables = soup.find_all("table")
len(html_tables)

25

In [7]:
# Position, within html_tables, of the table whose header cells provide the
# column names.
TARGET_TABLE = 2
first_launch_table = html_tables[TARGET_TABLE]

In [8]:
# Start with an explicit flight-number column, then append the names parsed
# from the header cells of the target table.
column_names = ["flight_number"]
column_names += extract_column_names_from_table(first_launch_table)
# Remove the "flight_no" name parsed from the header (list.remove raises
# ValueError if it is absent).
column_names.remove("flight_no")

# Add some extra columns
column_names += ["version_booster", "booster_landing", "date", "time"]

# Displayed as the cell output.
column_names

['flight_number',
 'date_and_time',
 'launch_site',
 'payload',
 'payload_mass',
 'orbit',
 'customer',
 'launch_outcome',
 'version_booster',
 'booster_landing',
 'date',
 'time']

## Create a new dataframe from the html table


In [9]:
def create_dict_from_column_names(column_names: list) -> dict[str, list]:
    """Return a dict mapping each column name to its own new, empty list.

    The combined "date_and_time" column is left out: its content is stored
    under the separate "date" and "time" keys instead. Key order follows
    ``column_names``; repeated names yield a single key. ``column_names``
    itself is not modified.

    Unlike a plain ``del``, the filter does not raise KeyError when
    "date_and_time" is missing from ``column_names``.
    """
    # A fresh list per key: sharing one list object (for instance through
    # dict.fromkeys(names, [])) would make every column alias the same data.
    return {name: [] for name in column_names if name != "date_and_time"}


# Build the empty per-column lists and display the resulting keys.
launch_dict = create_dict_from_column_names(column_names)
launch_dict.keys()

dict_keys(['flight_number', 'launch_site', 'payload', 'payload_mass', 'orbit', 'customer', 'launch_outcome', 'version_booster', 'booster_landing', 'date', 'time'])

In [10]:
def retrieve_tables(table_attrs: dict):
    """Return every ``table`` element of the parsed page matching ``table_attrs``.

    Input: ``table_attrs`` - attribute filter handed to ``soup.find_all``
    Note: reads the module-level ``soup`` object rather than taking the
    document as a parameter.
    """
    matching_tables = soup.find_all("table", table_attrs)
    return matching_tables


# The tables selected here are the ones populate_launch_dict() iterates over.
tables = retrieve_tables(table_attrs={"class": "wikitable plainrowheaders collapsible"})


In [11]:
def populate_launch_dict(column_names) -> dict[str, list]:
    """Collect one value per column for every launch row of the launch tables.

    Iterates the module-level ``tables``. In each table the first ``tr`` is
    skipped, and only rows whose ``th`` cell holds a single string are read;
    that string is the flight number and the row's ``td`` cells supply the
    remaining fields.

    Input: ``column_names`` - names used to create the per-column lists
    Output: dict mapping each column name to its list of values, one entry per
    launch row, in table and row order
    """
    collected = create_dict_from_column_names(column_names)

    for table in tables:
        for row in table.find_all("tr")[1:]:
            header_cell = row.th
            # Rows without a <th>, or whose <th> has no single string, carry
            # no flight number and are ignored.
            if not (header_cell and header_cell.string):
                continue

            flight_number = header_cell.string.strip()
            cells = row.find_all("td")

            # Every field is extracted before anything is appended, so a row
            # that fails to parse leaves no partial entry behind.
            date, time = extract_date_and_time(cells[DATE_AND_TIME_COL])
            record = {
                "flight_number": flight_number,
                "date": date,
                "time": time,
                "version_booster": extract_booster_version(cells[VERSION_BOOSTER_COL]),
                "launch_site": extract_launch_site(cells[LAUNCH_SITE_COL]),
                "payload": extract_payload(cells[PAYLOAD_COL]),
                "payload_mass": extract_payload_mass(cells[PAYLOAD_MASS_COL]),
                "orbit": extract_orbit(cells[ORBIT_COL]),
                "customer": extract_customer(cells[CUSTOMER_COL]),
                "launch_outcome": extract_launch_outcome(cells[LAUNCH_OUTCOME_COL]),
                "booster_landing": extract_booster_landing(cells[BOOSTER_LANDING_COL]),
            }

            for field, value in record.items():
                collected[field].append(value)

    return collected

In [12]:
# Rebind launch_dict to the per-column lists filled from the launch tables.
launch_dict = populate_launch_dict(column_names)

## Save information to file


In [13]:
# Each list is wrapped in a Series before the frame is built; pandas aligns
# Series on their index, so columns of unequal length are padded with NaN.
df = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})
# Displayed as the cell output: the first rows of the frame.
df.head()

Unnamed: 0,flight_number,launch_site,payload,payload_mass,orbit,customer,launch_outcome,version_booster,booster_landing,date,time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0.0,LEO,SpaceX,success,F9 v1.0,failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0.0,LEO,NASA,success,F9 v1.0,failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525.0,LEO,NASA,success,F9 v1.0,no attempt,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,4700.0,LEO,NASA,success,F9 v1.0,no attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,4877.0,LEO,NASA,success,F9 v1.0,no attempt,1 March 2013,15:10


In [14]:
# Overwrite OUTPUT_FILE with the frame as UTF-8 CSV: header row included,
# index column omitted.
df.to_csv(OUTPUT_FILE, encoding="utf-8", mode="w", header=True, index=False)