# 1. Data collection

Optimized code from SpaceX Capstone, module 1.

Highlights:

- The functions that query the _spacex_ API cache their results, so fewer calls are made to the server.
- The collected data is saved to a `.csv` file for later use.


## Imports


In [16]:
import requests
import pandas as pd
import datetime
from pathlib import Path

import helpers as hlp

## Setup


In [17]:
# Do not truncate the columns or the cell contents when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Per-resource endpoints: the helper functions below append a resource id to these.
ROCKETS_URL = "https://api.spacexdata.com/v4/rockets/"
LAUNCH_SITE_URL = "https://api.spacexdata.com/v4/launchpads/"
PAYLOADS_URL = "https://api.spacexdata.com/v4/payloads/"
CORES_URL = "https://api.spacexdata.com/v4/cores/"

# Endpoint requested once to retrieve the past launches.
SPACEX_URL = "https://api.spacexdata.com/v4/launches/past"

# Destination of the collected dataset, under the data directory exposed by `helpers`.
OUTPUT_FILE = hlp.DATA_DIR / Path("01_collected_data.csv")


## Get the data


In [18]:
# Request the past-launch data.
# The timeout keeps the notebook from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of normalizing an error
# payload into a meaningless DataFrame.
response = requests.get(SPACEX_URL, timeout=30)
response.raise_for_status()
data = pd.json_normalize(response.json())  # flatten the JSON records into a pandas DataFrame

In [19]:
# Sanity check: number of launches received and the type of the container.
print(len(data), type(data), sep="\n")

187
<class 'pandas.core.frame.DataFrame'>


## Normalize data


In [20]:
# Keep only the features we need, plus the flight number and date_utc.
data = data[["rocket", "payloads", "launchpad", "cores", "flight_number", "date_utc"]]

# Keep only the launches with exactly one core and exactly one payload. This removes
# the Falcon rockets with 2 extra boosters and the rockets carrying several payloads
# (and any launch whose cores/payloads list is empty).
# The final .copy() makes `data` an independent frame, so the column assignments
# below do not write into a slice of another DataFrame (SettingWithCopyWarning).
data = data[data["cores"].map(len) == 1]
data = data[data["payloads"].map(len) == 1].copy()

# cores and payloads are now lists of size 1: replace each list with its single element.
data["cores"] = data["cores"].map(lambda x: x[0])
data["payloads"] = data["payloads"].map(lambda x: x[0])

# Convert date_utc to a datetime and keep only the date part, dropping the time.
data["date"] = pd.to_datetime(data["date_utc"]).dt.date

# Restrict the launches to those on or before November 13th, 2020.
data = data[data["date"] <= datetime.date(2020, 11, 13)]

In [21]:
# Preview the first four normalized launches.
data.head(n=4)

Unnamed: 0,rocket,payloads,launchpad,cores,flight_number,date_utc,date
0,5e9d0d95eda69955f709d1eb,5eb0e4b5b6c3bb0006eeb1e1,5e9e4502f5090995de566f86,"{'core': '5e9e289df35918033d3b2623', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",1,2006-03-24T22:30:00.000Z,2006-03-24
1,5e9d0d95eda69955f709d1eb,5eb0e4b6b6c3bb0006eeb1e2,5e9e4502f5090995de566f86,"{'core': '5e9e289ef35918416a3b2624', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",2,2007-03-21T01:10:00.000Z,2007-03-21
3,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e5,5e9e4502f5090995de566f86,"{'core': '5e9e289ef3591855dc3b2626', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",4,2008-09-28T23:15:00.000Z,2008-09-28
4,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e6,5e9e4502f5090995de566f86,"{'core': '5e9e289ef359184f103b2627', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",5,2009-07-13T03:35:00.000Z,2009-07-13


## Helper functions


In [22]:
def extract_booster_version(data) -> list:
    """Return the rocket name for each entry of ``data["rocket"]``, in order.

    Every rocket id is resolved through the rockets endpoint. Names already
    retrieved are kept in a local cache, so a repeated id does not trigger a
    new request. OPTIMIZED
    """
    names_by_id = {}

    def name_of(rocket_id):
        # Serve from the cache when possible; otherwise query the API once.
        name = names_by_id.get(rocket_id)
        if name is None:
            name = requests.get(ROCKETS_URL + str(rocket_id)).json()["name"]
            names_by_id[rocket_id] = name
        return name

    return [name_of(rocket_id) for rocket_id in data["rocket"]]

In [23]:
def extract_launch_site(data) -> tuple[list, list, list]:
    """Return the longitudes, latitudes and site names for ``data["launchpad"]``.

    Every launchpad id is resolved through the launchpads endpoint. The values
    of an id already retrieved are reused from a local cache instead of being
    requested again. OPTIMIZED
    """
    seen = {}
    longitude, latitude, launch_site = [], [], []
    for launchpad_id in data["launchpad"]:
        cached = seen.get(launchpad_id)
        if cached is None:
            # First time this launchpad appears: fetch it and remember it.
            info = requests.get(LAUNCH_SITE_URL + str(launchpad_id)).json()
            cached = (info["longitude"], info["latitude"], info["name"])
            seen[launchpad_id] = cached
        lon, lat, site = cached
        longitude.append(lon)
        latitude.append(lat)
        launch_site.append(site)
    return longitude, latitude, launch_site

In [24]:
def extract_payload_data(data) -> tuple[list, list]:
    """Return the payload masses and the orbits for ``data["payloads"]``.

    Every payload id is resolved through the payloads endpoint. The values of
    an id already retrieved are reused from a local cache instead of being
    requested again. OPTIMIZED
    """
    known = {}
    records = []
    for payload_id in data["payloads"]:
        record = known.get(payload_id)
        if record is None:
            # Unknown payload: query the API and cache the (mass, orbit) pair.
            body = requests.get(PAYLOADS_URL + str(payload_id)).json()
            record = (body["mass_kg"], body["orbit"])
            known[payload_id] = record
        records.append(record)

    payload_masses = [mass for mass, _ in records]
    orbits = [orbit for _, orbit in records]
    return payload_masses, orbits

In [25]:
def extract_core_data(data):
    """Return the core-related columns for every entry of ``data["cores"]``.

    Each entry is a dict describing the core used by one launch. The block,
    reuse count and serial come from the cores endpoint; the remaining values
    are read directly from the entry. Cores already retrieved are served from
    a local cache, so a core id that appears in several launches is requested
    only once. OPTIMIZED

    Returns:
        A tuple of nine lists, one item per launch, in this order:
        block, reused_count, serial, outcome, flights, grid_fins, reused,
        legs, landing_pad. Block, reused_count and serial are ``None`` for an
        entry without a core id.
    """
    core_cache = {}
    block = []
    reused_count = []
    serial = []
    outcome = []
    flights = []
    grid_fins = []
    reused = []
    legs = []
    landing_pad = []
    for core in data["cores"]:
        core_id = core.get("core", None)
        if core_id is None:
            # No core id on this launch: nothing to look up.
            details = (None, None, None)
        else:
            details = core_cache.get(core_id)
            if details is None:
                response = requests.get(CORES_URL + core_id).json()
                details = (
                    response["block"],
                    response["reuse_count"],
                    response["serial"],
                )
                core_cache[core_id] = details
        core_block, core_reuse_count, core_serial = details
        block.append(core_block)
        reused_count.append(core_reuse_count)
        serial.append(core_serial)
        # The outcome combines the landing success flag and the landing type.
        outcome.append(str(core["landing_success"]) + " " + str(core["landing_type"]))
        flights.append(core["flight"])
        grid_fins.append(core["gridfins"])
        reused.append(core["reused"])
        legs.append(core["legs"])
        landing_pad.append(core["landpad"])
    return (
        block,
        reused_count,
        serial,
        outcome,
        flights,
        grid_fins,
        reused,
        legs,
        landing_pad,
    )

## Get the information into lists

For checking purposes, the size of all our lists should be 94.


In [26]:
booster_version = extract_booster_version(data)
longitude, latitude, launch_site = extract_launch_site(data)

# Each list must hold one item per launch.
print(*map(len, (booster_version, longitude, latitude, launch_site)))

94 94 94 94


In [27]:
payload_mass, orbit = extract_payload_data(data)

# Each list must hold one item per launch.
print(*map(len, (payload_mass, orbit)))

94 94


In [28]:
(
    block,
    reused_count,
    serial,
    outcome,
    flights,
    grid_fins,
    reused,
    legs,
    landing_pad,
) = extract_core_data(data)

# Each list must hold one item per launch.
print(
    *map(
        len,
        (
            block,
            reused_count,
            serial,
            outcome,
            flights,
            grid_fins,
            reused,
            legs,
            landing_pad,
        ),
    )
)

94 94 94 94 94 94 94 94 94


## Create a unified dataframe


In [29]:
# Assemble the final dataset: two columns taken from the normalized launches and
# the lists built from the API lookups above.
data_df = pd.DataFrame(
    {
        # Columns taken directly from the normalized launch data.
        "flight_number": data["flight_number"],
        "date": data["date"],
        # Columns built by the extract_* helpers.
        "booster_version": booster_version,
        "payload_mass": payload_mass,
        "orbit": orbit,
        "launch_site": launch_site,
        "outcome": outcome,
        "flights": flights,
        "grid_fins": grid_fins,
        "reused": reused,
        "legs": legs,
        "landing_pad": landing_pad,
        "block": block,
        "reused_count": reused_count,
        "serial": serial,
        "longitude": longitude,
        "latitude": latitude,
    }
)

## Save the clean data


In [30]:
# Write the dataset with a header row and without the index column,
# overwriting any existing file.
data_df.to_csv(
    OUTPUT_FILE,
    mode="w",
    header=True,
    index=False,
    encoding="utf-8",
)