
# Data Wrangling — SpaceX Launch Dataset

**Author:** Iliya Pezeshki
**Course:** IBM Data Science Professional Certificate — Capstone  
**Objective:** Load the SpaceX launch dataset, clean types, filter scope (drop Falcon 1), handle missing values, engineer simple features, apply one‑hot encoding to categoricals, and persist clean feature tables for EDA/ML.


In [1]:

import os, requests
import numpy as np
import pandas as pd
from pathlib import Path

# Let pandas render up to 120 columns so wide frames are not elided in cell output.
pd.set_option('display.max_columns', 120)
# Visible confirmation that the import cell ran to completion.
print("Libraries imported.")

Libraries imported.


## 1) Load raw data (prefer local CSV; fallback to SpaceX API)

In [2]:

# Relative path of the cached launch CSV; it is resolved against the current
# working directory when the loader checks for it.
DATA_CSV = Path('data/spacex_launches_clean.csv')

def load_or_fetch(DATA_CSV: Path) -> pd.DataFrame:
    """Return the launch table from a local CSV, or rebuild it from the SpaceX API.

    Parameters
    ----------
    DATA_CSV : Path
        Location of the cached launch CSV. When the file exists it is read
        with ``pd.read_csv`` and returned as-is; no network access happens.

    Returns
    -------
    pd.DataFrame
        On the fallback path: one row per launch, sorted by ``FlightNumber``,
        with the columns ``FlightNumber``, ``MissionName``, ``DateUTC``,
        ``Rocket``, ``LaunchSite``, ``PayloadMass``, ``Orbit``,
        ``LandingType``, ``LandingPad`` and ``Class``. On the CSV path:
        whatever columns the file contains.

    Raises
    ------
    requests.HTTPError
        If any API endpoint responds with an error status (fallback only).
    """
    if DATA_CSV.exists():
        print("Loading local CSV →", DATA_CSV.resolve())
        return pd.read_csv(DATA_CSV)

    print("Local CSV not found. Fallback: fetching from SpaceX API (v4).")
    BASE = "https://api.spacexdata.com/v4"

    def fetch(resource):
        """GET one API collection and return its decoded JSON body."""
        response = requests.get(f"{BASE}/{resource}", timeout=60)
        response.raise_for_status()
        return response.json()

    launches = fetch("launches")
    launchpads = fetch("launchpads")
    rockets = fetch("rockets")
    payloads = fetch("payloads")
    landpads = fetch("landpads")

    # id -> attribute lookup tables used to denormalise each launch record.
    lp_name = {x['id']: x.get('name') for x in launchpads}
    lp_full = {x['id']: x.get('full_name') for x in launchpads}
    rk_name = {x['id']: x.get('name') for x in rockets}
    py_mass = {x['id']: x.get('mass_kg') for x in payloads}
    py_orbit = {x['id']: x.get('orbit') for x in payloads}
    ld_name = {x['id']: x.get('name') for x in landpads}

    def first_or_none(lst):
        return lst[0] if isinstance(lst, list) and len(lst) > 0 else None

    columns = [
        'FlightNumber', 'MissionName', 'DateUTC', 'Rocket', 'LaunchSite',
        'PayloadMass', 'Orbit', 'LandingType', 'LandingPad', 'Class',
    ]

    rows = []
    for z in launches:
        # Only the first core is inspected for the landing outcome.
        core0 = first_or_none(z.get('cores', [])) or {}
        # `or []` also covers a key that is present but null.
        payload_ids = z.get('payloads') or []
        # Skip payloads whose mass/orbit is null: a None inside the list makes
        # np.nansum / float() raise TypeError instead of being ignored.
        masses = [py_mass[pid] for pid in payload_ids if py_mass.get(pid) is not None]
        orbits = [py_orbit[pid] for pid in payload_ids if py_orbit.get(pid) is not None]
        launchpad_id = z.get('launchpad')
        landpad_id = core0.get('landpad')
        rows.append({
            'FlightNumber': z.get('flight_number'),
            'MissionName': z.get('name'),
            'DateUTC': z.get('date_utc'),
            'Rocket': rk_name.get(z.get('rocket')),
            # Prefer the short pad name; fall back to the full name.
            'LaunchSite': lp_name.get(launchpad_id) or lp_full.get(launchpad_id),
            # Total mass over payloads with a known mass; NaN when none is known.
            'PayloadMass': float(np.nansum(masses)) if masses else np.nan,
            # First payload with a known orbit.
            'Orbit': orbits[0] if orbits else None,
            'LandingType': core0.get('landing_type'),
            'LandingPad': ld_name.get(landpad_id) if landpad_id else None,
            # Anything other than an explicit True (False or null) is labelled 0.
            'Class': 1 if core0.get('landing_success') is True else 0,
        })

    # Passing `columns` keeps the schema (and the sort key) even for zero launches.
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values('FlightNumber').reset_index(drop=True)

# Build the working frame: local CSV when present, API fallback otherwise.
df = load_or_fetch(DATA_CSV)
print("Raw shape:", df.shape)
# Last expression of the cell, so the notebook renders the first ten rows.
df.head(10)

Loading local CSV → C:\Users\USER\Downloads\data\spacex_launches_clean.csv
Raw shape: (205, 10)


Unnamed: 0,FlightNumber,MissionName,DateUTC,Rocket,LaunchSite,PayloadMass,Orbit,LandingType,LandingPad,Class
0,1,FalconSat,2006-03-24T22:30:00.000Z,Falcon 1,Kwajalein Atoll,20.0,LEO,,,0
1,2,DemoSat,2007-03-21T01:10:00.000Z,Falcon 1,Kwajalein Atoll,0.0,LEO,,,0
2,3,Trailblazer,2008-08-03T03:34:00.000Z,Falcon 1,Kwajalein Atoll,0.0,LEO,,,0
3,4,RatSat,2008-09-28T23:15:00.000Z,Falcon 1,Kwajalein Atoll,165.0,LEO,,,0
4,5,RazakSat,2009-07-13T03:35:00.000Z,Falcon 1,Kwajalein Atoll,200.0,LEO,,,0
5,6,Falcon 9 Test Flight,2010-06-04T18:45:00.000Z,Falcon 9,CCSFS SLC 40,0.0,LEO,,,0
6,7,COTS 1,2010-12-08T15:43:00.000Z,Falcon 9,CCSFS SLC 40,0.0,LEO,,,0
7,8,COTS 2,2012-05-22T07:44:00.000Z,Falcon 9,CCSFS SLC 40,525.0,LEO,,,0
8,9,CRS-1,2012-10-08T00:35:00.000Z,Falcon 9,CCSFS SLC 40,800.0,ISS,,,0
9,10,CRS-2,2013-03-01T19:10:00.000Z,Falcon 9,CCSFS SLC 40,677.0,ISS,,,0


## 2) Standardize types (datetime, numeric)

In [3]:

# Timestamps become timezone-aware UTC datetimes; unparseable values turn into NaT.
df['DateUTC'] = pd.to_datetime(df['DateUTC'], errors='coerce', utc=True)

# Numeric columns: anything that cannot be parsed is coerced to NaN.
for numeric_col in ('PayloadMass', 'FlightNumber'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

print(df.dtypes.head(10))
# Show the converted columns for a quick visual check.
df[['DateUTC','PayloadMass','FlightNumber']].head()

FlightNumber                  int64
MissionName                  object
DateUTC         datetime64[ns, UTC]
Rocket                       object
LaunchSite                   object
PayloadMass                 float64
Orbit                        object
LandingType                  object
LandingPad                   object
Class                         int64
dtype: object


Unnamed: 0,DateUTC,PayloadMass,FlightNumber
0,2006-03-24 22:30:00+00:00,20.0,1
1,2007-03-21 01:10:00+00:00,0.0,2
2,2008-08-03 03:34:00+00:00,0.0,3
3,2008-09-28 23:15:00+00:00,165.0,4
4,2009-07-13 03:35:00+00:00,200.0,5


## 3) Filter scope

In [4]:

if 'Rocket' not in df.columns:
    print("Column 'Rocket' not found; skipping Falcon 1 filter.")
else:
    before = len(df)
    # Case-insensitive substring match; missing rocket names never match.
    is_falcon1 = df['Rocket'].fillna('').str.contains('Falcon 1', case=False, na=False)
    # .copy() detaches the filtered frame from the original.
    df = df.loc[~is_falcon1].copy()
    after = len(df)
    print(f"Dropped {before - after} Falcon 1 rows (if any). New shape: {df.shape}")

Dropped 5 Falcon 1 rows (if any). New shape: (200, 10)


## 4) Handle missing values

In [5]:

print("Missing before:\n", df.isna().sum())

# Fill gaps in PayloadMass with the column median. The step is skipped when
# every value is missing, because there would be no median to fill with.
payload = df['PayloadMass']
if payload.notna().any():
    median_mass = payload.median()
    df['PayloadMass'] = payload.fillna(median_mass)

# Categorical NaNs are deliberately left untouched here: the later one-hot
# step uses dummy_na=True, which gives missing values their own column.
print("\nMissing after:\n", df.isna().sum())

Missing before:
 FlightNumber     0
MissionName      0
DateUTC          0
Rocket           0
LaunchSite       0
PayloadMass     12
Orbit           13
LandingType     42
LandingPad      49
Class            0
dtype: int64

Missing after:
 FlightNumber     0
MissionName      0
DateUTC          0
Rocket           0
LaunchSite       0
PayloadMass      0
Orbit           13
LandingType     42
LandingPad      49
Class            0
dtype: int64


## 5) Drop columns not used in modeling (MissionName, LandingType)

In [6]:

# Columns excluded from the modeling table; only those actually present are dropped.
candidates = ['MissionName', 'LandingType']
drop_cols = [name for name in candidates if name in df.columns]

# Work on an independent copy so later feature edits leave `df` untouched.
df_model = df.drop(columns=drop_cols).copy()
print("Dropped:", drop_cols)
df_model.head(5)

Dropped: ['MissionName', 'LandingType']


Unnamed: 0,FlightNumber,DateUTC,Rocket,LaunchSite,PayloadMass,Orbit,LandingPad,Class
5,6,2010-06-04 18:45:00+00:00,Falcon 9,CCSFS SLC 40,0.0,LEO,,0
6,7,2010-12-08 15:43:00+00:00,Falcon 9,CCSFS SLC 40,0.0,LEO,,0
7,8,2012-05-22 07:44:00+00:00,Falcon 9,CCSFS SLC 40,525.0,LEO,,0
8,9,2012-10-08 00:35:00+00:00,Falcon 9,CCSFS SLC 40,800.0,ISS,,0
9,10,2013-03-01 19:10:00+00:00,Falcon 9,CCSFS SLC 40,677.0,ISS,,0


## 6) Engineer simple features (Year)

In [7]:

# Calendar year of each launch, taken from the DateUTC datetime column.
# NOTE(review): any NaT in DateUTC would presumably make this column float/NaN — confirm.
df_model['Year'] = df_model['DateUTC'].dt.year
df_model.head(3)

Unnamed: 0,FlightNumber,DateUTC,Rocket,LaunchSite,PayloadMass,Orbit,LandingPad,Class,Year
5,6,2010-06-04 18:45:00+00:00,Falcon 9,CCSFS SLC 40,0.0,LEO,,0,2010
6,7,2010-12-08 15:43:00+00:00,Falcon 9,CCSFS SLC 40,0.0,LEO,,0,2010
7,8,2012-05-22 07:44:00+00:00,Falcon 9,CCSFS SLC 40,525.0,LEO,,0,2012


## 7) One-hot encode categoricals (Orbit, LaunchSite, LandingPad)

In [8]:

# Encode only the categorical columns that are actually present in df_model.
wanted = ('Orbit', 'LaunchSite', 'LandingPad')
categoricals = [col for col in wanted if col in df_model.columns]
print("Categoricals to encode:", categoricals)

# Separate predictors from the label before encoding.
predictors = df_model.drop(columns=['Class'])

# dummy_na=True adds a "<column>_nan" indicator per encoded column, so missing
# categories are preserved instead of collapsing to all-zero rows.
# NOTE(review): columns outside `categoricals` (e.g. Rocket, DateUTC) pass
# through unencoded — confirm downstream models drop or transform them.
features = pd.get_dummies(predictors, columns=categoricals, dummy_na=True)

# Target: landing outcome as an integer label.
y = df_model['Class'].astype(int)

print("Features shape:", features.shape)
print("Target shape:", y.shape)
features.head(5)

Categoricals to encode: ['Orbit', 'LaunchSite', 'LandingPad']
Features shape: (200, 30)
Target shape: (200,)


Unnamed: 0,FlightNumber,DateUTC,Rocket,PayloadMass,Year,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,Orbit_HCO,Orbit_HEO,Orbit_ISS,Orbit_LEO,Orbit_MEO,Orbit_PO,Orbit_SO,Orbit_SSO,Orbit_TLI,Orbit_VLEO,Orbit_nan,LaunchSite_CCSFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LaunchSite_nan,LandingPad_ASOG,LandingPad_JRTI,LandingPad_JRTI-1,LandingPad_LZ-1,LandingPad_LZ-4,LandingPad_OCISLY,LandingPad_nan
5,6,2010-06-04 18:45:00+00:00,Falcon 9,0.0,2010,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
6,7,2010-12-08 15:43:00+00:00,Falcon 9,0.0,2010,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
7,8,2012-05-22 07:44:00+00:00,Falcon 9,525.0,2012,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
8,9,2012-10-08 00:35:00+00:00,Falcon 9,800.0,2012,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
9,10,2013-03-01 19:10:00+00:00,Falcon 9,677.0,2013,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True


## 8) QA checks

In [9]:

# Gather the three sanity metrics first, then report them.
leading_cols = features.columns.tolist()[:15]   # first 15 names only, to keep output short
has_nans = features.isna().any().any()          # True if any cell in any column is missing
class_counts = y.value_counts()                 # rows per target label

print("Final columns:", leading_cols, "...")
print("Any remaining NaNs in features?", has_nans)
print("Class balance:\n", class_counts)

Final columns: ['FlightNumber', 'DateUTC', 'Rocket', 'PayloadMass', 'Year', 'Orbit_ES-L1', 'Orbit_GEO', 'Orbit_GTO', 'Orbit_HCO', 'Orbit_HEO', 'Orbit_ISS', 'Orbit_LEO', 'Orbit_MEO', 'Orbit_PO', 'Orbit_SO'] ...
Any remaining NaNs in features? False
Class balance:
 Class
1    143
0     57
Name: count, dtype: int64


## 9) Persist outputs (CSV)

In [10]:

# Make sure the output directory exists (no error if it already does).
out_dir = Path('data')
out_dir.mkdir(parents=True, exist_ok=True)

X_path, y_path = out_dir / 'features_one_hot.csv', out_dir / 'target_class.csv'

# The index is omitted from both files; the target is written under the header 'Class'.
features.to_csv(X_path, index=False)
y.to_csv(y_path, index=False, header=['Class'])

# Report absolute locations so the files are easy to find.
for label, saved in (("Saved features →", X_path), ("Saved target   →", y_path)):
    print(label, saved.resolve())

Saved features → C:\Users\USER\Downloads\data\features_one_hot.csv
Saved target   → C:\Users\USER\Downloads\data\target_class.csv


## 10) Preview saved outputs

In [11]:

# Reload each saved file and print its shape as a round-trip check.
for saved_csv in ('data/features_one_hot.csv', 'data/target_class.csv'):
    print(pd.read_csv(saved_csv).shape)

(200, 30)
(200, 1)
