
# 04 — EDA with SQL (SQLite)

We load `./data/launches_clean.csv` into an on-disk SQLite database and answer the required questions.
Each subsection shows the **SQL query** and the **result table**. Results are also exported to `./artifacts`.

**Inputs**
- `data/launches_clean.csv`

**Artifacts produced**
- `artifacts/spacex_capstone.sqlite` (SQLite DB with table `launches`)
- `artifacts/sql_07_unique_launch_sites.csv`
- `artifacts/sql_08_cca5_records.csv`
- `artifacts/sql_09_total_payload_nasa.csv`
- `artifacts/sql_10_avg_payload_f9v11.csv`
- `artifacts/sql_11_first_success_groundpad.csv`
- `artifacts/sql_12_drone_success_4k_6k.csv`
- `artifacts/sql_13_success_vs_failure_counts.csv`
- `artifacts/sql_14_max_payload_boosters.csv`
- `artifacts/sql_15_failed_drone_2015.csv`
- `artifacts/sql_16_rank_landing_outcomes_2010_2017.csv`


## Setup & Load to SQLite

In [4]:

# If needed:
# !pip install pandas

import pandas as pd, sqlite3, os
from pathlib import Path

DATA_DIR = Path("./data")
ARTIFACTS_DIR = Path("./artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)

CSV_PATH = DATA_DIR / "launches_clean.csv"
assert CSV_PATH.exists(), "Missing data/launches_clean.csv — run previous notebooks."

# Read CSV
df = pd.read_csv(CSV_PATH, parse_dates=["date_utc"], dtype={"flight_number":"Int64","year":"Int64"})
# Normalize booleans to integers for SQLite-friendly querying
if 'is_nasa' in df.columns:
    df['is_nasa'] = df['is_nasa'].fillna(False).astype(bool).astype(int)
if 'launch_success' in df.columns:
    # In case it's 'True'/'False' strings from CSV; coerce to 0/1
    ls = df['launch_success'].map({True:1, False:0, "True":1, "False":0, 1:1, 0:0})
    df['launch_success'] = ls.fillna(0).astype(int)

# Write to SQLite
DB_PATH = ARTIFACTS_DIR / "spacex_capstone.sqlite"
conn = sqlite3.connect(DB_PATH)
df.to_sql("launches", conn, if_exists="replace", index=False)

print("SQLite DB:", DB_PATH.resolve())
print("Rows in 'launches':", pd.read_sql_query("SELECT COUNT(*) AS n FROM launches", conn))
df.head(3)


SQLite DB: /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/spacex_capstone.sqlite
Rows in 'launches':      n
0  192


Unnamed: 0,flight_number,date_utc,year,date,launch_site,site_region,site_locality,site_lat,site_lon,rocket_name,...,launch_success,landing_outcome,landing_outcome_kind,payload_id,payload_name,payload_mass_kg,orbit,customers,nationalities,is_nasa
0,6,2010-06-04 18:45:00+00:00,2010,2010-06-04,CCSFS SLC 40,Florida,Cape Canaveral,28.561857,-80.577366,Falcon 9,...,1,No attempt,no_attempt,5eb0e4b7b6c3bb0006eeb1e7,Dragon Qualification Unit,,LEO,SpaceX,United States,0
1,7,2010-12-08 15:43:00+00:00,2010,2010-12-08,CCSFS SLC 40,Florida,Cape Canaveral,28.561857,-80.577366,Falcon 9,...,1,No attempt,no_attempt,5eb0e4b9b6c3bb0006eeb1e8,COTS Demo Flight 1,,LEO,NASA(COTS),United States,1
2,7,2010-12-08 15:43:00+00:00,2010,2010-12-08,CCSFS SLC 40,Florida,Cape Canaveral,28.561857,-80.577366,Falcon 9,...,1,No attempt,no_attempt,5eb0e4b9b6c3bb0006eeb1e9,Cubesats,,LEO,NRO,,0


### Helper: run SQL & save result

In [7]:

from pandas import DataFrame

def run_sql(sql: str, params: dict | tuple | None = None, save_as: str | None = None) -> DataFrame:
    print("SQL:\n" + sql.strip())
    res = pd.read_sql_query(sql, conn, params=params)
    if save_as:
        out_path = ARTIFACTS_DIR / save_as
        res.to_csv(out_path, index=False)
        print("Saved ->", out_path.resolve())
    return res


## (07) Find all unique launch site names — query + result

In [10]:

sql_07 = '''
SELECT DISTINCT launch_site
FROM launches
WHERE launch_site IS NOT NULL
ORDER BY launch_site;
'''
run_sql(sql_07, save_as="sql_07_unique_launch_sites.csv")


SQL:
SELECT DISTINCT launch_site
FROM launches
WHERE launch_site IS NOT NULL
ORDER BY launch_site;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_07_unique_launch_sites.csv


Unnamed: 0,launch_site
0,CCSFS SLC 40
1,KSC LC 39A
2,VAFB SLC 4E


## (08) Find 5 records where the launch site begins with `CCA` — query + result

In [13]:

sql_08 = '''
SELECT flight_number, date(substr(date_utc,1,10)) AS date_utc, launch_site
FROM launches
WHERE launch_site LIKE 'CCA%%'
ORDER BY flight_number
LIMIT 5;
'''
run_sql(sql_08, save_as="sql_08_cca5_records.csv")


SQL:
SELECT flight_number, date(substr(date_utc,1,10)) AS date_utc, launch_site
FROM launches
WHERE launch_site LIKE 'CCA%%'
ORDER BY flight_number
LIMIT 5;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_08_cca5_records.csv


Unnamed: 0,flight_number,date_utc,launch_site


## (09) Total payload carried by NASA boosters — query + result

In [16]:

# Using the engineered `is_nasa` flag from wrangling (1 for NASA payload customers)
sql_09 = '''
SELECT SUM(payload_mass_kg) AS total_payload_mass_kg
FROM launches
WHERE is_nasa = 1;
'''
run_sql(sql_09, save_as="sql_09_total_payload_nasa.csv")


SQL:
SELECT SUM(payload_mass_kg) AS total_payload_mass_kg
FROM launches
WHERE is_nasa = 1;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_09_total_payload_nasa.csv


Unnamed: 0,total_payload_mass_kg
0,93695.7


## (10) Average payload mass for booster version `F9 v1.1` — query + result

In [19]:

sql_10 = '''
SELECT AVG(payload_mass_kg) AS avg_payload_mass_f9_v11
FROM launches
WHERE booster_version = 'F9 v1.1'
  AND payload_mass_kg IS NOT NULL;
'''
run_sql(sql_10, save_as="sql_10_avg_payload_f9v11.csv")


SQL:
SELECT AVG(payload_mass_kg) AS avg_payload_mass_f9_v11
FROM launches
WHERE booster_version = 'F9 v1.1'
  AND payload_mass_kg IS NOT NULL;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_10_avg_payload_f9v11.csv


Unnamed: 0,avg_payload_mass_f9_v11
0,2532.8


## (11) Date of first successful ground pad landing — query + result

In [22]:

sql_11 = '''
SELECT date(substr(date_utc,1,10)) AS first_success_groundpad_date
FROM launches
WHERE landing_outcome LIKE 'Success (ground pad)%'
ORDER BY date(substr(date_utc,1,10)) ASC
LIMIT 1;
'''
run_sql(sql_11, save_as="sql_11_first_success_groundpad.csv")


SQL:
SELECT date(substr(date_utc,1,10)) AS first_success_groundpad_date
FROM launches
WHERE landing_outcome LIKE 'Success (ground pad)%'
ORDER BY date(substr(date_utc,1,10)) ASC
LIMIT 1;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_11_first_success_groundpad.csv


Unnamed: 0,first_success_groundpad_date
0,2015-12-22


## (12) Boosters with successful drone ship landing & payload mass 4000–6000 kg — query + result

In [25]:

sql_12 = '''
SELECT DISTINCT core_serial,
       flight_number,
       payload_name,
       payload_mass_kg,
       launch_site,
       date(substr(date_utc,1,10)) AS date_utc
FROM launches
WHERE landing_outcome LIKE 'Success (drone ship)%'
  AND payload_mass_kg BETWEEN 4000 AND 6000
ORDER BY payload_mass_kg DESC, flight_number ASC;
'''
run_sql(sql_12, save_as="sql_12_drone_success_4k_6k.csv")


SQL:
SELECT DISTINCT core_serial,
       flight_number,
       payload_name,
       payload_mass_kg,
       launch_site,
       date(substr(date_utc,1,10)) AS date_utc
FROM launches
WHERE landing_outcome LIKE 'Success (drone ship)%'
  AND payload_mass_kg BETWEEN 4000 AND 6000
ORDER BY payload_mass_kg DESC, flight_number ASC;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_12_drone_success_4k_6k.csv


Unnamed: 0,core_serial,flight_number,payload_name,payload_mass_kg,launch_site,date_utc
0,B1046,67,Telkom-4,5800.0,CCSFS SLC 40,2018-08-07
1,B1021,38,SES-10,5300.0,KSC LC 39A,2017-03-30
2,B1031,49,SES-11 / Echostar 105,5200.0,KSC LC 39A,2017-10-11
3,B1048,75,Nusantara Satu (PSN-6),5000.0,CCSFS SLC 40,2019-02-22
4,B1059,85,CRS-19,5000.0,CCSFS SLC 40,2019-12-05
5,B1022,29,JCSAT-2B,4696.0,CCSFS SLC 40,2016-05-06
6,B1026,33,JCSAT-16,4600.0,CCSFS SLC 40,2016-08-14
7,B1046,71,SSO-A,4000.0,VAFB SLC 4E,2018-12-03


## (13) Count total successful vs failed **mission** outcomes — query + result

In [28]:

# Deduplicate at the flight level to avoid counting multiple payload rows
sql_13 = '''
WITH launch_outcomes AS (
    SELECT DISTINCT flight_number, launch_success
    FROM launches
    WHERE launch_success IS NOT NULL
)
SELECT CASE WHEN launch_success = 1 THEN 'Success'
            WHEN launch_success = 0 THEN 'Failure'
            ELSE 'Unknown' END AS mission_outcome,
       COUNT(*) AS count
FROM launch_outcomes
GROUP BY mission_outcome
ORDER BY count DESC;
'''
run_sql(sql_13, save_as="sql_13_success_vs_failure_counts.csv")


SQL:
WITH launch_outcomes AS (
    SELECT DISTINCT flight_number, launch_success
    FROM launches
    WHERE launch_success IS NOT NULL
)
SELECT CASE WHEN launch_success = 1 THEN 'Success'
            WHEN launch_success = 0 THEN 'Failure'
            ELSE 'Unknown' END AS mission_outcome,
       COUNT(*) AS count
FROM launch_outcomes
GROUP BY mission_outcome
ORDER BY count DESC;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_13_success_vs_failure_counts.csv


Unnamed: 0,mission_outcome,count
0,Success,176
1,Failure,3


## (14) Booster(s) carrying maximum payload mass — query + result

In [31]:

sql_14 = '''
WITH mx AS (SELECT MAX(payload_mass_kg) AS max_mass FROM launches)
SELECT core_serial, flight_number, payload_name, payload_mass_kg,
       launch_site, date(substr(date_utc,1,10)) AS date_utc
FROM launches, mx
WHERE payload_mass_kg = mx.max_mass;
'''
run_sql(sql_14, save_as="sql_14_max_payload_boosters.csv")


SQL:
WITH mx AS (SELECT MAX(payload_mass_kg) AS max_mass FROM launches)
SELECT core_serial, flight_number, payload_name, payload_mass_kg,
       launch_site, date(substr(date_utc,1,10)) AS date_utc
FROM launches, mx
WHERE payload_mass_kg = mx.max_mass;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_14_max_payload_boosters.csv


Unnamed: 0,core_serial,flight_number,payload_name,payload_mass_kg,launch_site,date_utc
0,B1048,84,Starlink-1,15600.0,CCSFS SLC 40,2019-11-11
1,B1049,87,Starlink-2,15600.0,CCSFS SLC 40,2020-01-07
2,B1051,89,Starlink-3,15600.0,CCSFS SLC 40,2020-01-29
3,B1056,90,Starlink-4,15600.0,CCSFS SLC 40,2020-02-17
4,B1048,92,Starlink-5,15600.0,KSC LC 39A,2020-03-18
5,B1051,93,Starlink-6,15600.0,KSC LC 39A,2020-04-22
6,B1049,95,Starlink-7,15600.0,CCSFS SLC 40,2020-06-04
7,B1059,96,Starlink-8,15600.0,CCSFS SLC 40,2020-06-13
8,B1051,99,Starlink-9,15600.0,KSC LC 39A,2020-08-07
9,B1049,100,Starlink-10,15600.0,CCSFS SLC 40,2020-08-18


## (15) Failed drone ship landings in **2015** with booster versions & launch sites — query + result

In [34]:

sql_15 = '''
SELECT flight_number,
       date(substr(date_utc,1,10)) AS date_utc,
       booster_version,
       core_serial,
       launch_site
FROM launches
WHERE landing_outcome LIKE 'Failure (drone ship)%'
  AND substr(date_utc,1,4) = '2015'
ORDER BY flight_number;
'''
run_sql(sql_15, save_as="sql_15_failed_drone_2015.csv")


SQL:
SELECT flight_number,
       date(substr(date_utc,1,10)) AS date_utc,
       booster_version,
       core_serial,
       launch_site
FROM launches
WHERE landing_outcome LIKE 'Failure (drone ship)%'
  AND substr(date_utc,1,4) = '2015'
ORDER BY flight_number;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_15_failed_drone_2015.csv


Unnamed: 0,flight_number,date_utc,booster_version,core_serial,launch_site
0,19,2015-01-10,F9 v1.1,B1012,CCSFS SLC 40
1,22,2015-04-14,F9 v1.1,B1015,CCSFS SLC 40


## (16) Rank landing outcomes between **2010-06-04** and **2017-03-20** — query + result

In [37]:

sql_16 = '''
SELECT landing_outcome,
       COUNT(*) AS count
FROM launches
WHERE landing_outcome IS NOT NULL
  AND substr(date_utc,1,10) BETWEEN '2010-06-04' AND '2017-03-20'
GROUP BY landing_outcome
ORDER BY count DESC;
'''
run_sql(sql_16, save_as="sql_16_rank_landing_outcomes_2010_2017.csv")


SQL:
SELECT landing_outcome,
       COUNT(*) AS count
FROM launches
WHERE landing_outcome IS NOT NULL
  AND substr(date_utc,1,10) BETWEEN '2010-06-04' AND '2017-03-20'
GROUP BY landing_outcome
ORDER BY count DESC;
Saved -> /Users/johnpaulsandiego/Desktop/kData/data-science-capstone/artifacts/sql_16_rank_landing_outcomes_2010_2017.csv


Unnamed: 0,landing_outcome,count
0,No attempt,15
1,Failure (drone ship),6
2,Success (drone ship),5
3,Success (ground pad),3
4,Success,3
5,Unknown,2
6,Failure,2


### Close DB (optional)

In [40]:

conn.close()
print("Closed:", str(DB_PATH))


Closed: artifacts/spacex_capstone.sqlite
