# SpaceX API: Fetching the Dataset


In [2]:
# -*- coding: utf-8 -*-
"""
SpaceX Comprehensive Data Scraper for Google Colab
Fetches detailed launch data including payloads, boosters, and pads.
"""

# 1. Install necessary libraries (usually pre-installed in Colab, but good practice)
# Run this cell first.
!pip install requests pandas

import requests
import pandas as pd
import time
import datetime # For formatting date/time

# For saving to Google Drive (optional)
from google.colab import drive
from IPython.display import display # For better DataFrame display

# --- API Endpoints ---
# v5 base URL; in this file it is used only for the past-launches listing.
BASE_URL = "https://api.spacexdata.com/v5"
# v4 base URL; in this file it serves rockets, launchpads, landpads, payloads and cores.
V4_BASE_URL = "https://api.spacexdata.com/v4" # Some endpoints are still v4

# --- Helper function to fetch data from a given URL ---
def fetch_api_data(url, timeout=30):
    """Fetch and decode JSON from an API URL, returning None on any failure.

    Args:
        url: Absolute URL to send a GET request to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The decoded JSON body, or ``None`` if the request failed, the server
        answered with a 4xx/5xx status, or the body was not valid JSON. The
        failure reason is printed rather than raised so that callers can
        treat a missing record as "no data" and carry on.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
    except ValueError as e:
        # Older releases of requests report an undecodable body as a plain
        # ValueError subclass rather than a RequestException, so it has to be
        # caught separately to keep the "return None on failure" contract.
        print(f"Error decoding JSON from {url}: {e}")
    return None

# --- Main Data Fetching Function ---
def get_spacex_comprehensive_data():
    """
    Fetch every past SpaceX launch and flatten it into one row per launch.

    Rockets, launchpads and landing pads are downloaded once up front and
    joined in memory. Payloads and cores are fetched by ID while iterating
    over the launches; core lookups are cached for the duration of the call
    because the same booster appears on many launches.

    Returns:
        A pandas DataFrame with one row per launch and the columns
        'Flight No.', 'FlightNumber', 'Date', 'Time', 'Launchoutcome',
        'Outcome', 'BoosterVersion', 'Version Booster', 'Launch site',
        'LaunchSite', 'Longitude', 'Latitude', 'Payload', 'PayloadMass',
        'Orbit', 'Customer', 'Serial', 'Flights', 'GridFins', 'Reused',
        'Legs', 'Block', 'ReusedCount', 'Booster landing', 'LandingPad'.
        An empty DataFrame is returned when the reference data or the launch
        list cannot be fetched.
    """
    print("Starting data fetching process...")

    # 1. Pre-fetch reference data (Rockets, Launchpads, Landing Pads)
    # This reduces redundant API calls significantly.
    print("Pre-fetching reference data (rockets, launchpads, landingpads)...")

    rockets_data = fetch_api_data(f"{V4_BASE_URL}/rockets")
    rockets_lookup = {rocket['id']: rocket for rocket in rockets_data} if rockets_data else {}

    launchpads_data = fetch_api_data(f"{V4_BASE_URL}/launchpads")
    launchpads_lookup = {lp['id']: lp for lp in launchpads_data} if launchpads_data else {}

    landingpads_data = fetch_api_data(f"{V4_BASE_URL}/landpads")
    landingpads_lookup = {lp['id']: lp for lp in landingpads_data} if landingpads_data else {}

    if not (rockets_lookup and launchpads_lookup and landingpads_lookup):
        print("Failed to pre-fetch all reference data. Exiting.")
        return pd.DataFrame()

    print("Reference data pre-fetched successfully.")

    # 2. Fetch all past launches
    launches_url = f"{BASE_URL}/launches/past"
    all_launches = fetch_api_data(launches_url)

    if not all_launches:
        print("Failed to fetch launch data. Exiting.")
        return pd.DataFrame()

    print(f"Successfully fetched {len(all_launches)} past launches. Processing details...")

    launch_records = []
    # Maps core ID -> human-readable serial so a reused booster costs one
    # HTTP request in total instead of one per launch it flew on.
    core_serial_cache = {}

    for processed_count, launch in enumerate(all_launches, start=1):
        rocket_info = rockets_lookup.get(launch.get('rocket'), {})
        launchpad_info = launchpads_lookup.get(launch.get('launchpad'), {})

        # Several columns are intentional duplicates under alternative names;
        # the insertion order here defines the DataFrame's column order.
        record = {
            'Flight No.': launch.get('flight_number'),
            'FlightNumber': launch.get('flight_number'),
            'Date': launch.get('date_utc'),
            'Time': _extract_utc_time(launch.get('date_utc')),
            'Launchoutcome': launch.get('success'),
            'Outcome': launch.get('success'),
            'BoosterVersion': rocket_info.get('name'),
            'Version Booster': rocket_info.get('name'),
            'Launch site': launchpad_info.get('full_name'),
            'LaunchSite': launchpad_info.get('full_name'),
            'Longitude': launchpad_info.get('longitude'),
            'Latitude': launchpad_info.get('latitude'),
        }
        record.update(_summarize_payloads(launch.get('payloads')))
        record.update(
            _describe_first_core(launch.get('cores'), landingpads_lookup, core_serial_cache)
        )

        launch_records.append(record)
        if processed_count % 50 == 0:
            print(f"Processed {processed_count}/{len(all_launches)} launches...")

    print(f"Finished processing {len(all_launches)} launches.")
    return pd.DataFrame(launch_records)


def _extract_utc_time(date_utc):
    """Return the HH:MM:SS part of an ISO-8601 timestamp string, or None if it cannot be parsed."""
    # A launch without 'date_utc' yields None here; guard explicitly so a
    # missing date produces an empty cell instead of an AttributeError.
    if not isinstance(date_utc, str):
        return None
    try:
        # fromisoformat() on older Python versions rejects a trailing 'Z',
        # so spell the UTC offset out explicitly.
        parsed = datetime.datetime.fromisoformat(date_utc.replace('Z', '+00:00'))
    except ValueError:
        return None
    return parsed.strftime('%H:%M:%S')


def _summarize_payloads(payload_ids):
    """Fetch each payload by ID and fold them into the Payload/PayloadMass/Orbit/Customer columns."""
    summary = {'Payload': None, 'PayloadMass': None, 'Orbit': None, 'Customer': None}
    if not payload_ids:
        return summary

    payload_names = []
    orbits = set()
    customers = set()
    total_payload_mass_kg = 0

    for p_id in payload_ids:
        payload_details = fetch_api_data(f"{V4_BASE_URL}/payloads/{p_id}")
        time.sleep(0.02)  # Small delay for each payload API call

        if not payload_details:
            continue  # A failed lookup simply contributes nothing to the totals

        total_payload_mass_kg += payload_details.get('mass_kg') or 0
        if payload_details.get('orbit'):
            orbits.add(payload_details['orbit'])
        customers.update(payload_details.get('customers') or [])
        if payload_details.get('name'):
            payload_names.append(payload_details['name'])

    if payload_names:
        summary['Payload'] = ", ".join(payload_names)
    if total_payload_mass_kg > 0:
        # A total of zero means no payload reported a mass; keep the cell empty.
        summary['PayloadMass'] = total_payload_mass_kg
    if orbits:
        summary['Orbit'] = ", ".join(sorted(orbits))
    if customers:
        summary['Customer'] = ", ".join(sorted(customers))
    return summary


def _resolve_core_serial(core_id, serial_cache):
    """Return the human-readable serial for a core ID, falling back to the ID itself."""
    if not core_id:
        return core_id

    if core_id not in serial_cache:
        core_details = fetch_api_data(f"{V4_BASE_URL}/cores/{core_id}")
        time.sleep(0.02)  # Small delay
        if not core_details:
            # Do not cache failures so a later launch can retry the lookup.
            return core_id
        serial_cache[core_id] = core_details.get('serial')

    return serial_cache[core_id] or core_id


def _describe_first_core(cores, landingpads_lookup, serial_cache):
    """Build the booster columns from the first core of a launch (the main booster)."""
    if not cores:
        return {
            'Serial': None,
            'Flights': None,
            'GridFins': None,
            'Reused': None,
            'Legs': None,
            'Block': None,
            'ReusedCount': None,
            'Booster landing': "N/A",
            'LandingPad': None,
        }

    first_core = cores[0]

    booster_landing_status = "No Attempt"
    if first_core.get('landing_attempt') is True:
        booster_landing_status = (
            "Success" if first_core.get('landing_success') is True else "Failure"
        )

    landing_pad_info = landingpads_lookup.get(first_core.get('landpad'), {})

    return {
        'Serial': _resolve_core_serial(first_core.get('core'), serial_cache),
        'Flights': first_core.get('flights'),
        'GridFins': first_core.get('gridfins'),
        'Reused': first_core.get('reused'),
        'Legs': first_core.get('legs'),
        'Block': first_core.get('block'),
        # The per-launch 'flights' value is used as a proxy for how many
        # times this specific core has flown.
        'ReusedCount': first_core.get('flights'),
        'Booster landing': booster_landing_status,
        'LandingPad': landing_pad_info.get('full_name'),
    }

if __name__ == "__main__":
    spacex_df = get_spacex_comprehensive_data()

    if spacex_df.empty:
        print("Failed to retrieve SpaceX comprehensive launch data.")
    else:
        # --- Preview of the scraped table ---
        print("\nSpaceX Comprehensive Launch Data (first 5 rows):")
        display(spacex_df.head())  # display() renders the frame as a rich table in Colab

        print(
            f"\nTotal launches fetched and processed: {len(spacex_df)}",
            "\nAll columns available:",
            spacex_df.columns.tolist(),
            sep="\n",
        )

        # --- Colab specific: persisting the DataFrame ---

        # Option 1: CSV in the Colab runtime's temporary storage.
        # The file disappears once the runtime disconnects.
        csv_filename = "spacex_comprehensive_launches.csv"
        spacex_df.to_csv(csv_filename, index=False)
        print(
            f"\nData saved to {csv_filename} in Colab environment.",
            f"You can download this file from the 'Files' tab (left sidebar) or using: from google.colab import files; files.download('{csv_filename}')",
            sep="\n",
        )

        # Option 2: CSV on Google Drive (persistent storage).
        # The first mount in a session asks the user to authenticate.
        try:
            print("\nAttempting to mount Google Drive for persistent storage...")
            drive.mount('/content/drive', force_remount=True)  # force_remount=True can help if issues persist
            drive_path = '/content/drive/MyDrive/spacex_data_comprehensive/'
            import os
            # Create the target folder on Drive when it is not there yet
            os.makedirs(drive_path, exist_ok=True)
            drive_csv_path = os.path.join(drive_path, csv_filename)
            spacex_df.to_csv(drive_csv_path, index=False)
            print(f"Data saved to Google Drive at: {drive_csv_path}")
        except Exception as e:
            print(
                f"\nCould not mount Google Drive or save to Drive: {e}",
                "To save to Drive, you need to authenticate your Google account (follow prompts).",
                sep="\n",
            )

Starting data fetching process...
Pre-fetching reference data (rockets, launchpads, landingpads)...
Reference data pre-fetched successfully.
Successfully fetched 187 past launches. Processing details...
Processed 50/187 launches...
Processed 100/187 launches...
Processed 150/187 launches...
Finished processing 187 launches.

SpaceX Comprehensive Launch Data (first 5 rows):


Unnamed: 0,Flight No.,FlightNumber,Date,Time,Launchoutcome,Outcome,BoosterVersion,Version Booster,Launch site,LaunchSite,...,Customer,Serial,Flights,GridFins,Reused,Legs,Block,ReusedCount,Booster landing,LandingPad
0,1,1,2006-03-24T22:30:00.000Z,22:30:00,False,False,Falcon 1,Falcon 1,Kwajalein Atoll Omelek Island,Kwajalein Atoll Omelek Island,...,DARPA,Merlin1A,,False,False,False,,,No Attempt,
1,2,2,2007-03-21T01:10:00.000Z,01:10:00,False,False,Falcon 1,Falcon 1,Kwajalein Atoll Omelek Island,Kwajalein Atoll Omelek Island,...,DARPA,Merlin2A,,False,False,False,,,No Attempt,
2,3,3,2008-08-03T03:34:00.000Z,03:34:00,False,False,Falcon 1,Falcon 1,Kwajalein Atoll Omelek Island,Kwajalein Atoll Omelek Island,...,"NASA, ORS",Merlin1C,,False,False,False,,,No Attempt,
3,4,4,2008-09-28T23:15:00.000Z,23:15:00,True,True,Falcon 1,Falcon 1,Kwajalein Atoll Omelek Island,Kwajalein Atoll Omelek Island,...,SpaceX,Merlin2C,,False,False,False,,,No Attempt,
4,5,5,2009-07-13T03:35:00.000Z,03:35:00,True,True,Falcon 1,Falcon 1,Kwajalein Atoll Omelek Island,Kwajalein Atoll Omelek Island,...,ATSB,Merlin3C,,False,False,False,,,No Attempt,



Total launches fetched and processed: 187

All columns available:
['Flight No.', 'FlightNumber', 'Date', 'Time', 'Launchoutcome', 'Outcome', 'BoosterVersion', 'Version Booster', 'Launch site', 'LaunchSite', 'Longitude', 'Latitude', 'Payload', 'PayloadMass', 'Orbit', 'Customer', 'Serial', 'Flights', 'GridFins', 'Reused', 'Legs', 'Block', 'ReusedCount', 'Booster landing', 'LandingPad']

Data saved to spacex_comprehensive_launches.csv in Colab environment.
You can download this file from the 'Files' tab (left sidebar) or using: from google.colab import files; files.download('spacex_comprehensive_launches.csv')

Attempting to mount Google Drive for persistent storage...
Mounted at /content/drive
Data saved to Google Drive at: /content/drive/MyDrive/spacex_data_comprehensive/spacex_comprehensive_launches.csv


In [3]:
# Collect every column name of the scraped frame into a plain list and show it.
column_list = list(spacex_df.columns)
print(column_list)

['Flight No.', 'FlightNumber', 'Date', 'Time', 'Launchoutcome', 'Outcome', 'BoosterVersion', 'Version Booster', 'Launch site', 'LaunchSite', 'Longitude', 'Latitude', 'Payload', 'PayloadMass', 'Orbit', 'Customer', 'Serial', 'Flights', 'GridFins', 'Reused', 'Legs', 'Block', 'ReusedCount', 'Booster landing', 'LandingPad']
