In [1]:
# --- Step 1: Install necessary libraries (if not already installed in Colab) ---
# The 'requests' library is usually pre-installed in Colab, but it's good practice to include this.
# Run this cell once if you encounter a ModuleNotFoundError for 'requests'.
!pip install requests

# --- Step 2: Import libraries ---
import requests
import json
import time # Import time for potential delays to respect API rate limits

# --- Step 3: Define the scraping functions ---

def get_spacex_data(endpoint="launches", version="v3", limit=None, offset=None, filters=None, pretty=False):
    """
    Fetches data from the SpaceX API.

    Args:
        endpoint (str): The API endpoint (e.g., "launches", "capsules").
        version (str): The API version (e.g., "v3").
        limit (int, optional): Limits the number of results returned.
        offset (int, optional): Offsets or skips results from the beginning.
        filters (list, optional): A list of fields to include in the response (JSON field masking).
                                  Example: ["flight_number", "mission_name", "rocket/rocket_name"]
        pretty (bool, optional): If True, pretty-prints the JSON response.

    Returns:
        list or dict: The JSON data from the API.
    """
    base_url = f"https://api.spacexdata.com/{version}"
    url = f"{base_url}/{endpoint}"

    params = {}
    if limit is not None:
        params['limit'] = limit
    if offset is not None:
        params['offset'] = offset
    if filters:
        params['filter'] = ','.join(filters)
    if pretty:
        params['pretty'] = 'true'

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None

def scrape_all_launches(version="v3"):
    """
    Scrapes all launch data from the SpaceX API, handling pagination.

    Args:
        version (str): The API version (e.g., "v3").

    Returns:
        list: A list containing all launch data.
    """
    all_launches = []
    offset = 0
    limit = 100  # You can adjust this limit based on API limits and performance

    while True:
        print(f"Fetching launches with offset: {offset}")
        launches_batch = get_spacex_data(endpoint="launches", version=version, limit=limit, offset=offset)

        if launches_batch is None:
            print("Failed to retrieve data for this batch. Stopping.")
            break

        if not launches_batch:
            # No more data to fetch (empty list returned)
            print("No more launches to fetch. All data scraped.")
            break

        all_launches.extend(launches_batch)
        offset += limit

        # Optional: Add a small delay to avoid hitting API rate limits too aggressively
        # time.sleep(0.1) # Sleep for 100 milliseconds between requests if needed

    return all_launches

# --- Step 4: Execute the scraping and save data ---

if __name__ == "__main__":
    print("--- Scraping the latest launch data (pretty-printed) ---")
    latest_launch_url = "https://api.spacexdata.com/v3/launches/latest"
    try:
        response_latest = requests.get(latest_launch_url, params={'pretty': 'true'})
        response_latest.raise_for_status()
        latest_launch_data = response_latest.json()
        print(json.dumps(latest_launch_data, indent=2))
    except requests.exceptions.RequestException as e:
        print(f"Error fetching latest launch data: {e}")

    print("\n--- Scraping all launch data (handling pagination) ---")
    all_spacex_launches = scrape_all_launches(version="v3")

    if all_spacex_launches:
        print(f"\nSuccessfully scraped {len(all_spacex_launches)} launches.")
        # Example of how to save the data to a JSON file in Colab's temporary storage
        output_filename = "spacex_launches_data.json"
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(all_spacex_launches, f, indent=2)
        print(f"Data saved to {output_filename}")

        # You can now work with 'all_spacex_launches' which contains all the data
        # For example, print the mission name and flight number of the first 5 launches:
        print("\nFirst 5 scraped launches:")
        for i, launch in enumerate(all_spacex_launches[:5]):
            print(f"  Launch {i+1}: Mission Name: {launch.get('mission_name')}, Flight Number: {launch.get('flight_number')}")
    else:
        print("No launch data was scraped.")

    print("\n--- Example: Fetching upcoming capsules with filters ---")
    upcoming_capsules_filters = ["capsule_serial", "status", "original_launch"]
    upcoming_capsules_data = get_spacex_data(
        endpoint="capsules/upcoming",
        version="v3",
        filters=upcoming_capsules_filters,
        pretty=True
    )
    if upcoming_capsules_data:
        print(json.dumps(upcoming_capsules_data, indent=2))
    else:
        print("Failed to fetch upcoming capsules data.")

# --- Step 5: (Optional) Download the saved file from Colab ---
# Run this cell after the scraping is complete to download the generated JSON file
from google.colab import files
files.download(output_filename)

--- Scraping the latest launch data (pretty-printed) ---
Error fetching latest launch data: 404 Client Error: Not Found for url: https://api.spacexdata.com/v3/launches/latest?pretty=true

--- Scraping all launch data (handling pagination) ---
Fetching launches with offset: 0
Fetching launches with offset: 100
Fetching launches with offset: 200
No more launches to fetch. All data scraped.

Successfully scraped 111 launches.
Data saved to spacex_launches_data.json

First 5 scraped launches:
  Launch 1: Mission Name: FalconSat, Flight Number: 1
  Launch 2: Mission Name: DemoSat, Flight Number: 2
  Launch 3: Mission Name: Trailblazer, Flight Number: 3
  Launch 4: Mission Name: RatSat, Flight Number: 4
  Launch 5: Mission Name: RazakSat, Flight Number: 5

--- Example: Fetching upcoming capsules with filters ---
[
  {
    "capsule_serial": "C202",
    "status": "active",
    "original_launch": null
  },
  {
    "capsule_serial": "C203",
    "status": "active",
    "original_launch": null
  }

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>