In [14]:
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')
if not NASA_API_KEY:
    # os.getenv returns None when the variable is unset; without this check the
    # literal text "None" would be sent as the key and the failure would only
    # surface as an HTTP error code.
    raise RuntimeError("NASA_API_KEY is not set; add it to your .env file or environment.")

# Request (5 points)
base_url = "https://api.nasa.gov/DONKI/"
specifier = "CME"
start_date = "2024-01-01"
end_date = "2024-05-01"
api_key = NASA_API_KEY  # Read from the environment above; never hardcode the key here

# Construct the query URL
query_url_CME = f"{base_url}{specifier}?startDate={start_date}&endDate={end_date}&api_key={api_key}"

# Make the GET request and store the JSON data.
# The timeout (seconds) stops the cell from blocking forever if the service
# never answers; requests applies no timeout unless one is given.
cme_response = requests.get(query_url_CME, timeout=30)
if cme_response.status_code == 200:
    cme_json = cme_response.json()
else:
    raise Exception(f"Failed to fetch data: {cme_response.status_code}")

# Preview the first results with json.dumps
print(json.dumps(cme_json[:1], indent=4))

# Convert cme_json to a Pandas DataFrame
cme_df = pd.DataFrame(cme_json)

# Preparation for loop (6 points)
expanded_rows = []  # Collects one record per (CME, linked event) pair

# Inside the cme.index for loop (20 points)
for index, row in cme_df.iterrows():  # Visit every CME record
    # Extract relevant columns
    activityID = row.get('activityID', None)
    startTime = row.get('startTime', None)
    linkedEvents = row.get('linkedEvents', [])

    # linkedEvents can be None or a NaN placeholder for a missing value;
    # neither can be iterated, so anything that is not a list is treated
    # as "no linked events".
    if not isinstance(linkedEvents, list):
        linkedEvents = []

    # Inner loop: emit one flat record per linked event
    for event in linkedEvents:
        expanded_rows.append({
            "activityID": activityID,
            "startTime": startTime,
            "linkedEventActivityID": event.get('activityID', None)
        })

# Create a new DataFrame from expanded_rows.
# Naming the columns explicitly keeps them present even when no CME has a
# linked event, so later column lookups do not fail on an empty frame.
expanded_df = pd.DataFrame(
    expanded_rows,
    columns=["activityID", "startTime", "linkedEventActivityID"],
)

# Function extract_activityID_from_dict (14 points)
def extract_activityID_from_dict(event_dict):
    """
    Extracts the 'activityID' from a dictionary if it exists.
    """
    try:
        return event_dict.get('activityID')
    except AttributeError as e:
        print(f"Error: {e}")
        return None

# Apply the function with lambda: each linked-event ID is wrapped in a dict so
# it can be passed through extract_activityID_from_dict
linked_ids = expanded_df['linkedEventActivityID'].apply(
    lambda linked_id: extract_activityID_from_dict({'activityID': linked_id})
)

# Cleaning (5 points)
expanded_df = (
    expanded_df
    .assign(
        # Store the linked-event ID as a string column
        GST_ActivityID=linked_ids.astype(str),
        # Parse startTime into datetimes; unparseable values become NaT
        startTime_CME=pd.to_datetime(expanded_df['startTime'], errors='coerce'),
    )
    # activityID identifies the CME, so name it accordingly
    .rename(columns={'activityID': 'cmeID'})
    # The raw columns are superseded by GST_ActivityID and startTime_CME
    .drop(columns=['linkedEventActivityID', 'startTime'])
)

# Keep only rows whose linked event ID contains 'GST'
is_gst_link = expanded_df['GST_ActivityID'].str.contains('GST', na=False)
filtered_df = expanded_df[is_gst_link]

# Verify the final DataFrame
print("Final DataFrame:")
print(filtered_df.head())

[
    {
        "activityID": "2024-01-01T17:00:00-CME-001",
        "catalog": "M2M_CATALOG",
        "startTime": "2024-01-01T17:00Z",
        "instruments": [
            {
                "displayName": "SOHO: LASCO/C2"
            },
            {
                "displayName": "SOHO: LASCO/C3"
            }
        ],
        "sourceLocation": "",
        "activeRegionNum": null,
        "note": "Faint CME with the source is likely the minor movement of field lines behind the limb in SE in AIA 171 starting around 2024-01-01T16:30Z. Fully covered by data gap in STEREO A.",
        "submissionTime": "2024-01-02T13:40Z",
        "versionId": 1,
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/28435/-1",
        "cmeAnalyses": [
            {
                "isMostAccurate": true,
                "time21_5": "2024-01-02T01:01Z",
                "latitude": -64.0,
                "longitude": null,
                "halfAngle": 26.0,
                "speed": 416.0,


In [19]:
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')
if not NASA_API_KEY:
    # os.getenv returns None when the variable is unset; without this check the
    # literal text "None" would be sent as the key and the failure would only
    # surface as an HTTP error code.
    raise RuntimeError("NASA_API_KEY is not set; add it to your .env file or environment.")

# Request (5 points)
base_url = "https://api.nasa.gov/DONKI/"
specifier = "GST"
start_date = "2024-01-01"
end_date = "2024-05-01"
api_key = NASA_API_KEY  # Read from the environment above; never hardcode the key here

# Construct the query URL for GST
query_url_GST = f"{base_url}{specifier}?startDate={start_date}&endDate={end_date}&api_key={api_key}"

# Make the GET request to retrieve GST data.
# The timeout (seconds) stops the cell from blocking forever if the service
# never answers; requests applies no timeout unless one is given.
gst_response = requests.get(query_url_GST, timeout=30)
if gst_response.status_code == 200:
    gst_json = gst_response.json()  # Store the JSON data
else:
    raise Exception(f"Failed to fetch GST data: {gst_response.status_code}")

# Preview the first results using json.dumps with indent=4
print("Preview of the first GST result:")
print(json.dumps(gst_json[:1], indent=4))

# Convert gst_json to a Pandas DataFrame
gst_df = pd.DataFrame(gst_json)

# Verify the structure of the DataFrame
print("GST DataFrame:")
print(gst_df.head())

# The linkedEvents column has to exist before it can be expanded
if 'linkedEvents' not in gst_df.columns:
    raise KeyError("'linkedEvents' column is missing from the GST DataFrame.")

# Expand each linkedEvents list into one row per linked event, then renumber
# the rows from zero
gst_df = gst_df.explode('linkedEvents').reset_index(drop=True)

# Remove every row that has a missing value in any column (in place)
gst_df.dropna(inplace=True)

# Verify the transformed DataFrame
print("Expanded and cleaned GST DataFrame:")
print(gst_df.head())

# Define the extract_activityID_from_dict function
def extract_activityID_from_dict(event_dict):
    """
    Extracts the 'activityID' from a dictionary if it exists.
    Handles errors gracefully with a try-except block.
    """
    try:
        return event_dict.get('activityID')
    except AttributeError as e:
        print(f"Error: Expected a dictionary but got {type(event_dict)}. Details: {e}")
        return None

# Apply the function with lambda to extract 'activityID' from the 'linkedEvents' column.
# Entries that are not dicts yield None rather than being passed to the helper.
gst_df['CME_ActivityID'] = gst_df['linkedEvents'].apply(
    lambda x: extract_activityID_from_dict(x) if isinstance(x, dict) else None
)

# Rename activityID column to gstID.
# This is a no-op when the column is already called gstID (as in the previewed
# response); doing it before the check below means 'gstID' is the name to use
# from here on either way.
gst_df.rename(columns={'activityID': 'gstID'}, inplace=True)

# Verify the results
print("GST DataFrame after extracting CME_ActivityID:")
print(gst_df[['gstID', 'CME_ActivityID']].head())

# 1. Convert CME_ActivityID values to strings, leaving missing values as None
gst_df['CME_ActivityID'] = gst_df['CME_ActivityID'].apply(lambda x: str(x) if pd.notna(x) else None)

# 2. Convert startTime to datetime and rename it to startTime_GST
#    (values that cannot be parsed become NaT)
gst_df['startTime'] = pd.to_datetime(gst_df['startTime'], errors='coerce')
gst_df.rename(columns={'startTime': 'startTime_GST'}, inplace=True)

# 3. Filter gst DataFrame to only keep rows where CME_ActivityID contains 'CME'
gst_df = gst_df[gst_df['CME_ActivityID'].str.contains('CME', na=False)]

# Verify the updated DataFrame
print("Updated GST DataFrame:")
print(gst_df.head())


Preview of the first GST result:
[
    {
        "gstID": "2024-03-03T18:00:00-GST-001",
        "startTime": "2024-03-03T18:00Z",
        "allKpIndex": [
            {
                "observedTime": "2024-03-03T21:00Z",
                "kpIndex": 5.67,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/29440/-1",
        "linkedEvents": [
            {
                "activityID": "2024-02-28T17:48:00-CME-001"
            },
            {
                "activityID": "2024-03-03T08:47:00-IPS-001"
            }
        ],
        "submissionTime": "2024-03-03T21:35Z",
        "versionId": 1
    }
]
GST DataFrame:
                         gstID          startTime  \
0  2024-03-03T18:00:00-GST-001  2024-03-03T18:00Z   
1  2024-03-23T21:00:00-GST-001  2024-03-23T21:00Z   
2  2024-03-24T12:00:00-GST-001  2024-03-24T12:00Z   
3  2024-04-19T18:00:00-GST-001  2024-04-19T18:00Z   

                                   

In [20]:
# Pair each GST row with the CME row that refers to it: the storm ID and the
# linked CME ID on the GST side must match the linked GST ID and the CME ID on
# the CME side
merged_df = gst_df.merge(
    filtered_df,
    left_on=['gstID', 'CME_ActivityID'],
    right_on=['GST_ActivityID', 'cmeID'],
)

# Report the row counts of the result and of both inputs
print(f"Number of rows in the merged DataFrame: {len(merged_df)}")
print(f"Number of rows in GST DataFrame: {len(gst_df)}")
print(f"Number of rows in CME DataFrame: {len(filtered_df)}")

# Time from CME start to GST start, expressed in seconds
gst_minus_cme = merged_df['startTime_GST'] - merged_df['startTime_CME']
merged_df['timeDiff'] = gst_minus_cme.dt.total_seconds()

# Summary statistics; the mean is read from describe(), the median is computed directly
time_diff_stats = merged_df['timeDiff'].describe()
mean_time_diff = time_diff_stats.loc['mean']
median_time_diff = merged_df['timeDiff'].median()

print(
    f"Mean time difference: {mean_time_diff} seconds",
    f"Median time difference: {median_time_diff} seconds",
    sep="\n",
)

# Display time difference statistics
print("Time difference statistics:")
print(time_diff_stats)

# Write the merged data to CSV without the index column
merged_df.to_csv('merged_cme_gst_data.csv', index=False)

print("Data exported to 'merged_cme_gst_data.csv'")

Number of rows in the merged DataFrame: 4
Number of rows in GST DataFrame: 4
Number of rows in CME DataFrame: 4
Mean time difference: 244965.0 seconds
Median time difference: 235410.0 seconds
Time difference statistics:
count         4.000000
mean     244965.000000
std      140829.883547
min      123120.000000
25%      124155.000000
50%      235410.000000
75%      356220.000000
max      385920.000000
Name: timeDiff, dtype: float64
Data exported to 'merged_cme_gst_data.csv'
