### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
!pip install python-dotenv
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime

## Load the NASA_API_KEY from the env file
from dotenv import load_dotenv
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


### CME Data

In [2]:
# Set the base URL to NASA's DONKI API:
base_url = "https://kauai.ccmc.gsfc.nasa.gov/DONKI/WS/get/CME?startDate=yyyy-MM-dd&endDate=yyyy-MM-dd"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
query_url = f"{base_url}{CME}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [3]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(query_url)

In [4]:
# Convert cme_json to a Pandas DataFrame
cme_df = pd.DataFrame(cme_response.json())

# Keep only the columns: activityID, startTime, linkedEvents
cme_json = cme_response.json()



In [5]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
preview = json.dumps(cme_json[0], indent=4)
print(preview)

{
    "activityID": "2024-11-28T02:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2024-11-28T02:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "A slow CME that is very slow to develop (speeds up a little bit with time). Its likely source is a small eruption visible near the eastern limb in GOES SUVI 171, 193, 285, 304 imagery starting around 2024-11-28T01:00Z. The flux rope for this CME is later clearly seen in coronagraph.",
    "submissionTime": "2024-11-29T14:41Z",
    "versionId": 1,
    "link": "https://kauai.ccmc.gsfc.nasa.gov/DONKI/view/CME/35383/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2024-11-28T18:23Z",
            "latitude": 9.0,
            "longitude

In [6]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
Remove_missing_linkedEvents = [item for item in cme_json if 'linkedEvents' in item]


In [7]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents'
# and adds the elements individually to a list of dictionaries where each row is one element
nested_loop = []

# Initialize an empty list to store the expanded rows
list_of_rows = []

# Iterate over each index in the DataFrame
for index in Remove_missing_linkedEvents:
    # Check if 'lindedEvents' key exists and its value is a list and not a None
    if 'linkedEvents' in index and isinstance(index['linkedEvents'], list) and index['linkedEvents'] is not None:
       # Iterate over each dictionary in the list
       for dictionary in index['linkedEvents']:

        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        list_of_rows.append({
            'activityID': index['activityID'],
            'startTime': index['startTime'],
            'linkedEvents': dictionary
        })

# Create a new DataFrame from the expanded rows
df = pd.DataFrame(list_of_rows)


In [8]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
def extract_activityID_from_dict(row):
  function = row.get('activityID')
  return function
        # Log the error or print it for debugging
  error.log(f"Error processing row: {row}")
  print(f"Error processing row: {row}")
  display(f"Error processing row: {row}")




In [9]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
df['GST_ActivityID'] = df['linkedEvents'].apply(lambda row: extract_activityID_from_dict(row))
display(df)


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2024-12-04T10:24:00-CME-001,2024-12-04T10:24Z,{'activityID': '2024-12-04T09:52:00-FLR-001'},2024-12-04T09:52:00-FLR-001
1,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,{'activityID': '2024-12-08T08:50:00-FLR-001'},2024-12-08T08:50:00-FLR-001
2,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,{'activityID': '2024-12-10T11:58:00-IPS-001'},2024-12-10T11:58:00-IPS-001
3,2024-12-10T03:23:00-CME-001,2024-12-10T03:23Z,{'activityID': '2024-12-10T03:12:00-FLR-001'},2024-12-10T03:12:00-FLR-001
4,2024-12-10T07:00:00-CME-001,2024-12-10T07:00Z,{'activityID': '2024-12-10T06:36:00-FLR-001'},2024-12-10T06:36:00-FLR-001
5,2024-12-10T07:24:00-CME-001,2024-12-10T07:24Z,{'activityID': '2024-12-10T07:03:00-FLR-001'},2024-12-10T07:03:00-FLR-001
6,2024-12-10T11:12:00-CME-001,2024-12-10T11:12Z,{'activityID': '2024-12-10T10:55:00-FLR-001'},2024-12-10T10:55:00-FLR-001
7,2024-12-10T15:24:00-CME-001,2024-12-10T15:24Z,{'activityID': '2024-12-10T14:36:00-FLR-001'},2024-12-10T14:36:00-FLR-001
8,2024-12-10T18:12:00-CME-001,2024-12-10T18:12Z,{'activityID': '2024-12-10T17:54:00-FLR-001'},2024-12-10T17:54:00-FLR-001
9,2024-12-11T06:24:00-CME-001,2024-12-11T06:24Z,{'activityID': '2024-12-11T05:54:00-FLR-001'},2024-12-11T05:54:00-FLR-001


In [10]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
missing_GST_ActivityID = df.dropna(subset=['GST_ActivityID'])


In [11]:
# print out the datatype of each column in this DataFrame:
column_datatypes = missing_GST_ActivityID.dtypes
print(column_datatypes)

activityID        object
startTime         object
linkedEvents      object
GST_ActivityID    object
dtype: object


In [12]:
# Convert the 'GST_ActivityID' column to string format
missing_GST_ActivityID['GST_ActivityID'] = missing_GST_ActivityID['GST_ActivityID'].astype(str)

# Convert the 'cmeID' column to string format
cmeID = missing_GST_ActivityID['activityID'].astype(str)
# Convert startTime to datetime format
startTime = pd.to_datetime(missing_GST_ActivityID['startTime'])
# Rename startTime to startTime_CME and activityID to cmeID
renamed_df = missing_GST_ActivityID.rename(columns={'startTime': 'startTime_CME', 'activityID': 'cmeID'})
# Drop linkedEvents
renamed_df = renamed_df.drop(columns=['linkedEvents'])
# Verify that all steps were executed correctly
display(renamed_df)

Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
0,2024-12-04T10:24:00-CME-001,2024-12-04T10:24Z,2024-12-04T09:52:00-FLR-001
1,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,2024-12-08T08:50:00-FLR-001
2,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,2024-12-10T11:58:00-IPS-001
3,2024-12-10T03:23:00-CME-001,2024-12-10T03:23Z,2024-12-10T03:12:00-FLR-001
4,2024-12-10T07:00:00-CME-001,2024-12-10T07:00Z,2024-12-10T06:36:00-FLR-001
5,2024-12-10T07:24:00-CME-001,2024-12-10T07:24Z,2024-12-10T07:03:00-FLR-001
6,2024-12-10T11:12:00-CME-001,2024-12-10T11:12Z,2024-12-10T10:55:00-FLR-001
7,2024-12-10T15:24:00-CME-001,2024-12-10T15:24Z,2024-12-10T14:36:00-FLR-001
8,2024-12-10T18:12:00-CME-001,2024-12-10T18:12Z,2024-12-10T17:54:00-FLR-001
9,2024-12-11T06:24:00-CME-001,2024-12-11T06:24Z,2024-12-11T05:54:00-FLR-001


In [13]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.
renamed_df = renamed_df[renamed_df['GST_ActivityID'].str.contains('GST')]
display(renamed_df)


Unnamed: 0,cmeID,startTime_CME,GST_ActivityID


### GST Data

In [14]:
# Set the base URL to NASA's DONKI API:
base_url = "https://kauai.ccmc.gsfc.nasa.gov/DONKI/WS/get/GST?startDate=yyyy-MM-dd&endDate=yyyy-MM-dd"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
query_url = f"{base_url}{GST}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [15]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(query_url)

In [16]:
# Convert the response variable to json and store it as a variable named gst_json
gst_response = requests.get(query_url)
gst_json = gst_response.json()

# Check if gst_json is empty before attempting to access elements
if gst_json:

    # Preview the first result in JSON format
    # Use json.dumps with argument indent=4 to format data
    preview = json.dumps(gst_json[0], indent=4)
    print(preview)
else:
    print("gst_json is empty.")





gst_json is empty.


In [17]:
# Convert gst_json to a Pandas DataFrame
gst_df = pd.DataFrame(gst_response.json())

# Check if the DataFrame is empty before selecting columns
if not gst_df.empty:
    # Keep only the columns: activityID, startTime, linkedEvents
    gst_df = gst_df[['activityID', 'startTime', 'linkedEvents']]
else:
    print("gst_df is empty.")



gst_df is empty.


In [18]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
Remove_missing_linkedEvents = [item for item in gst_json if 'linkedEvents' in item]


In [19]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
exploded_df = pd.DataFrame(Remove_missing_linkedEvents)

# Check if 'linkedEvents' column exists before exploding
if 'linkedEvents' in exploded_df.columns:
    exploded_df = exploded_df.explode('linkedEvents')
    exploded_df = exploded_df.dropna(subset=['linkedEvents'])
    exploded_df = exploded_df.reset_index(drop=True)
else:
    print("'linkedEvents' column not found in the DataFrame.")


'linkedEvents' column not found in the DataFrame.


In [20]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:
df.loc[:, 'CME_ActivityID'] = df['linkedEvents'].apply(lambda row: extract_activityID_from_dict(row))
display(df)

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:
missing_CME_ActivityID = df.dropna(subset=['CME_ActivityID'])
print(df)


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID,CME_ActivityID
0,2024-12-04T10:24:00-CME-001,2024-12-04T10:24Z,{'activityID': '2024-12-04T09:52:00-FLR-001'},2024-12-04T09:52:00-FLR-001,2024-12-04T09:52:00-FLR-001
1,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,{'activityID': '2024-12-08T08:50:00-FLR-001'},2024-12-08T08:50:00-FLR-001,2024-12-08T08:50:00-FLR-001
2,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,{'activityID': '2024-12-10T11:58:00-IPS-001'},2024-12-10T11:58:00-IPS-001,2024-12-10T11:58:00-IPS-001
3,2024-12-10T03:23:00-CME-001,2024-12-10T03:23Z,{'activityID': '2024-12-10T03:12:00-FLR-001'},2024-12-10T03:12:00-FLR-001,2024-12-10T03:12:00-FLR-001
4,2024-12-10T07:00:00-CME-001,2024-12-10T07:00Z,{'activityID': '2024-12-10T06:36:00-FLR-001'},2024-12-10T06:36:00-FLR-001,2024-12-10T06:36:00-FLR-001
5,2024-12-10T07:24:00-CME-001,2024-12-10T07:24Z,{'activityID': '2024-12-10T07:03:00-FLR-001'},2024-12-10T07:03:00-FLR-001,2024-12-10T07:03:00-FLR-001
6,2024-12-10T11:12:00-CME-001,2024-12-10T11:12Z,{'activityID': '2024-12-10T10:55:00-FLR-001'},2024-12-10T10:55:00-FLR-001,2024-12-10T10:55:00-FLR-001
7,2024-12-10T15:24:00-CME-001,2024-12-10T15:24Z,{'activityID': '2024-12-10T14:36:00-FLR-001'},2024-12-10T14:36:00-FLR-001,2024-12-10T14:36:00-FLR-001
8,2024-12-10T18:12:00-CME-001,2024-12-10T18:12Z,{'activityID': '2024-12-10T17:54:00-FLR-001'},2024-12-10T17:54:00-FLR-001,2024-12-10T17:54:00-FLR-001
9,2024-12-11T06:24:00-CME-001,2024-12-11T06:24Z,{'activityID': '2024-12-11T05:54:00-FLR-001'},2024-12-11T05:54:00-FLR-001,2024-12-11T05:54:00-FLR-001


                     activityID          startTime  \
0   2024-12-04T10:24:00-CME-001  2024-12-04T10:24Z   
1   2024-12-08T09:24:00-CME-001  2024-12-08T09:24Z   
2   2024-12-08T09:24:00-CME-001  2024-12-08T09:24Z   
3   2024-12-10T03:23:00-CME-001  2024-12-10T03:23Z   
4   2024-12-10T07:00:00-CME-001  2024-12-10T07:00Z   
5   2024-12-10T07:24:00-CME-001  2024-12-10T07:24Z   
6   2024-12-10T11:12:00-CME-001  2024-12-10T11:12Z   
7   2024-12-10T15:24:00-CME-001  2024-12-10T15:24Z   
8   2024-12-10T18:12:00-CME-001  2024-12-10T18:12Z   
9   2024-12-11T06:24:00-CME-001  2024-12-11T06:24Z   
10  2024-12-11T10:38:00-CME-001  2024-12-11T10:38Z   
11  2024-12-13T02:48:00-CME-001  2024-12-13T02:48Z   
12  2024-12-13T04:17:00-CME-001  2024-12-13T04:17Z   
13  2024-12-14T05:36:00-CME-001  2024-12-14T05:36Z   
14  2024-12-15T01:25:00-CME-001  2024-12-15T01:25Z   
15  2024-12-15T01:25:00-CME-001  2024-12-15T01:25Z   
16  2024-12-15T01:25:00-CME-001  2024-12-15T01:25Z   
17  2024-12-15T04:23:00-CME-

In [21]:
# Convert the 'CME_ActivityID' column to string format
CME_ActivityID = missing_CME_ActivityID['CME_ActivityID'].astype(str)

# Convert the 'gstID' column to string format
gstID = missing_CME_ActivityID['activityID'].astype(str)
# Convert startTime to datetime format
startTime = pd.to_datetime(missing_CME_ActivityID['startTime'])
# Rename startTime to startTime_GST
# Convert the 'CME_ActivityID' column to string format
CME_ActivityID = missing_CME_ActivityID['CME_ActivityID'].astype(str)

# Convert the 'gstID' column to string format
gstID = missing_CME_ActivityID['activityID'].astype(str)

# Convert startTime to datetime format
startTime = pd.to_datetime(missing_CME_ActivityID['startTime'])

# Rename startTime to startTime_GST and activityID to gstID, startTime to startTime_CME and activityID to cmeID
# Use the original DataFrame (missing_CME_ActivityID) for renaming
renamed_df = missing_CME_ActivityID.rename(columns={'startTime': 'startTime_GST', 'activityID': 'gstID'})

# Renamed to startTime_GST


# Create a copy of renamed_df and rename startTime to startTime_CME and activityID to cmeID
renamed_df2 = renamed_df.rename(columns={'startTime': 'startTime_CME', 'activityID': 'cmeID'})
# Renamed to startTime_CME



# Drop linkedEvents
dropped_df = missing_CME_ActivityID.drop(columns=['linkedEvents'])

# Verify that all steps were executed correctly
print(column_datatypes)
# Drop linkedEvents
dropped_df = missing_CME_ActivityID.drop(columns=['linkedEvents'])
# Verify that all steps were executed correctly
print(column_datatypes)


activityID        object
startTime         object
linkedEvents      object
GST_ActivityID    object
dtype: object
activityID        object
startTime         object
linkedEvents      object
GST_ActivityID    object
dtype: object


In [22]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.

CME_ActivityID = df[df['CME_ActivityID'].str.contains('CME')]
display(CME_ActivityID)


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID,CME_ActivityID


### Merge both datatsets

In [23]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.

df.merge(CME_ActivityID, left_on='GST_ActivityID', right_on='CME_ActivityID', how='left')



Unnamed: 0,activityID_x,startTime_x,linkedEvents_x,GST_ActivityID_x,CME_ActivityID_x,activityID_y,startTime_y,linkedEvents_y,GST_ActivityID_y,CME_ActivityID_y
0,2024-12-04T10:24:00-CME-001,2024-12-04T10:24Z,{'activityID': '2024-12-04T09:52:00-FLR-001'},2024-12-04T09:52:00-FLR-001,2024-12-04T09:52:00-FLR-001,,,,,
1,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,{'activityID': '2024-12-08T08:50:00-FLR-001'},2024-12-08T08:50:00-FLR-001,2024-12-08T08:50:00-FLR-001,,,,,
2,2024-12-08T09:24:00-CME-001,2024-12-08T09:24Z,{'activityID': '2024-12-10T11:58:00-IPS-001'},2024-12-10T11:58:00-IPS-001,2024-12-10T11:58:00-IPS-001,,,,,
3,2024-12-10T03:23:00-CME-001,2024-12-10T03:23Z,{'activityID': '2024-12-10T03:12:00-FLR-001'},2024-12-10T03:12:00-FLR-001,2024-12-10T03:12:00-FLR-001,,,,,
4,2024-12-10T07:00:00-CME-001,2024-12-10T07:00Z,{'activityID': '2024-12-10T06:36:00-FLR-001'},2024-12-10T06:36:00-FLR-001,2024-12-10T06:36:00-FLR-001,,,,,
5,2024-12-10T07:24:00-CME-001,2024-12-10T07:24Z,{'activityID': '2024-12-10T07:03:00-FLR-001'},2024-12-10T07:03:00-FLR-001,2024-12-10T07:03:00-FLR-001,,,,,
6,2024-12-10T11:12:00-CME-001,2024-12-10T11:12Z,{'activityID': '2024-12-10T10:55:00-FLR-001'},2024-12-10T10:55:00-FLR-001,2024-12-10T10:55:00-FLR-001,,,,,
7,2024-12-10T15:24:00-CME-001,2024-12-10T15:24Z,{'activityID': '2024-12-10T14:36:00-FLR-001'},2024-12-10T14:36:00-FLR-001,2024-12-10T14:36:00-FLR-001,,,,,
8,2024-12-10T18:12:00-CME-001,2024-12-10T18:12Z,{'activityID': '2024-12-10T17:54:00-FLR-001'},2024-12-10T17:54:00-FLR-001,2024-12-10T17:54:00-FLR-001,,,,,
9,2024-12-11T06:24:00-CME-001,2024-12-11T06:24Z,{'activityID': '2024-12-11T05:54:00-FLR-001'},2024-12-11T05:54:00-FLR-001,2024-12-11T05:54:00-FLR-001,,,,,


In [24]:
# Verify that the new DataFrame has the same number of rows as cme and gst
row_count_cme = len(cmeID)
row_count_gst = len(gstID)
row_count_merge = len(df.merge(CME_ActivityID, left_on='GST_ActivityID', right_on='CME_ActivityID', how='left'))

# Print the row counts
print("Number of rows in CME DataFrame:", row_count_cme)
print("Number of rows in GST DataFrame:", row_count_gst)
print("Number of rows in merged DataFrame:", row_count_merge)

Number of rows in CME DataFrame: 30
Number of rows in GST DataFrame: 30
Number of rows in merged DataFrame: 30


### Computing the time it takes for a CME to cause a GST

In [60]:
# Compute the time diff between startTime_GST and startTime_CME by creating a
# new column called `timeDiff`. Define 'startTime'
def timeDiff(df):
  df['timeDiff'] = df['startTime_GST'] - df['startTime_CME']
  return df




In [69]:
# Use describe() to compute the mean and median time
# that it takes for a CME to cause a GST.

def mean_median_timeDiff(df):
  mean_timeDiff = df['timeDiff'].mean()
  median_timeDiff = df['timeDiff'].median()
  return mean_timeDiff, median_timeDiff
df.describe()






Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID,CME_ActivityID
count,30,30,30,30,30
unique,25,25,26,26,26
top,2024-12-15T01:25:00-CME-001,2024-12-15T01:25Z,{'activityID': '2024-12-21T16:12:00-SEP-001'},2024-12-21T16:12:00-SEP-001,2024-12-21T16:12:00-SEP-001
freq,3,3,2,2,2


### Exporting data in csv format

In [42]:
# Export data to CSV without the index
df.to_csv('CME_GST_Data.csv', index=False)
