### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [2]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
url = 'https://api.nasa.gov/DONKI/CME?startDate=yyyy-MM-dd&endDate=yyyy-MM-dd&api_key=uwMpN9EDjV91cQ5GdxjLpMsh0tRzr3OHieQhxtfa'

In [3]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(url)
if cme_response.status_code == 200:
    print("Request successful!")
else:
    print(f"Request failed with status code {cme_response.status_code}")

Request successful!


In [4]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()

In [5]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
formatted_json = json.dumps(cme_json[0], indent=4)
print(formatted_json)

{
    "activityID": "2024-12-31T00:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2024-12-31T00:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "CME first seen to the WNW in SOHO LASCO C2 starting at 2024-12-31T00:12Z. Also visible to the WNW in SOHO LASCO C3. This CME might be very faintly visible STEREO A COR2 for two frames, but it is not definitively this CME. A possible source candidate is some minor brightening and activity from AR 3936 starting about 21:48Z visible in all GOES SUVI wavelengths, but measurement does not closely match this source.",
    "submissionTime": "2024-12-31T19:41Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/36149/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2024-12-31

In [6]:
# Convert cme_json to a Pandas DataFrame 
cme_df = pd.DataFrame(cme_json)
# Keep only the columns: activityID, startTime, linkedEvents
cme_df_filtered = cme_df[['activityID', 'startTime', 'linkedEvents']]
print(cme_df_filtered.head())

                    activityID          startTime  \
0  2024-12-31T00:12:00-CME-001  2024-12-31T00:12Z   
1  2024-12-31T13:37:00-CME-001  2024-12-31T13:37Z   
2  2024-12-31T14:12:00-CME-001  2024-12-31T14:12Z   
3  2024-12-31T16:48:00-CME-001  2024-12-31T16:48Z   
4  2025-01-01T02:24:00-CME-001  2025-01-01T02:24Z   

                                        linkedEvents  
0                                               None  
1                                               None  
2                                               None  
3                                               None  
4  [{'activityID': '2025-01-03T23:33:00-IPS-001'}...  


In [7]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_df_filtered = cme_df_filtered.dropna(subset=['linkedEvents'])
print(cme_df_filtered.head())

                     activityID          startTime  \
4   2025-01-01T02:24:00-CME-001  2025-01-01T02:24Z   
12  2025-01-02T15:12:00-CME-001  2025-01-02T15:12Z   
13  2025-01-02T15:24:00-CME-001  2025-01-02T15:24Z   
14  2025-01-02T19:12:00-CME-001  2025-01-02T19:12Z   
16  2025-01-03T12:12:00-CME-001  2025-01-03T12:12Z   

                                         linkedEvents  
4   [{'activityID': '2025-01-03T23:33:00-IPS-001'}...  
12    [{'activityID': '2025-01-05T22:34:00-IPS-001'}]  
13    [{'activityID': '2025-01-05T22:34:00-IPS-001'}]  
14  [{'activityID': '2025-01-02T17:18:00-FLR-001'}...  
16    [{'activityID': '2025-01-03T11:29:00-FLR-001'}]  


In [8]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for idx, row in cme_df_filtered.iterrows():
    # Iterate over each dictionary in the list
    for event in row['linkedEvents']:
        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
      expanded_rows.append({
            'activityID': row['activityID'],
            'startTime': row['startTime'],
            'linkedEvent': event[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
      })
# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

TypeError: unhashable type: 'dict'

In [22]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
def extract_activityID_from_dict(linked_event_dict):
 try:
    activityID = linked_event_dict.get('activityID', None)
     if activityID is None:
            raise ValueError("activityID not found in linked event")
        
        return activityID
except Exception as e:
        print(f"Error: {e}")
        print("Problematic linked event:", linked_event_dict)
        return None

        # Log the error or print it for debugging




IndentationError: unindent does not match any outer indentation level (<string>, line 10)

In [10]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
cme_df_filtered['GST_ActivityID'] = cme_df_filtered['linkedEvents'].apply(
    lambda linked_events: [extract_activityID_from_dict(event) for event in linked_events] if isinstance(linked_events, list) else None
)

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'},2013-05-04T04:52:00-IPS-001
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'},2013-05-07T04:37:00-IPS-001
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'},2013-05-12T23:30:00-IPS-001
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'},2013-05-13T01:53:00-FLR-001
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'},2013-05-13T04:12:00-SEP-001


In [11]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
cme_df_filtered = cme_df_filtered.dropna(subset=['GST_ActivityID'])
print(cme_df_filtered[['activityID', 'linkedEvents', 'GST_ActivityID']].head())

In [12]:
# print out the datatype of each column in this DataFrame:
print(cme_df_filtered.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807 entries, 0 to 1806
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      1807 non-null   object
 1   startTime       1807 non-null   object
 2   linkedEvents    1807 non-null   object
 3   GST_ActivityID  1807 non-null   object
dtypes: object(4)
memory usage: 56.6+ KB


In [13]:
# Convert the 'GST_ActivityID' column to string format 
cme_df_filtered['GST_ActivityID'] = cme_df_filtered['GST_ActivityID'].astype(str)
# Convert startTime to datetime format  
cme_df_filtered['startTime'] = pd.to_datetime(cme_df_filtered['startTime'], errors='coerce')
# Rename startTime to startTime_CME and activityID to cmeID
cme_df_filtered = cme_df_filtered.rename(columns={'startTime': 'startTime_CME', 'activityID': 'cmeID'})
# Drop linkedEvents
cme_df_filtered = cme_df_filtered.drop(columns=['linkedEvents'])
# Verify that all steps were executed correctly
print(cme_df_filtered.dtypes)  # Check the datatypes of the columns
print(cme_df_filtered.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807 entries, 0 to 1806
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   cmeID           1807 non-null   object             
 1   startTime_CME   1807 non-null   datetime64[ns, UTC]
 2   GST_ActivityID  1807 non-null   string             
dtypes: datetime64[ns, UTC](1), object(1), string(1)
memory usage: 42.5+ KB


In [14]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
cme_df_filtered = cme_df_filtered[cme_df_filtered['GST_ActivityID'].str.contains('GST', na=False)]
print(cme_df_filtered.head())

Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
21,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001
48,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001
90,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001
148,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001
151,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001


### GST Data

In [9]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
gst_url = 'https://api.nasa.gov/DONKI/GST?startDate=yyyy-MM-dd&endDate=yyyy-MM-dd&api_key=uwMpN9EDjV91cQ5GdxjLpMsh0tRzr3OHieQhxtfa' 

In [10]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(gst_url)
print(f"Status Code: {gst_response.status_code}")

Status Code: 200


In [11]:
# Convert the response variable to json and store it as a variable named gst_json
gst_json = gst_response.json()
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
if gst_json:
    print(json.dumps(gst_json[0], indent=4))
else:
    print("No data found.")

{
    "gstID": "2025-01-01T09:00:00-GST-001",
    "startTime": "2025-01-01T09:00Z",
    "allKpIndex": [
        {
            "observedTime": "2025-01-01T12:00Z",
            "kpIndex": 6.33,
            "source": "NOAA"
        },
        {
            "observedTime": "2025-01-01T15:00Z",
            "kpIndex": 6.67,
            "source": "NOAA"
        },
        {
            "observedTime": "2025-01-01T18:00Z",
            "kpIndex": 8.0,
            "source": "NOAA"
        },
        {
            "observedTime": "2025-01-01T21:00Z",
            "kpIndex": 6.67,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/36159/-1",
    "linkedEvents": [
        {
            "activityID": "2024-12-29T01:23:00-CME-001"
        },
        {
            "activityID": "2024-12-29T06:24:00-CME-001"
        },
        {
            "activityID": "2024-12-31T15:44:00-IPS-001"
        },
        {
            "activityID": "2025-01-01T05:

In [16]:
# Convert gst_json to a Pandas DataFrame  
gst_df = pd.DataFrame(gst_json)

# Keep only the columns: activityID, startTime, linkedEvents
gst_df = gst_df[['gstID', 'startTime', 'linkedEvents']]
gst_df

Unnamed: 0,gstID,startTime,linkedEvents
0,2025-01-01T09:00:00-GST-001,2025-01-01T09:00Z,[{'activityID': '2024-12-29T01:23:00-CME-001'}...


In [17]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
gst_df = gst_df.dropna(subset=['linkedEvents'])

In [18]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
gst_df = gst_df.explode('linkedEvents')
gst_df = gst_df.reset_index(drop=True).dropna()

In [20]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:
def extract_activityID_from_dict(event):
        return event.get('activityID', None)
gst_df['CME_ActivityID'] = gst_df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
    
# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:
gst_df = gst_df.dropna(subset=['CME_ActivityID'])
print(gst_df.head())

                         gstID          startTime  \
0  2025-01-01T09:00:00-GST-001  2025-01-01T09:00Z   
1  2025-01-01T09:00:00-GST-001  2025-01-01T09:00Z   
2  2025-01-01T09:00:00-GST-001  2025-01-01T09:00Z   
3  2025-01-01T09:00:00-GST-001  2025-01-01T09:00Z   

                                    linkedEvents               CME_ActivityID  
0  {'activityID': '2024-12-29T01:23:00-CME-001'}  2024-12-29T01:23:00-CME-001  
1  {'activityID': '2024-12-29T06:24:00-CME-001'}  2024-12-29T06:24:00-CME-001  
2  {'activityID': '2024-12-31T15:44:00-IPS-001'}  2024-12-31T15:44:00-IPS-001  
3  {'activityID': '2025-01-01T05:45:00-MPC-001'}  2025-01-01T05:45:00-MPC-001  


In [22]:
# Convert the 'CME_ActivityID' column to string format 

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   gstID           217 non-null    string             
 1   startTime_GST   217 non-null    datetime64[ns, UTC]
 2   CME_ActivityID  217 non-null    string             
dtypes: datetime64[ns, UTC](1), string(2)
memory usage: 5.2 KB


In [23]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


Unnamed: 0,gstID,startTime_GST,CME_ActivityID
1,2013-06-07T03:00:00-GST-001,2013-06-07 03:00:00+00:00,2013-06-02T20:24:00-CME-001
2,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-09-29T22:40:00-CME-001
5,2013-12-08T00:00:00-GST-001,2013-12-08 00:00:00+00:00,2013-12-04T23:12:00-CME-001
7,2014-02-19T03:00:00-GST-001,2014-02-19 03:00:00+00:00,2014-02-16T14:15:00-CME-001
9,2014-02-20T03:00:00-GST-001,2014-02-20 03:00:00+00:00,2014-02-18T01:25:00-CME-001


### Merge both datatsets

In [24]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


Unnamed: 0,gstID,startTime_GST,CME_ActivityID,cmeID,startTime_CME,GST_ActivityID
0,2013-06-07T03:00:00-GST-001,2013-06-07 03:00:00+00:00,2013-06-02T20:24:00-CME-001,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001
1,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-09-29T22:40:00-CME-001,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001
2,2013-12-08T00:00:00-GST-001,2013-12-08 00:00:00+00:00,2013-12-04T23:12:00-CME-001,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001
3,2014-02-19T03:00:00-GST-001,2014-02-19 03:00:00+00:00,2014-02-16T14:15:00-CME-001,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001
4,2014-02-20T03:00:00-GST-001,2014-02-20 03:00:00+00:00,2014-02-18T01:25:00-CME-001,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001


In [25]:
# Verify that the new DataFrame has the same number of rows as cme and gst


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   gstID           67 non-null     string             
 1   startTime_GST   67 non-null     datetime64[ns, UTC]
 2   CME_ActivityID  67 non-null     object             
 3   cmeID           67 non-null     object             
 4   startTime_CME   67 non-null     datetime64[ns, UTC]
 5   GST_ActivityID  67 non-null     string             
dtypes: datetime64[ns, UTC](2), object(2), string(2)
memory usage: 3.3+ KB


### Computing the time it takes for a CME to cause a GST

In [26]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


Unnamed: 0,gstID,startTime_GST,CME_ActivityID,cmeID,startTime_CME,GST_ActivityID,timeDiff
0,2013-06-07T03:00:00-GST-001,2013-06-07 03:00:00+00:00,2013-06-02T20:24:00-CME-001,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001,4 days 06:36:00
1,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-09-29T22:40:00-CME-001,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001,2 days 04:20:00
2,2013-12-08T00:00:00-GST-001,2013-12-08 00:00:00+00:00,2013-12-04T23:12:00-CME-001,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001,3 days 00:48:00
3,2014-02-19T03:00:00-GST-001,2014-02-19 03:00:00+00:00,2014-02-16T14:15:00-CME-001,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001,2 days 12:45:00
4,2014-02-20T03:00:00-GST-001,2014-02-20 03:00:00+00:00,2014-02-18T01:25:00-CME-001,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001,2 days 01:35:00


In [27]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


Unnamed: 0,timeDiff
count,67
mean,2 days 21:35:13.432835820
std,1 days 00:02:46.681279427
min,1 days 05:36:00
25%,2 days 03:12:00
50%,2 days 17:48:00
75%,3 days 12:17:00
max,6 days 03:00:00


### Exporting data in csv format

In [28]:
# Export data to CSV without the index
