In [1]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


### Import Required Libraries and Set Up Environment Variables

In [2]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')



### CME Data

In [3]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
query_url = f'{base_url}CME?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}'

In [4]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(query_url)

In [5]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()

In [6]:
# Preview ONLY the first element from the cme_json list you created in JSON format
# Do NOT print out the entire list
# Use json.dumps with argument indent=4 to format data
cme_json_preview = cme_json[0]
print(json.dumps(cme_json_preview, indent=4))

{
    "activityID": "2013-05-01T03:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2013-05-01T03:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        },
        {
            "displayName": "STEREO B: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "",
    "submissionTime": "2013-08-07T16:54Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/2349/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2013-05-01T07:07Z",
            "latitude": 12.0,
            "longitude": -120.0,
            "halfAngle": 36.0,
            "speed": 860.0,
            "type": "C",
            "featureCode": "null",
            "imageType": null,
            "measurementTechnique": "null",
   

In [7]:
# Convert cme_json to a Pandas DataFrame 
cme_df = pd.DataFrame(cme_json)
# Keep only the columns: activityID, startTime, linkedEvents
cme_short_df = cme_df[['activityID', 'startTime', 'linkedEvents']]
cme_short_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
1,2013-05-02T05:24:00-CME-001,2013-05-02T05:24Z,
2,2013-05-02T14:36:00-CME-001,2013-05-02T14:36Z,
3,2013-05-03T18:00:00-CME-001,2013-05-03T18:00Z,
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]


In [8]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_short_df = cme_short_df[['activityID', 'startTime', 'linkedEvents']].dropna()
cme_short_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]
7,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,[{'activityID': '2013-05-12T23:30:00-IPS-001'}]
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...
13,2013-05-13T16:18:00-CME-001,2013-05-13T16:18Z,[{'activityID': '2013-05-13T15:40:00-FLR-001'}...


In [9]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for i in cme_short_df.index:
    # Iterate over each dictionary in the list
    activity_id = cme_short_df.loc[i, 'activityID']
    start_time = cme_short_df.loc[i, 'startTime']
    linked_events = cme_short_df.loc[i, 'linkedEvents']

    if not isinstance(linked_events, list):
        linked_events = []

        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
    for event in linked_events:
        # Create a new dictionary for each linked event
        new_row = {
            'activityID': activity_id,
            'startTime': start_time,
            'linkedEvents': event 
        }
        expanded_rows.append(new_row)
# Create a new DataFrame from the expanded rows
cme_expanded_df = pd.DataFrame(expanded_rows)
cme_expanded_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'}
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'}
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'}
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'}
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'}


In [10]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
def extract_activityID_from_dict(input_dict):
        try:
                if isinstance(input_dict, dict):
                        return input_dict.get('activityID')
                
                else:
                        raise TypeError("Input is not a dictionary")
        
        # Log the error or print it for debugging
        except (ValueError, TypeError) as e:
                print(e)

extract_activityID_from_dict(cme_expanded_df.loc[0, 'linkedEvents'])



'2013-05-04T04:52:00-IPS-001'

In [11]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
cme_expanded_df['GST_ActivityID'] = cme_expanded_df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
cme_expanded_df.head()

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'},2013-05-04T04:52:00-IPS-001
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'},2013-05-07T04:37:00-IPS-001
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'},2013-05-12T23:30:00-IPS-001
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'},2013-05-13T01:53:00-FLR-001
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'},2013-05-13T04:12:00-SEP-001


In [12]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
cme_expanded_df = cme_expanded_df[['activityID', 'startTime', 'linkedEvents', 'GST_ActivityID']].dropna()

In [13]:
# print out the datatype of each column in this DataFrame:
cme_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1714 entries, 0 to 1713
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      1714 non-null   object
 1   startTime       1714 non-null   object
 2   linkedEvents    1714 non-null   object
 3   GST_ActivityID  1714 non-null   object
dtypes: object(4)
memory usage: 53.7+ KB


In [14]:
# Convert the 'GST_ActivityID' column to string format 
cme_expanded_df['GST_ActivityID'] = cme_expanded_df['GST_ActivityID'].astype("string")


# Convert startTime to datetime format  
cme_expanded_df['startTime'] = pd.to_datetime(cme_expanded_df['startTime'])

# making a copy w/ updated dtpyes
cme_cleaned_df = cme_expanded_df[['activityID', 'startTime', 'linkedEvents', 'GST_ActivityID']]

# Rename startTime to startTime_CME and activityID to cmeID
cme_cleaned_df = cme_cleaned_df.rename(columns={'startTime': 'startTime_CME', 'activityID': 'cmeID'})

# Drop linkedEvents
cme_cleaned_df = cme_cleaned_df.drop('linkedEvents', axis=1)

# Verify that all steps were executed correctly
cme_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1714 entries, 0 to 1713
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   cmeID           1714 non-null   object             
 1   startTime_CME   1714 non-null   datetime64[ns, UTC]
 2   GST_ActivityID  1714 non-null   string             
dtypes: datetime64[ns, UTC](1), object(1), string(1)
memory usage: 40.3+ KB


In [15]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
cme_cleaned_df = cme_cleaned_df[cme_cleaned_df['GST_ActivityID'].str.contains('GST') == True]
cme_cleaned_df.head()

Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
21,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001
48,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001
90,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001
148,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001
151,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001


### GST Data

In [16]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
query_url = f'{base_url}GST?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}'


In [17]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(query_url)

In [18]:
# Convert the response variable to json and store it as a variable named gst_json
gst_json = gst_response.json()

gst_json_preview = gst_json[0]
print(json.dumps(gst_json_preview, indent=4))

{
    "gstID": "2013-06-01T01:00:00-GST-001",
    "startTime": "2013-06-01T01:00Z",
    "allKpIndex": [
        {
            "observedTime": "2013-06-01T01:00Z",
            "kpIndex": 6.0,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
    "linkedEvents": [
        {
            "activityID": "2013-05-31T15:45:00-HSS-001"
        }
    ],
    "submissionTime": "2013-07-15T19:26Z",
    "versionId": 1
}


In [19]:
# Preview ONLY the first element from the gst_json list you created in JSON format
# Do NOT print out the entire list
# Use json.dumps with argument indent=4 to format data
gst_df = pd.json_normalize(gst_json)

In [20]:
gst_df.head(5
            )

Unnamed: 0,gstID,startTime,allKpIndex,link,linkedEvents,submissionTime,versionId
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,"[{'observedTime': '2013-06-01T01:00Z', 'kpInde...",https://webtools.ccmc.gsfc.nasa.gov/DONKI/view...,[{'activityID': '2013-05-31T15:45:00-HSS-001'}],2013-07-15T19:26Z,1
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,"[{'observedTime': '2013-06-07T03:00Z', 'kpInde...",https://webtools.ccmc.gsfc.nasa.gov/DONKI/view...,[{'activityID': '2013-06-02T20:24:00-CME-001'}],2013-07-15T19:41Z,1
2,2013-06-29T03:00:00-GST-001,2013-06-29T03:00Z,"[{'observedTime': '2013-06-29T03:00Z', 'kpInde...",https://webtools.ccmc.gsfc.nasa.gov/DONKI/view...,,2013-09-25T04:48Z,2
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,"[{'observedTime': '2013-10-02T06:00Z', 'kpInde...",https://webtools.ccmc.gsfc.nasa.gov/DONKI/view...,[{'activityID': '2013-09-29T22:40:00-CME-001'}...,2013-10-02T13:23Z,1
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,"[{'observedTime': '2013-12-08T03:00Z', 'kpInde...",https://webtools.ccmc.gsfc.nasa.gov/DONKI/view...,[{'activityID': '2013-12-04T23:12:00-CME-001'}...,2013-12-08T03:42Z,1


In [None]:
# Convert gst_json to a Pandas DataFrame  
gst_df = pd.DataFrame(gst_json)


# Keep only the columns: gstID, startTime, linkedEvents
gst_df = gst_df[['activityID', 'startTime', 'linkedEvents']]
gst_df.head()

KeyError: "['activityID'] not in index"

In [58]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
gst_df = gst_df[['activityID', 'startTime', 'linkedEvents']].dropna()
gst_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]
7,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,[{'activityID': '2013-05-12T23:30:00-IPS-001'}]
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...
13,2013-05-13T16:18:00-CME-001,2013-05-13T16:18Z,[{'activityID': '2013-05-13T15:40:00-FLR-001'}...


In [59]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
gst_exploded_df = gst_df.explode('linkedEvents').dropna()
gst_exploded_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'}
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'}
7,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'}
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'}
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'}


In [60]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:
gst_exploded_df['CME_ActivityID'] = gst_exploded_df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:
gst_exploded_df = gst_exploded_df[['activityID', 'startTime', 'linkedEvents', 'CME_ActivityID']].dropna()
gst_exploded_df.head()

Unnamed: 0,activityID,startTime,linkedEvents,CME_ActivityID
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'},2013-05-04T04:52:00-IPS-001
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'},2013-05-07T04:37:00-IPS-001
7,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'},2013-05-12T23:30:00-IPS-001
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'},2013-05-13T01:53:00-FLR-001
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'},2013-05-13T04:12:00-SEP-001


In [61]:
# Convert the 'CME_ActivityID' column to string format 
gst_exploded_df['CME_ActivityID'] = gst_exploded_df['CME_ActivityID'].astype("string")

# Convert the 'gstID' column to string format 
gst_exploded_df['activityID'] = gst_exploded_df['activityID'].astype("string")

# Convert startTime to datetime format  
gst_exploded_df['startTime'] = pd.to_datetime(gst_exploded_df['startTime'])

# Rename startTime to startTime_GST 

gst_cleaned_df = gst_exploded_df[['activityID', 'startTime', 'linkedEvents', 'CME_ActivityID']]

gst_cleaned_df = gst_cleaned_df.rename(columns={'startTime': 'startTime_GST'})

# Drop linkedEvents

gst_cleaned_df = gst_cleaned_df.drop('linkedEvents', axis=1)

# Verify that all steps were executed correctly
gst_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1714 entries, 0 to 5523
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   activityID      1714 non-null   string             
 1   startTime_GST   1714 non-null   datetime64[ns, UTC]
 2   CME_ActivityID  1714 non-null   string             
dtypes: datetime64[ns, UTC](1), string(2)
memory usage: 53.6 KB


In [65]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  
gst_cleaned_df = gst_cleaned_df[gst_cleaned_df['CME_ActivityID'].str.contains('CME') == True]
gst_cleaned_df.head(5
                    )

Unnamed: 0,activityID,startTime_GST,CME_ActivityID


### Merge both datatsets

In [64]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.
cme_gst_df = pd.merge(gst_cleaned_df, cme_cleaned_df, left_on='CME_ActivityID', right_on='cmeID')

cme_gst_df.head(5)

Unnamed: 0,activityID,startTime_GST,CME_ActivityID,cmeID,startTime_CME,GST_ActivityID


In [66]:
# Verify that the new DataFrame has the same number of rows as cme and gst
cme_gst_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   activityID      0 non-null      string             
 1   startTime_GST   0 non-null      datetime64[ns, UTC]
 2   CME_ActivityID  0 non-null      string             
 3   cmeID           0 non-null      object             
 4   startTime_CME   0 non-null      datetime64[ns, UTC]
 5   GST_ActivityID  0 non-null      string             
dtypes: datetime64[ns, UTC](2), object(1), string(3)
memory usage: 132.0+ bytes


### Computing the time it takes for a CME to cause a GST

In [67]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
cme_gst_df['timeDiff'] = cme_gst_df['startTime_GST'] - cme_gst_df['startTime_CME']
cme_gst_df.head()

Unnamed: 0,activityID,startTime_GST,CME_ActivityID,cmeID,startTime_CME,GST_ActivityID,timeDiff


In [68]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 
results = cme_gst_df['timeDiff'].describe()

#make it a dataframe

results = pd.DataFrame(results)
results


Unnamed: 0,timeDiff
count,0
mean,NaT
std,NaT
min,NaT
25%,NaT
50%,NaT
75%,NaT
max,NaT


In [74]:
mean = results.iloc[1]

#make a DF and convert it 
mean = pd.DataFrame(mean).reset_index()


#assign the timedelta value
mean_td = mean['mean'][0]

#grab mean values

mean_days = mean_td.days
mean_seconds = mean_td.seconds


#convert to minutes
mean_minutes = mean_seconds / 60

#convert to hours
mean_hours = mean_minutes / 60
mean_hours = int(mean_hours)

#store only remaining minutes after converting to hour
mean_minutes = mean_minutes % 60
mean_minutes = int(mean_minutes)

#store only remaining seconds after converting to minute
mean_seconds = mean_seconds % 60


#print results
print(f'The mean time difference of a GST and a CME occurring between the dates of 2013-05-01 and 2024-05-01 is {mean_days} days, {mean_hours} hours, {mean_minutes} minutes, and {mean_seconds} seconds')


#repeat above for median
median = results.iloc[5]
median = pd.DataFrame(median).reset_index()
median_td = median['50%'][0]
median_days = median_td.days
median_seconds = median_td.seconds

median_minutes = median_seconds / 60

median_hours = median_minutes / 60
median_hours = int(median_hours)

median_minutes = median_minutes % 60
median_minutes = int(median_minutes)

median_seconds = median_seconds % 60


print(f'The median time difference of a GST and a CME occurring between the dates of 2013-05-01 and 2024-05-01 is {median_days} days, {median_hours} hours, {median_minutes} minutes, and {median_seconds} seconds')

ValueError: cannot convert float NaN to integer

### Exporting data in csv format

In [60]:
# Export data to CSV without the index
cme_gst_df.to_csv('CME_GST_analysis_data_new.csv', index=False)