In [1]:
import pandas as pd
import numpy as np
import lxml
import requests
from bs4 import BeautifulSoup
import re

In [2]:
spaceweather_page = requests.get('https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares.html')

spaceweather_root_soup = BeautifulSoup(spaceweather_page.text, 'lxml')
spaceweather_root_soup.select("body")

[<body> <div class="layout"> <div class="layout__main"> <h1>Access Denied</h1> <p> <strong>www.spaceweatherlive.com</strong> is using a security service for protection against online attacks. An action has triggered the service and blocked your request. </p> <p> Please try again in a few minutes. If the issue persist, please contact the site owner for further assistance. </p> <table> <thead> <tr> <th>Reference ID</th> <th>IP Address</th> <th>Date and Time</th> </tr> </thead> <tbody> <tr> <td data-title="Reference ID">ca0b5b9310f3a3c41a0272eb42d39d79</td> <td data-title="IP Address">69.251.154.167</td> <td data-title="Date and Time">02/11/2021 08:34 PM UTC</td> </tr> </tbody> </table> </div> <div class="layout__footer"> Protected by <a href="https://www.stackpath.com/" target="_blank">StackPath</a> </div> </div> </body>]

In [3]:
# HTTP GET request to receive data from server located at URL
nasa_page = requests.get('https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html')

# using the imported lxml HTML parser to parse the response content from NASA website URL
nasa_root_soup = BeautifulSoup(nasa_page.text, 'html.parser')
# Uncomment to print out a prettybeautifulsoup-ified HTML tree
# soup.prettify()



In [4]:
# matching with the <pre> tag which contains all of the relevant data as 
flare_data_pre = nasa_root_soup.find('pre')

# splitting long string of text from <pre> by new line -> rows
pre_string_rows = flare_data_pre.get_text().split("\n")

# filtering out rows which don't start with a year ( 1 or 2) and
# splitting into sublists split on white space
split_filtered_rows = [x.split(' ') for x in pre_string_rows if re.match(r'^[1-2]', x)]

# removing empty strings and "PHTX" from each sublist
for row in split_filtered_rows:
    row.remove('PHTX')
    while '' in row:
        row.remove('')


# converting 2d list into data frame
nasa_df = pd.DataFrame(split_filtered_rows)

# removing un-needed columns 12-23
nasa_df = nasa_df.iloc[:, 0:14]

# setting appropriate column names
nasa_df.columns = ['start_date', 'start_time', 'end_date','end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region', 'flare_class', 'cme_date', 'cme_time', 'cme_angle', 'cme_width', 'cme_speed']

nasa_df.head()

Unnamed: 0,start_date,start_time,end_date,end_time,start_frequency,end_frequency,flare_location,flare_region,flare_class,cme_date,cme_time,cme_angle,cme_width,cme_speed
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712


In [8]:
# tidying up Nasa data

# recode missing entries to NaN
nasa_df = nasa_df.applymap(lambda x: np.nan if x in ['----', '-----', 'BACK', 'Back', '--/--', '--:--'] else x)

# uncomment to output all rows to check NaN recoding


# inserting new column to indicate True/False for halo CME
nasa_df['is_halo'] = np.where(nasa_df['cme_angle'] == 'Halo', True, False)
# recoding 'Halo' cme_angle to NaN
nasa_df.cme_angle = nasa_df.cme_angle.replace('Halo', np.nan)

nasa_df['width_lower_bound'] = np.where(nasa_df['cme_width'][0] == '>', True, False)

pd.options.display.max_rows = 1000
display(nasa_df)









Unnamed: 0,start_date,start_time,end_date,end_time,start_frequency,end_frequency,flare_location,flare_region,flare_class,cme_date,cme_time,cme_angle,cme_width,cme_speed,is_halo,width_lower_bound
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74.0,79,312.0,False,False
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,,360,878.0,False,False
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,,360,464.0,False,False
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263.0,165,296.0,False,False
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133.0,155,712.0,False,False
5,1997/11/03,05:15,11/03,12:00,14000,250,S20W13,8100,C8.6,11/03,05:28,240.0,109,227.0,False,False
6,1997/11/03,10:30,11/03,11:30,14000,5000,S16W21,8100,M4.2,11/03,11:11,233.0,122,352.0,False,False
7,1997/11/04,06:00,11/05,04:30,14000,100,S14W33,8100,X2.1,11/04,06:10,,360,785.0,False,False
8,1997/11/06,12:20,11/07,08:30,14000,100,S18W63,8100,X9.4,11/06,12:10,,360,1556.0,False,False
9,1997/11/27,13:30,11/27,14:00,14000,7000,N17E63,8113,X2.6,11/27,13:56,98.0,91,441.0,False,False
