In [None]:
# Data Science
import pandas as pd
import numpy as np

# Web-Scraping
#import lxml
import requests
from bs4 import BeautifulSoup

# Misc
import datetime
import io
import re

In [None]:
# Scraping data from spaceweatherlive.com using an HTTP request


# Provides the HTTP request with a User-Agent header that mimics a standard Chrome browser
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}

# Performs the GET request with the specified headers; the timeout keeps a
# stalled connection from hanging the kernel indefinitely
spaceweather_page = requests.get(
    'https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares.html',
    headers=headers,
    timeout=30,
)

# Fails fast on a 4xx/5xx response instead of silently parsing an error page
spaceweather_page.raise_for_status()

# Parses the raw HTML data using lxml
spaceweather_html_root = BeautifulSoup(spaceweather_page.text, 'lxml')

# To inspect the parsed HTML starting from the root of the DOM, uncomment:
# spaceweather_html_root

In [None]:
# Cleaning spaceweatherlive.com HTML data to create DataFrame

# Uses a CSS selector to extract the table containing the Top 50 Solar Flares data
# (select() returns a list of every matching element)
spaceweather_html_table = spaceweather_html_root.select('table.table.table-striped.table-responsive-md')

# Creates the DataFrame using the lxml parser. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases. The single-element unpacking raises if read_html does not
# return exactly one table.
[swl_df] = pd.read_html(io.StringIO(str(spaceweather_html_table)), flavor='lxml')

# Displays first 5 rows of DataFrame to check format (before renaming columns)
display(swl_df.head())

# Sets appropriate column names for DataFrame
columns = ['rank', 'x_class', 'date', 'region', 'start_time', 'max_time', 'end_time', 'movie']
swl_df.columns = columns

# Drops the last column of data, which is unused
swl_df_nomovie = swl_df.drop(['movie'], axis=1)

display(swl_df_nomovie.head())

In [None]:
# Tidying up spaceweatherlive.com's Top 50 Solar Flares data

# Rebinds the name to a fresh copy before adding columns to it
swl_df_nomovie = swl_df_nomovie.copy()

# Concatenates the date column onto each time column as '<date> <time>'
for prefix in ('start', 'max', 'end'):
    swl_df_nomovie[prefix + '_datetime'] = swl_df_nomovie['date'] + ' ' + swl_df_nomovie[prefix + '_time']

# Drops the old date and time columns
swl_df_datetime1 = swl_df_nomovie.drop(['date', 'start_time', 'max_time', 'end_time'], axis=1)

# Makes a new copy to hold the parsed datetime columns
swl_df_datetime2 = swl_df_datetime1.copy()

# Converts the strings to datetimes in one vectorized call per column.
# The explicit format still rejects malformed strings, while missing values
# become NaT instead of raising inside a per-row strptime call.
for datetime_col in ('start_datetime', 'max_datetime', 'end_datetime'):
    swl_df_datetime2[datetime_col] = pd.to_datetime(
        swl_df_datetime1[datetime_col], format='%Y/%m/%d %H:%M'
    )

swl_df_datetime2.head()


In [None]:
# Scraping data from NASA using HTTP requests

# HTTP GET request to receive the NASA page's HTML from the server at this URL;
# the timeout keeps a stalled connection from hanging the kernel indefinitely
nasa_page = requests.get('https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html', timeout=30)

# Fails fast on a 4xx/5xx response instead of silently parsing an error page
nasa_page.raise_for_status()

# Parses the response text with BeautifulSoup's lxml HTML parser
nasa_html_root = BeautifulSoup(nasa_page.text, 'lxml')

# To inspect the parsed HTML starting from the root of the DOM, uncomment:
# nasa_html_root.prettify()



In [None]:
# Cleaning NASA page's HTML data and creating a DataFrame

# Matches the first <pre> tag; its text is split into table rows below
nasa_html_pre = nasa_html_root.find('pre')
if nasa_html_pre is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line
    raise ValueError('No <pre> element found in the NASA page')

# Splits the long string of text from <pre> on newlines -> one string per row
nasa_pre_string_rows = nasa_html_pre.get_text().split("\n")

# Keeps only rows that start with '1' or '2' (the first digit of a year) and
# tokenizes each on whitespace. str.split() with no argument never yields
# empty strings, so no separate empty-string cleanup pass is needed.
nasa_filtered_rows = [line.split() for line in nasa_pre_string_rows if re.match(r'^[1-2]', line)]

# Removes the first "PHTX" token from each row; the membership check keeps a
# row without that token from raising ValueError
for row in nasa_filtered_rows:
    if 'PHTX' in row:
        row.remove('PHTX')


# Converts the 2d list into a DataFrame (one row per inner list)
nasa_df = pd.DataFrame(nasa_filtered_rows)


In [146]:
# tidying up NASA DataFrame

# keeps only the first 14 columns (positions 0-13); any columns beyond that are dropped
nasa_df = nasa_df.iloc[:, 0:14]

# setting appropriate column names
nasa_df.columns = ['start_date', 'start_time', 'end_date','end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region', 'flare_class', 'cme_date', 'cme_time', 'cme_angle', 'cme_width', 'cme_speed']

# recode placeholder strings for missing entries to NaN; replace() matches whole
# cell values and avoids the deprecated element-wise DataFrame.applymap
missing_markers = ['----', '-----', 'BACK', 'Back', '--/--', '--:--']
nasa_df = nasa_df.replace(missing_markers, np.nan)

# outputs up to 1000 rows to check the NaN recoding; option_context restores the
# previous display.max_rows setting afterwards instead of leaving it changed
with pd.option_context('display.max_rows', 1000):
    display(nasa_df)

Unnamed: 0,start_date,start_time,end_date,end_time,start_frequency,end_frequency,flare_location,flare_region,flare_class,cme_date,cme_time,cme_angle,cme_width,cme_speed
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712
5,1997/11/03,05:15,11/03,12:00,14000,250,S20W13,8100,C8.6,11/03,05:28,240,109,227
6,1997/11/03,10:30,11/03,11:30,14000,5000,S16W21,8100,M4.2,11/03,11:11,233,122,352
7,1997/11/04,06:00,11/05,04:30,14000,100,S14W33,8100,X2.1,11/04,06:10,Halo,360,785
8,1997/11/06,12:20,11/07,08:30,14000,100,S18W63,8100,X9.4,11/06,12:10,Halo,360,1556
9,1997/11/27,13:30,11/27,14:00,14000,7000,N17E63,8113,X2.6,11/27,13:56,98,91,441


In [147]:
# Adds two flag columns, initialised to False, at the end of the frame:
# 'is_halo' (True/False for halo CME) and 'width_lower_bound'.
# Plain column assignment is used instead of DataFrame.insert so that
# re-running this cell does not raise "cannot insert ..., already exists".
nasa_df['is_halo'] = False
nasa_df['width_lower_bound'] = False


In [148]:
# Fills each flag column with True where its source column contains the marker
# text; missing source values count as False (na=False).
for flag_column, source_column, marker in (
    ('is_halo', 'cme_angle', 'Halo'),
    ('width_lower_bound', 'cme_width', '>'),
):
    nasa_df.loc[:, flag_column] = nasa_df[source_column].str.contains(marker, na=False)

In [None]:
# # recoding 'Halo' cme_angle to NaN
# nasa_df.cme_angle = nasa_df.cme_angle.replace('Halo', np.nan)
# nasa_df.loc[nasa_df['cme_angle'] == 'Halo', 'cme_angle'] = np.nan

In [149]:
# Previews the first five rows of the NASA frame as the cell's output
nasa_df.head(n=5)

Unnamed: 0,start_date,start_time,end_date,end_time,start_frequency,end_frequency,flare_location,flare_region,flare_class,cme_date,cme_time,cme_angle,cme_width,cme_speed,is_halo,width_lower_bound
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312,False,False
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878,True,False
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464,True,False
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296,False,False
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712,False,False
