# Web Scraping Falcon 9 and Falcon Heavy Launch Records from Wikipedia

In [1]:
!pip3 install beautifulsoup4



In [2]:
# requests sends the HTTP request that downloads the web page
# pandas is used for data manipulation and analysis
# BeautifulSoup (bs4) parses the HTML so the tables can be scraped
# unicodedata normalizes Unicode text taken from the table cells

import sys
import requests
import pandas as pd
import bs4 as beautifulsoup
import re
import unicodedata

### Helper function definitions to extract data from the HTML table cells

In [3]:
# Pull the launch date and the launch time out of an HTML table data cell.
def date_time(table_cells):
    """Return the first two text fragments of a table cell, whitespace-stripped.

    ``table_cells`` must expose a ``strings`` iterable of text fragments.
    Every fragment is stripped, then the first two are returned as a list
    (the list is shorter when the cell holds fewer than two fragments).
    """
    fragments = []
    for fragment in list(table_cells.strings):
        fragments.append(fragment.strip())
    return fragments[:2]

In [4]:
# Build the booster version text from an HTML table data cell.
def booster_version(table_cells):
    """Concatenate every other text fragment of a cell, minus the last one kept.

    The fragments come from ``table_cells.strings``. Those at even positions
    (0, 2, 4, ...) are selected, the final selected fragment is dropped, and
    the rest are joined with no separator. The result is '' when fewer than
    two fragments are selected.
    """
    even_fragments = list(table_cells.strings)[::2]
    return ''.join(even_fragments[:-1])

In [5]:
# Read the landing status text from an HTML table data cell.
def landing_stats(table_cells):
    """Return the first text fragment found in ``table_cells.strings``.

    The fragment is returned unmodified (no whitespace stripping). An
    IndexError is raised when the cell yields no text fragments at all.
    """
    fragments = list(table_cells.strings)
    return fragments[0]

In [6]:
# Extract the payload mass in kilograms from an HTML table data cell.
def get_mass(table_cells):
    """Return the leading '<number> kg' portion of a cell's text.

    The cell text is NFKD-normalized (which turns non-breaking spaces into
    plain spaces) and stripped, then cut right after the first 'kg'.

    Parameters
    ----------
    table_cells : object exposing a ``text`` attribute
        The table data cell holding the payload mass.

    Returns
    -------
    str or int
        The text up to and including the first 'kg' (e.g. '4,700 kg'), or the
        integer 0 when the cell is empty or contains no 'kg' at all.
    """
    mass = unicodedata.normalize('NFKD', table_cells.text).strip()
    kg_position = mass.find('kg')
    if kg_position == -1:
        # str.find returns -1 when 'kg' is missing; slicing with -1 + 2 would
        # return just the first character of the text (e.g. 'C' for
        # 'Classified'), so report "no mass" with the same 0 used for empty cells.
        return 0
    return mass[:kg_position + 2]

In [7]:
# Clean a table header cell and return its column name.
def extract_column(_row):
    """Return the cleaned column name held by a header cell, or None.

    The first ``br``, ``a`` and ``sup`` children of ``_row`` (when present)
    are removed from it in place, so the element passed in is modified. The
    remaining contents are joined with single spaces and stripped.

    Returns None when the resulting text consists only of digits; otherwise
    returns the stripped text (which may be an empty string).
    """
    for tag_name in ('br', 'a', 'sup'):
        child = getattr(_row, tag_name)
        if child:
            child.extract()

    label = ' '.join(_row.contents).strip()
    if label.isdigit():
        return None
    return label

Static URL of the Wikipedia page revision to scrape (public data).

In [8]:
# Address of the page to scrape. The `oldid` query parameter names one specific
# revision of the article -- presumably so the scraped tables stay the same
# between runs (confirm against the MediaWiki URL documentation).
url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

## Request the Falcon 9 launch page from the URL

In [9]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting later cells parse an error page.
data = requests.get(url, timeout=30)
data.raise_for_status()
data.status_code

200

Use <code>BeautifulSoup</code> to create a parsed object from the HTML response.

In [10]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parsed tree could
# differ from one machine to another. "html.parser" ships with Python.
soup = beautifulsoup.BeautifulSoup(data.text, "html.parser")
soup.title

<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>

## Extract data: column / variable names from HTML table header

1. Find all tables on the wiki page.
2. Refer back to the BeautifulSoup documentation if you need a refresher on its API.

In [11]:
# Find every 'table' element in the parsed page and keep the result in
# html_tables so a single table can be picked by index afterwards.
html_tables = soup.find_all('table')

The third table (index 2) is the target table; it contains the launch records.

In [12]:
# Index 2 selects the third table found on the page.
launch_table = html_tables[2]

# Uncomment to inspect the selected table: print(launch_table)

Iterate through the <code>th</code> elements and call <code>extract_column()</code> on each one to retrieve the column names.

In [13]:
# Run extract_column() over every 'th' element of launch_table, in order, and
# keep only the names that come back non-empty (None and '' are discarded).
extracted_names = (extract_column(header_cell) for header_cell in launch_table.find_all('th'))
column_names = [column_name for column_name in extracted_names if column_name]

print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


## Parse HTML Tables and create a dataFrame

Create a dictionary whose keys are the extracted column names, fill it row by row from the launch tables, and finally convert the dictionary into a DataFrame.

In [14]:

# Create the dictionary from the scraped column names so the DataFrame columns
# follow the order of the page header.
launch_dict = dict.fromkeys(column_names)

# The combined date/time header is replaced by the separate 'Date' and 'Time'
# columns added below.
del launch_dict['Date and time ( )']

# Give every column its own empty list. Re-assigning a key that already exists
# keeps its position; the four new keys are appended at the end.
for column in ('Flight No.', 'Launch site', 'Payload', 'Payload mass', 'Orbit',
               'Customer', 'Launch outcome', 'Version Booster', 'Booster landing',
               'Date', 'Time'):
    launch_dict[column] = []

# Number of launch rows copied into launch_dict.
extracted_row = 0

# Walk every table carrying the launch-table CSS classes.
for table_number, table in enumerate(soup.find_all('table', "wikitable plainrowheaders collapsible")):
    # Get each table row
    for rows in table.find_all("tr"):
        # A row is a launch row only when its first table heading is a plain
        # number (the flight number). Reset the flag for every row: otherwise a
        # heading without a single string (th.string is None) would silently
        # reuse the previous row's flag and flight number, or hit an undefined
        # name on the very first row.
        flag = False
        if rows.th and rows.th.string:
            flight_number = rows.th.string.strip()
            flag = flight_number.isdigit()

        # Get table elements
        row = rows.find_all('td')

        # If it is a launch row, save its cells in the dictionary
        if flag:
            extracted_row += 1

            # Flight Number value
            launch_dict['Flight No.'].append(flight_number)

            # Date value (the trailing comma left over from the cell text is removed)
            date_time_list = date_time(row[0])
            date = date_time_list[0].strip(',')
            launch_dict['Date'].append(date)

            # Time value
            time = date_time_list[1]
            launch_dict['Time'].append(time)

            # Booster version; fall back to the link text when the helper
            # returns an empty string
            bv = booster_version(row[1])
            if not bv:
                bv = row[1].a.string
            launch_dict['Version Booster'].append(bv)

            # Launch Site
            launch_site = row[2].a.string
            launch_dict['Launch site'].append(launch_site)

            # Payload
            payload = row[3].a.string
            launch_dict['Payload'].append(payload)

            # Payload Mass
            payload_mass = get_mass(row[4])
            launch_dict['Payload mass'].append(payload_mass)

            # Orbit
            orbit = row[5].a.string
            launch_dict['Orbit'].append(orbit)

            # Customer: a cell without a link has row[6].a == None, so reading
            # .string raises AttributeError; such rows are labelled 'Various'.
            # Only that error is caught so unrelated failures stay visible.
            try:
                customer = row[6].a.string
            except AttributeError:
                customer = 'Various'
            launch_dict['Customer'].append(customer)

            # Launch outcome (first text fragment of the cell, kept unstripped)
            launch_outcome = list(row[7].strings)[0]
            launch_dict['Launch outcome'].append(launch_outcome)

            # Booster landing
            booster_landing = landing_stats(row[8])
            launch_dict['Booster landing'].append(booster_landing)

Data on the web is often inconsistent, containing unexpected tokens, annotations and other kinds of noise.
Clean and parse all of the data before using it.

In [15]:
# Turn the dictionary of column lists into a DataFrame (one column per key)
# and preview its first rows.
df = pd.DataFrame(launch_dict)
df.head()

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success\n,F9 v1.0B0003.1,Failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,Success,F9 v1.0B0004.1,Failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.0B0005.1,No attempt\n,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success\n,F9 v1.0B0006.1,No attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success\n,F9 v1.0B0007.1,No attempt\n,1 March 2013,15:10
