# Data Collection via SpaceX API


In [1]:
# requests allows HTTP requests  to get data from API
# pandas for data manipulation and analysis
# numPy support for arrays and matrices, along with mathematical functions
# datetime allows dates representation

import requests
import pandas as pd
import numpy as np
import datetime

### Helper function definitions to extract data from API

Target the <code> rocket </code> col and get the <code> booster </code> name

In [2]:
# get data from url, use rocket column to call API, extract data and append it to the list
def getBoosterName(data):
    for booster in data['rocket']:
        if booster:
            response = requests.get('https://api.spacexdata.com/v4/rockets/' + str(booster)).json()
            BoosterName.append(response['name'])

Target the <code> site name </code>  and <code> Lat </code>, <code>Long </code>

In [3]:
# get data from url, use launchpad column to call API, extract data and append it to the list
def getLaunchPad(data):
    for pad in data['launchpad']:
        if pad:
            response = requests.get('https://api.spacexdata.com/v4/launchpads/' + str(pad)).json()
            Longitude.append(response['longitude'])
            Latitude.append(response['latitude'])
            LaunchPad.append(response['name'])

Target the <code>mass payload</code>  and  the final <code>orbit</code>

In [4]:
# get data from url, use payloads column to call API, extract data and append it to the list
def getPayloadData(data):
    for payload in data['payloads']:
        if payload:
            response = requests.get('https://api.spacexdata.com/v4/payloads/' + payload).json()
            PayloadMass.append(response['mass_kg'])
            FinalOrbit.append(response['orbit'])

Extract from <code>cores</code>:
- landing outcome
- landing type
- number of flights with specific core
- gridfins used or not
- reused core
- number of time specific core is reused
- used legs
- landing pad used
- block number of the core
- core serial

In [5]:
# get data from url, use cores column to call API, extract data and append it to the list
def getCoresData(data):
    for core in data['cores']:
        if core['core'] is not None:
            response = requests.get('https://api.spacexdata.com/v4/cores/' + core['core']).json()
            Block.append(response['block'])
            ReusedCount.append(response['reuse_count'])
            Serial.append(response['serial'])
        else:
            Block.append(None)
            ReusedCount.append(None)
            Serial.append(None)
        Outcome.append(str(core['landing_success']) + ' ' + str(core['landing_type']))
        Flights.append(core['flight'])
        GridFins.append(core['gridfins'])
        Reused.append(core['reused'])
        Legs.append(core['legs'])
        LandPad.append(core['landpad'])

In [6]:
# get data from url, use cores column to call API, extract data and append it to the list
def getFailuresData(data):
    for core in data['cores']:
        if core['core'] is not None:
            response = requests.get('https://api.spacexdata.com/v4/cores/' + core['core']).json()
            Failures.append(response['last_update'])
        else:
            Failures.append(None)

GET raw data from API

In [7]:
spacex_url = 'https://api.spacexdata.com/v4/launches/past'

In [8]:
res = requests.get(spacex_url)

## Request and parse SpaceX GET req

In [9]:
allData = pd.json_normalize(res.json())
# print first 5 rows
allData.head()

Unnamed: 0,static_fire_date_utc,static_fire_date_unix,net,window,rocket,success,failures,details,crew,ships,...,links.reddit.media,links.reddit.recovery,links.flickr.small,links.flickr.original,links.presskit,links.webcast,links.youtube_id,links.article,links.wikipedia,fairings
0,2006-03-17T00:00:00.000Z,1142554000.0,False,0.0,5e9d0d95eda69955f709d1eb,False,"[{'time': 33, 'altitude': None, 'reason': 'mer...",Engine failure at 33 seconds and loss of vehicle,[],[],...,,,[],[],,https://www.youtube.com/watch?v=0a_00nJ_Y88,0a_00nJ_Y88,https://www.space.com/2196-spacex-inaugural-fa...,https://en.wikipedia.org/wiki/DemoSat,
1,,,False,0.0,5e9d0d95eda69955f709d1eb,False,"[{'time': 301, 'altitude': 289, 'reason': 'har...",Successful first stage burn and transition to ...,[],[],...,,,[],[],,https://www.youtube.com/watch?v=Lk4zQ2wP-Nc,Lk4zQ2wP-Nc,https://www.space.com/3590-spacex-falcon-1-roc...,https://en.wikipedia.org/wiki/DemoSat,
2,,,False,0.0,5e9d0d95eda69955f709d1eb,False,"[{'time': 140, 'altitude': 35, 'reason': 'resi...",Residual stage 1 thrust led to collision betwe...,[],[],...,,,[],[],,https://www.youtube.com/watch?v=v0w9p3U8860,v0w9p3U8860,http://www.spacex.com/news/2013/02/11/falcon-1...,https://en.wikipedia.org/wiki/Trailblazer_(sat...,
3,2008-09-20T00:00:00.000Z,1221869000.0,False,0.0,5e9d0d95eda69955f709d1eb,True,[],Ratsat was carried to orbit on the first succe...,[],[],...,,,[],[],,https://www.youtube.com/watch?v=dLQ2tZEH6G0,dLQ2tZEH6G0,https://en.wikipedia.org/wiki/Ratsat,https://en.wikipedia.org/wiki/Ratsat,
4,,,False,0.0,5e9d0d95eda69955f709d1eb,True,[],,[],[],...,,,[],[],http://www.spacex.com/press/2012/12/19/spacexs...,https://www.youtube.com/watch?v=yTaIDooc8Og,yTaIDooc8Og,http://www.spacex.com/news/2013/02/12/falcon-1...,https://en.wikipedia.org/wiki/RazakSAT,


Turning IDs into something meaningful for each launch
- use each specific ID to match each launch, so extract utils IDs and clean the data

In [10]:
# obtain data subset to work only with meaningful data from selected IDs to keep only selected features
cleanData = allData[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# remove rows with multiple cores => falcon rockets with extra rocket boosters and rows have multiple payloads in a single rocket.
cleanData = cleanData[cleanData['cores'].map(len) == 1]
cleanData = cleanData[cleanData['payloads'].map(len) == 1]

# payloads and cores are lists size 1 => extract single value in the list and replace
cleanData['cores'] = cleanData['cores'].map(lambda x : x[0])
cleanData['payloads'] = cleanData['payloads'].map(lambda x : x[0])

# utc date to datetime dataType and extract date leaving time
cleanData['date'] = pd.to_datetime(cleanData['date_utc']).dt.date

# restrict launch dates with date
cleanData = cleanData[cleanData['date'] <= datetime.date(2020, 11, 13)]

In [11]:
#Global variables
LaunchPad = []
Longitude = []
Latitude = []
Serial = []
BoosterName = []
Flights = []
Reused = []
ReusedCount = []
Block = []
Legs = []
GridFins = []
PayloadMass = []
FinalOrbit = []
LandPad = []
Outcome = []
Failures = []

In [12]:
getBoosterName(cleanData)
getLaunchPad(cleanData)
getCoresData(cleanData)
getPayloadData(cleanData)
getFailuresData(cleanData)

Combine columns into a dictionary to build the cleaned dataset

In [13]:
launch_dict = {
    'FlightNumber':list(cleanData['flight_number']),
    'Date': list(cleanData['date']),
    'Outcome':Outcome,
    'Serial':Serial,
    'BoosterName':BoosterName,
    'Flights':Flights,
    'Reused':Reused,
    'ReusedCount':ReusedCount,
    'Block':Block,
    'Legs':Legs,
    'GridFins':GridFins,
    'PayloadMass':PayloadMass,
    'FinalOrbit':FinalOrbit,
    'LaunchPad': LaunchPad,
    'LandPad':LandPad,
    'Latitude':Latitude,
    'Longitude':Longitude,
    'Failures':Failures
}

Create pd dataframe from the dictionary

In [14]:
launch_data = pd.DataFrame(launch_dict)

# show first 5 rows
launch_data.head()

Unnamed: 0,FlightNumber,Date,Outcome,Serial,BoosterName,Flights,Reused,ReusedCount,Block,Legs,GridFins,PayloadMass,FinalOrbit,LaunchPad,LandPad,Latitude,Longitude,Failures
0,1,2006-03-24,None None,Merlin1A,Falcon 1,1,False,0,,False,False,20.0,LEO,Kwajalein Atoll,,9.047721,167.743129,Engine failure at T+33 seconds resulted in los...
1,2,2007-03-21,None None,Merlin2A,Falcon 1,1,False,0,,False,False,,LEO,Kwajalein Atoll,,9.047721,167.743129,Successful first-stage burn and transition to ...
2,4,2008-09-28,None None,Merlin2C,Falcon 1,1,False,0,,False,False,165.0,LEO,Kwajalein Atoll,,9.047721,167.743129,"Initially scheduled for 23–25 Sep, carried dum..."
3,5,2009-07-13,None None,Merlin3C,Falcon 1,1,False,0,,False,False,200.0,LEO,Kwajalein Atoll,,9.047721,167.743129,
4,6,2010-06-04,None None,B0003,Falcon 9,1,False,0,1.0,False,False,,LEO,CCSFS SLC 40,,28.561857,-80.577366,Stage Expended


## Display only Falcon 9 entries

In [15]:
falcon9_data = launch_data[launch_data['BoosterName'] != 'Falcon 1']

# reset FlightNumber column
falcon9_data.loc[:, 'FlightNumber'] = list(range(1, falcon9_data.shape[0] + 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falcon9_data.loc[:, 'FlightNumber'] = list(range(1, falcon9_data.shape[0] + 1))


# Data Wrangling

In [16]:
# check rows with missing values
falcon9_data.isnull().sum()

FlightNumber     0
Date             0
Outcome          0
Serial           0
BoosterName      0
Flights          0
Reused           0
ReusedCount      0
Block            0
Legs             0
GridFins         0
PayloadMass      5
FinalOrbit       0
LaunchPad        0
LandPad         26
Latitude         0
Longitude        0
Failures         7
dtype: int64

In order to deal with missing values, the best practice is to calculate the mean for the <code>payload mass</code>
Then use the <code>mean</code> to replace the missing values whit it

In [17]:
# calculate the mean
payloadMean = falcon9_data['PayloadMass'].mean()

# replace missing values with the mean
falcon9_data['PayloadMass'].replace(np.nan, payloadMean, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  falcon9_data['PayloadMass'].replace(np.nan, payloadMean, inplace=True)


Export all data to CSV

In [18]:
falcon9_data.to_csv('dataset1.csv', index=False)