# 2. Data Wrangling

- read the data collected in the previous step
- keep only the "Falcon 9" launches and drop everything else
- fill missing `payload_mass` values with the column mean
- save the cleaned "Falcon 9" data to a `.csv` file


In [1]:
from pathlib import Path
import pandas as pd

import helpers as hlp

In [2]:
# Setup
# Display options: None is pandas' "no limit" value, so frames are shown
# with every column and with untruncated cell contents.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# Files written and read by this step, both under the helpers' data directory.
OUTPUT_FILE = hlp.DATA_DIR / Path("02_falcon9_data.csv")
INPUT_FILE = hlp.DATA_DIR / Path("01_collected_data.csv")


## Read data


In [3]:
# Load the CSV produced by the previous step; the trailing expression
# displays the frame's (rows, columns) shape.
data_df = pd.read_csv(filepath_or_buffer=INPUT_FILE)
data_df.shape

(94, 17)

In [4]:
# List the column names available in the loaded dataset.
data_df.columns

Index(['flight_number', 'date', 'booster_version', 'payload_mass', 'orbit',
       'launch_site', 'outcome', 'flights', 'grid_fins', 'reused', 'legs',
       'landing_pad', 'block', 'reused_count', 'serial', 'longitude',
       'latitude'],
      dtype='object')

## Grab only Falcon 9 data


In [5]:
# Keep only the rows whose booster_version is exactly "Falcon 9".
# Row labels from data_df are preserved (no index reset).
data_falcon9 = data_df.loc[data_df["booster_version"].eq("Falcon 9")]
data_falcon9

Unnamed: 0,flight_number,date,booster_version,payload_mass,orbit,launch_site,outcome,flights,grid_fins,reused,legs,landing_pad,block,reused_count,serial,longitude,latitude
4,6,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
5,8,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857
6,10,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857
7,11,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093
8,12,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,102,2020-09-03,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,2,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,12,B1060,-80.603956,28.608058
90,103,2020-10-06,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,3,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,13,B1058,-80.603956,28.608058
91,104,2020-10-18,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,6,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,12,B1051,-80.603956,28.608058
92,105,2020-10-24,Falcon 9,15600.0,VLEO,CCSFS SLC 40,True ASDS,3,True,True,True,5e9e3033383ecbb9e534e7cc,5.0,12,B1060,-80.577366,28.561857


## Clean data


In [6]:
# Inspect the dtype of each column before cleaning.
data_falcon9.dtypes

flight_number        int64
date                object
booster_version     object
payload_mass       float64
orbit               object
launch_site         object
outcome             object
flights              int64
grid_fins             bool
reused                bool
legs                  bool
landing_pad         object
block              float64
reused_count         int64
serial              object
longitude          float64
latitude           float64
dtype: object

In [7]:
# Count missing values per column. Nulls are expected in two columns:
# - payload_mass, which is filled with the column mean in the next cell, and
# - landing_pad, which is left as is.
data_falcon9.isna().sum()

flight_number       0
date                0
booster_version     0
payload_mass        5
orbit               0
launch_site         0
outcome             0
flights             0
grid_fins           0
reused              0
legs                0
landing_pad        26
block               0
reused_count        0
serial              0
longitude           0
latitude            0
dtype: int64

In [8]:
# Impute missing payload masses with the mean of the known values.
# Series.mean() skips NaN by default, so only recorded masses contribute.
payloadmass_mean = data_falcon9["payload_mass"].mean()
data_falcon9 = data_falcon9.assign(
    payload_mass=data_falcon9["payload_mass"].fillna(payloadmass_mean)
)

In [9]:
# Confirm the imputation left no missing payload_mass values (expect 0).
data_falcon9["payload_mass"].isna().sum()

np.int64(0)

## Save the clean data


In [10]:
# Write the cleaned Falcon 9 frame to OUTPUT_FILE, replacing any existing
# file; the header row is included and the row index is omitted.
data_falcon9.to_csv(
    OUTPUT_FILE,
    index=False,
    header=True,
    mode="w",
    encoding="utf-8",
)