# 4. Basic Data Analysis

- adds an extra column to our dataset to classify successful and unsuccessful launches
- performs very basic numerical analysis on some of the columns


In [1]:
import pandas as pd
from pathlib import Path

import helpers as hlp

## Setup


In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

INPUT_FILE = hlp.DATA_DIR / Path("02_falcon9_data.csv")  # data from step 2
OUTPUT_FILE = hlp.DATA_DIR / Path("04_eda_data.csv")


## Data Analysis


In [3]:
df = pd.read_csv(INPUT_FILE, encoding="utf-8", header=0, index_col=False)
df.astype(
    {
        "date": "datetime64[ns]",
        "booster_version": "object",
        "payload_mass": "float64",
        "orbit": "object",
        "launch_site": "object",
        "outcome": "object",
        "flights": "int64",
        "grid_fins": bool,
        "reused": bool,
        "legs": bool,
        "landing_pad": "object",
        "block": "float64",
        "reused_count": "int64",
        "serial": "object",
        "longitude": "object",
        "latitude": "object",
    }
)
df.head(5)

Unnamed: 0,flight_number,date,booster_version,payload_mass,orbit,launch_site,outcome,flights,grid_fins,reused,legs,landing_pad,block,reused_count,serial,longitude,latitude
0,6,2010-06-04,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
1,8,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857
2,10,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857
3,11,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093
4,12,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857


In [4]:
df.dtypes

flight_number        int64
date                object
booster_version     object
payload_mass       float64
orbit               object
launch_site         object
outcome             object
flights              int64
grid_fins             bool
reused                bool
legs                  bool
landing_pad         object
block              float64
reused_count         int64
serial              object
longitude          float64
latitude           float64
dtype: object

In [5]:
df.shape

(90, 17)

### create a new column as `landing_class`

distinct successful from unsuccessful launches:

- landing_class = 0 if bad_outcome
- landing_class = 1 otherwise


In [6]:
# Get a set of the possible bad outcomes
bad_outcomes = set(
    outcome
    for outcome in df["outcome"].unique()
    if "none" in outcome.lower() or "false" in outcome.lower()
)
bad_outcomes

{'False ASDS', 'False Ocean', 'False RTLS', 'None ASDS', 'None None'}

In [7]:
landing_class = [0 if landing in bad_outcomes else 1 for landing in df["outcome"]]
len(landing_class)

90

In [8]:
df["class"] = landing_class

In [9]:
df.shape

(90, 18)

## Simple Data Analysis


### % of missing values


In [10]:
df.isnull().sum() / len(df) * 100

flight_number       0.000000
date                0.000000
booster_version     0.000000
payload_mass        0.000000
orbit               0.000000
launch_site         0.000000
outcome             0.000000
flights             0.000000
grid_fins           0.000000
reused              0.000000
legs                0.000000
landing_pad        28.888889
block               0.000000
reused_count        0.000000
serial              0.000000
longitude           0.000000
latitude            0.000000
class               0.000000
dtype: float64

### # of launches from each site


In [11]:
df["launch_site"].value_counts()

launch_site
CCSFS SLC 40    55
KSC LC 39A      22
VAFB SLC 4E     13
Name: count, dtype: int64

### mission outcome


In [12]:
# itemized launch outcomes
df["outcome"].value_counts()

outcome
True ASDS      41
None None      19
True RTLS      14
False ASDS      6
True Ocean      5
False Ocean     2
None ASDS       2
False RTLS      1
Name: count, dtype: int64

### # of launches to each orbit


In [13]:
# number of launches to each orbit
df["orbit"].value_counts()

orbit
GTO      27
ISS      21
VLEO     14
PO        9
LEO       7
SSO       5
MEO       3
HEO       1
ES-L1     1
SO        1
GEO       1
Name: count, dtype: int64

### obtain launch success rate


In [14]:
print(f"Success rate: {(sum(landing_class) / len(landing_class)) * 100:.2f}%")

Success rate: 66.67%


In [15]:
# values should be the same, ~66.67%
print(f"Mean of class column: {df['class'].mean() * 100:.2f}%")

Mean of class column: 66.67%


## Save dataframe


In [16]:
df.to_csv(OUTPUT_FILE, encoding="utf-8", mode="w", header=True, index=False)