## Machine Learning Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the launch-features CSV. The path is relative, so it is resolved against
# the notebook's current working directory.
df = pd.read_csv('Launch_Data_Features.csv')
# Preview the first five rows as a sanity check on the load.
df.head()

Unnamed: 0,FlightNumber,Date,Booster,PayloadMass,Outcome,Flights,GridFins,Reused,Legs,Block,...,Serial_B1060,Serial_B1061,Serial_B1062,Serial_B1063,Serial_B1067,Serial_B1069,Serial_B1071,Serial_B1072,Serial_B1073,Serial_B1077
0,1,2010-06-04,Falcon 9,8191.07911,None None,1,False,False,False,1.0,...,0,0,0,0,0,0,0,0,0,0
1,2,2012-05-22,Falcon 9,525.0,None None,1,False,False,False,1.0,...,0,0,0,0,0,0,0,0,0,0
2,3,2013-03-01,Falcon 9,677.0,None None,1,False,False,False,1.0,...,0,0,0,0,0,0,0,0,0,0
3,4,2013-09-29,Falcon 9,500.0,False Ocean,1,False,False,False,1.0,...,0,0,0,0,0,0,0,0,0,0
4,5,2013-12-03,Falcon 9,3170.0,None None,1,False,False,False,1.0,...,0,0,0,0,0,0,0,0,0,0


First, a review of the dataset we are using.

In [3]:
# Column names, non-null counts and dtypes: confirms nothing is missing and
# shows which columns are non-numeric and must be excluded from the features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 97 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   FlightNumber                         168 non-null    int64  
 1   Date                                 168 non-null    object 
 2   Booster                              168 non-null    object 
 3   PayloadMass                          168 non-null    float64
 4   Outcome                              168 non-null    object 
 5   Flights                              168 non-null    int64  
 6   GridFins                             168 non-null    bool   
 7   Reused                               168 non-null    bool   
 8   Legs                                 168 non-null    bool   
 9   Block                                168 non-null    float64
 10  ReusedCount                          168 non-null    int64  
 11  Longitude                       

In [4]:
# Summary statistics of the numeric columns (the mean of a 0/1 dummy column is
# the fraction of rows in which that category occurs).
df.describe()

Unnamed: 0,FlightNumber,PayloadMass,Flights,Block,ReusedCount,Longitude,Latitude,MissionSuccess,Orbit_ES-L1,Orbit_GEO,...,Serial_B1060,Serial_B1061,Serial_B1062,Serial_B1063,Serial_B1067,Serial_B1069,Serial_B1071,Serial_B1072,Serial_B1073,Serial_B1077
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,...,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,84.5,8191.07911,3.732143,4.196429,5.5,-86.780776,29.514774,0.815476,0.005952,0.011905,...,0.077381,0.059524,0.053571,0.041667,0.035714,0.011905,0.02381,0.005952,0.017857,0.005952
std,48.641546,5144.814299,3.241707,1.385377,4.681471,14.519168,2.196342,0.38907,0.077152,0.108782,...,0.267994,0.23731,0.225843,0.200424,0.186132,0.108782,0.152911,0.077152,0.132828,0.077152
min,1.0,330.0,1.0,1.0,0.0,-120.610829,28.561857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.75,3457.0,1.0,4.0,1.0,-80.603956,28.561857,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,84.5,8191.07911,2.0,5.0,5.0,-80.577366,28.561857,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,126.25,13260.0,5.25,5.0,9.0,-80.577366,28.608058,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,168.0,15600.0,13.0,5.0,13.0,-80.577366,34.632093,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# List every column name so the feature/target selection below can be checked.
df.columns


Index(['FlightNumber', 'Date', 'Booster', 'PayloadMass', 'Outcome', 'Flights',
       'GridFins', 'Reused', 'Legs', 'Block', 'ReusedCount', 'Longitude',
       'Latitude', 'MissionSuccess', 'Orbit_ES-L1', 'Orbit_GEO', 'Orbit_GTO',
       'Orbit_HEO', 'Orbit_ISS', 'Orbit_LEO', 'Orbit_MEO', 'Orbit_PO',
       'Orbit_SO', 'Orbit_SSO', 'Orbit_TLI', 'Orbit_VLEO', 'Site_CCSFS SLC 40',
       'Site_KSC LC 39A', 'Site_VAFB SLC 4E',
       'LandingPad_5e9e3032383ecb267a34e7c7',
       'LandingPad_5e9e3032383ecb554034e7c9',
       'LandingPad_5e9e3032383ecb6bb234e7ca',
       'LandingPad_5e9e3032383ecb761634e7cb',
       'LandingPad_5e9e3033383ecb075134e7cd',
       'LandingPad_5e9e3033383ecbb9e534e7cc', 'Serial_B0003', 'Serial_B0005',
       'Serial_B0007', 'Serial_B1003', 'Serial_B1004', 'Serial_B1005',
       'Serial_B1006', 'Serial_B1007', 'Serial_B1008', 'Serial_B1010',
       'Serial_B1011', 'Serial_B1012', 'Serial_B1013', 'Serial_B1015',
       'Serial_B1016', 'Serial_B1017', 'Serial_B101

Our prediction model will be used to predict mission success from the features/variables in the dataset. The output is Y, which is taken from the <code>MissionSuccess</code> column. The features/inputs that we propose Y depends on are <code>FlightNumber, PayloadMass, Flights, GridFins, Reused, Legs, Block, ReusedCount, Orbit, Site, LandingPad, Serial</code>. (Orbit, Site, LandingPad and Serial are categorical and have been expanded into dummy columns for simplicity.)

In [6]:
# Target vector: the MissionSuccess column as a NumPy array.
Y = df.MissionSuccess.to_numpy()

# Feature table: everything except the target itself and the columns that are
# not used as model inputs.
X = df.drop(
    ['Date', 'Booster', 'Longitude', 'Latitude', 'Outcome', 'MissionSuccess'],
    axis='columns',
)


In [10]:
# Standardize the features data 
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

### Split the data into train and testing sets. Lets start with a 80%/20% split

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=2)

In [14]:
# Report how many samples landed in each split.
print('Training size: ', len(Y_train), ' Testing size: ', len(Y_test))

Training size:  134  Testing size:  34


# Logistic Regression Model

In [15]:
# Hyperparameter grid for the logistic-regression search.
# C is the inverse regularization strength; only the l2 (ridge) penalty is
# searched here (l1 would be lasso).
parameters = {
    "C": [0.01, 0.1, 1],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
}

In [16]:
# Create a Logistic Regression instance. GridSearchCV clones this estimator for
# every candidate and fold, so `lr` is only a template and is never fitted
# itself; use lr_cv.best_estimator_ to reach the fitted model.
lr = LogisticRegression()

# Search for the optimal parameters with GridSearchCV: an exhaustive search over
# the `parameters` grid, scored by 10-fold cross-validation on the training
# split only (the test split stays held out).
lr_cv = GridSearchCV(lr, cv=10, param_grid=parameters)
lr_cv.fit(X_train, Y_train)

# Predict with the tuned model (by default GridSearchCV refits the best
# parameter set on the whole training split), so y_pred is consistent with the
# scores reported from lr_cv rather than coming from an untuned default model.
y_pred = lr_cv.predict(X_test)

In [22]:
# Hyperparameter combination that achieved the best cross-validated score.
print("GridSearch (best) tuned hpyerparameters : ",lr_cv.best_params_)
# best_score_ is the mean cross-validated score of that combination over the
# cv=10 folds of the training split; it is not a test-set figure.
print("and its accuracy :",lr_cv.best_score_)
# Score of the refitted best model on the held-out test split.
print("\nTest Data Accuracy: ",lr_cv.score(X_test, Y_test))

GridSearch (best) tuned hpyerparameters :  {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
and its accuracy : 0.9181318681318682

Test Data Accuracy:  0.8529411764705882
