In [1]:
!pip install pandas numpy scikit-learn



In [2]:
#importing required libraries
import pandas as pd
import numpy as np  
from sklearn.model_selection import train_test_split

In [61]:
# Read the launch records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='clean_data_falcon9.csv')

In [62]:
# Show the dtype of every column in the freshly loaded frame.
df.dtypes

FlightNumber        int64
Date               object
BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Outcome            object
Flights             int64
GridFins             bool
Reused               bool
Legs                 bool
LandingPad         object
Block             float64
ReusedCount         int64
Serial             object
Longitude         float64
Latitude          float64
dtype: object

In [63]:
# Collect the distinct labels that occur in the Outcome column.
set(df['Outcome'])

{'False ASDS',
 'False Ocean',
 'False RTLS',
 'None ASDS',
 'None None',
 'True ASDS',
 'True Ocean',
 'True RTLS'}

In [64]:
# Count how often each landing outcome occurs, then print every outcome label
# next to its position in that tally.
landing_outcomes = df['Outcome'].value_counts()
for i, outcome in enumerate(landing_outcomes.index):
    print(f"{i} {outcome}")

0 True ASDS
1 True RTLS
2 None None
3 False ASDS
4 True Ocean
5 False Ocean
6 None ASDS
7 False RTLS


In [65]:
# A landing is treated as unsuccessful unless its outcome label starts with 'True'
# (that covers the 'False ...' and 'None ...' labels). Selecting by label instead of
# by position in value_counts() keeps the set correct even if the counts -- and
# therefore the ordering of landing_outcomes -- change.
bad_outcomes = {
    outcome
    for outcome in landing_outcomes.keys()
    if not outcome.startswith('True')
}
bad_outcomes

{'False ASDS', 'False Ocean', 'False RTLS', 'None ASDS', 'None None'}

In [66]:
# Binary target column: 0 when the outcome is one of bad_outcomes, 1 otherwise.
df['Class'] = (~df['Outcome'].isin(bad_outcomes)).astype('int64')

In [67]:
# Preview the first eight values of the new Class column.
df[['Class']].head(8)

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
5,0
6,1
7,1


In [68]:
# The Class column has been derived from Outcome, so remove the raw label column.
df = df.drop(columns='Outcome')

In [69]:
# Map each remaining column name to its dtype.
df.dtypes.to_dict()


{'FlightNumber': dtype('int64'),
 'Date': dtype('O'),
 'BoosterVersion': dtype('O'),
 'PayloadMass': dtype('float64'),
 'Orbit': dtype('O'),
 'LaunchSite': dtype('O'),
 'Flights': dtype('int64'),
 'GridFins': dtype('bool'),
 'Reused': dtype('bool'),
 'Legs': dtype('bool'),
 'LandingPad': dtype('O'),
 'Block': dtype('float64'),
 'ReusedCount': dtype('int64'),
 'Serial': dtype('O'),
 'Longitude': dtype('float64'),
 'Latitude': dtype('float64'),
 'Class': dtype('int64')}

In [70]:
# Remove the columns that are not used as features for prediction.
df = df.drop(columns=['FlightNumber', 'Date', 'LandingPad'])

In [71]:
# Display the frame after the column drops above.
df

Unnamed: 0,BoosterVersion,PayloadMass,Orbit,LaunchSite,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,Falcon 9,8191.07911,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0003,-80.577366,28.561857,0
1,Falcon 9,525.00000,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0005,-80.577366,28.561857,0
2,Falcon 9,677.00000,ISS,CCSFS SLC 40,1,False,False,False,1.0,0,B0007,-80.577366,28.561857,0
3,Falcon 9,500.00000,PO,VAFB SLC 4E,1,False,False,False,1.0,0,B1003,-120.610829,34.632093,0
4,Falcon 9,3170.00000,GTO,CCSFS SLC 40,1,False,False,False,1.0,0,B1004,-80.577366,28.561857,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,Falcon 9,13260.00000,VLEO,KSC LC 39A,2,True,True,True,5.0,1,B1069,-80.603956,28.608058,1
163,Falcon 9,13260.00000,VLEO,VAFB SLC 4E,7,True,True,True,5.0,6,B1063,-120.610829,34.632093,1
164,Falcon 9,13260.00000,VLEO,CCSFS SLC 40,6,True,True,True,5.0,5,B1067,-80.577366,28.561857,1
165,Falcon 9,13260.00000,VLEO,CCSFS SLC 40,4,True,True,True,5.0,0,B1072,-80.577366,28.561857,1


In [72]:
# Count how many distinct values the Serial column contains.
len(set(df['Serial']))

62

In [73]:
# Remove the Serial column. Reassign the result instead of using inplace=True so
# this cell follows the same pattern as the other column-drop cells.
df = df.drop(columns=['Serial'])

In [74]:
# Cast every boolean column to an integer dtype so the flags are stored as 0/1.
boolean_columns = df.select_dtypes(include=['bool']).columns
df = df.astype({column: int for column in boolean_columns})


In [75]:
# Check the column dtypes after the boolean-to-integer cast.
df.dtypes

BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Flights             int64
GridFins            int64
Reused              int64
Legs                int64
Block             float64
ReusedCount         int64
Longitude         float64
Latitude          float64
Class               int64
dtype: object

In [76]:
# One-hot encode the categorical feature columns and swap them for the encoded ones.
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['BoosterVersion', 'Orbit', 'LaunchSite']

# drop="first" omits the first category of each feature; sparse_output=False makes
# fit_transform return a dense array that can be wrapped in a DataFrame directly.
OHE = OneHotEncoder(sparse_output=False, drop="first")
encoded = OHE.fit_transform(df[categorical_columns])

# Reuse df's index so the encoded rows line up with the original rows.
encoded_df = pd.DataFrame(
    encoded,
    index=df.index,
    columns=OHE.get_feature_names_out(categorical_columns),
)

# Replace the original categorical columns with their encoded counterparts.
df = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

In [77]:
# List the column names after one-hot encoding.
df.columns

Index(['PayloadMass', 'Flights', 'GridFins', 'Reused', 'Legs', 'Block',
       'ReusedCount', 'Longitude', 'Latitude', 'Class', 'Orbit_GEO',
       'Orbit_GTO', 'Orbit_HEO', 'Orbit_ISS', 'Orbit_LEO', 'Orbit_MEO',
       'Orbit_PO', 'Orbit_SO', 'Orbit_SSO', 'Orbit_TLI', 'Orbit_VLEO',
       'LaunchSite_KSC LC 39A', 'LaunchSite_VAFB SLC 4E'],
      dtype='object')

In [78]:
# Check the column dtypes after one-hot encoding.
df.dtypes

PayloadMass               float64
Flights                     int64
GridFins                    int64
Reused                      int64
Legs                        int64
Block                     float64
ReusedCount                 int64
Longitude                 float64
Latitude                  float64
Class                       int64
Orbit_GEO                 float64
Orbit_GTO                 float64
Orbit_HEO                 float64
Orbit_ISS                 float64
Orbit_LEO                 float64
Orbit_MEO                 float64
Orbit_PO                  float64
Orbit_SO                  float64
Orbit_SSO                 float64
Orbit_TLI                 float64
Orbit_VLEO                float64
LaunchSite_KSC LC 39A     float64
LaunchSite_VAFB SLC 4E    float64
dtype: object

In [None]:
# Hold out 20% of the rows as a test set. Class is the target; every other column
# is a feature. The fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='Class'),
    df['Class'],
    test_size=0.2,
    random_state=42,
)

In [83]:
# Report the number of rows in the training and test feature sets, one per line.
print(len(x_train), len(x_test), sep='\n')

133
34


In [84]:
# Report the number of labels in the training and test targets, one per line.
print(len(y_train), len(y_test), sep='\n')

133
34
