In [1]:
!pip install pandas numpy scikit-learn



In [61]:
#importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [62]:
df=pd.read_csv('/content/clean_data_falcon9.csv')

In [63]:
df.dtypes

Unnamed: 0,0
FlightNumber,int64
Date,object
BoosterVersion,object
PayloadMass,float64
Orbit,object
LaunchSite,object
Outcome,object
Flights,int64
GridFins,bool
Reused,bool


In [64]:
set(df.Outcome)

{'False ASDS',
 'False Ocean',
 'False RTLS',
 'None ASDS',
 'None None',
 'True ASDS',
 'True Ocean',
 'True RTLS'}

In [65]:
#converting the outcome column to numerical values
landing_outcomes = df['Outcome'].value_counts()
for i,outcome in enumerate(landing_outcomes.keys()):
    print(i,outcome)

0 True ASDS
1 True RTLS
2 None None
3 False ASDS
4 True Ocean
5 False Ocean
6 None ASDS
7 False RTLS


In [66]:
bad_outcomes=set(landing_outcomes.keys()[[2,3,5,6,7]])
bad_outcomes

{'False ASDS', 'False Ocean', 'False RTLS', 'None ASDS', 'None None'}

<h1>Preparing data to train models<h1>

In [67]:
# landing_class = 0 if bad_outcome
# landing_class = 1 otherwise

df['Class']=[0 if i in bad_outcomes else 1 for i in df['Outcome']]

In [68]:
# df['Class']=landing_class
df[['Class']].head(8)

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
5,0
6,1
7,1


In [69]:
df=df.drop('Outcome',axis=1)

In [70]:
dict(df.dtypes)


{'FlightNumber': dtype('int64'),
 'Date': dtype('O'),
 'BoosterVersion': dtype('O'),
 'PayloadMass': dtype('float64'),
 'Orbit': dtype('O'),
 'LaunchSite': dtype('O'),
 'Flights': dtype('int64'),
 'GridFins': dtype('bool'),
 'Reused': dtype('bool'),
 'Legs': dtype('bool'),
 'LandingPad': dtype('O'),
 'Block': dtype('float64'),
 'ReusedCount': dtype('int64'),
 'Serial': dtype('O'),
 'Longitude': dtype('float64'),
 'Latitude': dtype('float64'),
 'Class': dtype('int64')}

In [71]:
#dropping columns that are not required for prediction
df=df.drop(['FlightNumber','Date','LandingPad','LaunchSite'],axis=1)

In [72]:
df

Unnamed: 0,BoosterVersion,PayloadMass,Orbit,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,Falcon 9,8191.07911,LEO,1,False,False,False,1.0,0,B0003,-80.577366,28.561857,0
1,Falcon 9,525.00000,LEO,1,False,False,False,1.0,0,B0005,-80.577366,28.561857,0
2,Falcon 9,677.00000,ISS,1,False,False,False,1.0,0,B0007,-80.577366,28.561857,0
3,Falcon 9,500.00000,PO,1,False,False,False,1.0,0,B1003,-120.610829,34.632093,0
4,Falcon 9,3170.00000,GTO,1,False,False,False,1.0,0,B1004,-80.577366,28.561857,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,Falcon 9,13260.00000,VLEO,2,True,True,True,5.0,1,B1069,-80.603956,28.608058,1
163,Falcon 9,13260.00000,VLEO,7,True,True,True,5.0,6,B1063,-120.610829,34.632093,1
164,Falcon 9,13260.00000,VLEO,6,True,True,True,5.0,5,B1067,-80.577366,28.561857,1
165,Falcon 9,13260.00000,VLEO,4,True,True,True,5.0,0,B1072,-80.577366,28.561857,1


In [73]:
len(set(df['Serial']))

62

In [74]:
df.drop(['Serial'],axis=1,inplace=True)

In [75]:
boolean_columns=df.select_dtypes(include=['bool']).columns
df[boolean_columns]=df[boolean_columns].astype(int)


In [76]:
df.dtypes

Unnamed: 0,0
BoosterVersion,object
PayloadMass,float64
Orbit,object
Flights,int64
GridFins,int64
Reused,int64
Legs,int64
Block,float64
ReusedCount,int64
Longitude,float64


In [78]:
#one hot encoding categorical columns
from sklearn.preprocessing import OneHotEncoder
categorical_columns=['BoosterVersion','Orbit']
OHE=OneHotEncoder(sparse_output=False,drop="first")
encoded=OHE.fit_transform(df[categorical_columns])

encoded_df = pd.DataFrame(
    encoded,
    columns=OHE.get_feature_names_out(categorical_columns),
    index=df.index
)

# drop original categorical columns
df = df.drop(columns=categorical_columns)

# add encoded columns
df = pd.concat([df, encoded_df], axis=1)

In [79]:
df.columns

Index(['PayloadMass', 'Flights', 'GridFins', 'Reused', 'Legs', 'Block',
       'ReusedCount', 'Longitude', 'Latitude', 'Class', 'Orbit_GEO',
       'Orbit_GTO', 'Orbit_HEO', 'Orbit_ISS', 'Orbit_LEO', 'Orbit_MEO',
       'Orbit_PO', 'Orbit_SO', 'Orbit_SSO', 'Orbit_TLI', 'Orbit_VLEO'],
      dtype='object')

In [80]:
df.dtypes

Unnamed: 0,0
PayloadMass,float64
Flights,int64
GridFins,int64
Reused,int64
Legs,int64
Block,float64
ReusedCount,int64
Longitude,float64
Latitude,float64
Class,int64


In [81]:
#splitting the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split((df.drop('Class',axis=1)), df['Class'], test_size=0.2, random_state=42)

In [82]:
print(len(x_train))
print(len(x_test))

133
34


In [83]:
print(len(y_train))
print(len(y_test))

133
34


<h1>Training Classification Models<h1>

In [84]:
!pip install xgboost



In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC as SVM
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve

In [86]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [87]:
## Hyperparameter Training
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "sqrt", 8,"log2"],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

svm_params = {
            "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
            "degree": [2, 3, 4, 5],
            "gamma": ['scale', 'auto']
             }
naive_bayes_params = {

             "var_smoothing": np.logspace(-12, -1, 20)
}

LR_params={
    "C": [0.1, 1, 10, 100, np.inf],
    "l1_ratio": [0.0,0.2,0.3, 0.5,0.7,0.8, 1.0],
    "solver": ['saga']
}
adaboost_param={
    "n_estimators":[50,60,70,80,90]
}

knn_params={
    "n_neighbors":[3,5,7,9,11],
    "weights":['uniform','distance'],
    "metric":['euclidean','manhattan','minkowski']
}

decisiontree_param={
    'criterion':['gini','entropy', 'log_loss'],
    'splitter':['best','random'],
    'max_depth':[1,2,3,4,5],
    'max_features':['sqrt','log2']
}



xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}
gradient_params={"loss": ['log_loss','exponential'],
             "criterion": ['friedman_mse','squared_error'],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500],
              "max_depth": [5, 8, 15, None, 10]
                }

In [88]:
# Models list for Hyperparameter tuning



randomcv_models = [
     ("LR", LogisticRegression(), LR_params),
     ("SVM", SVM(), svm_params),
  ("NB", GaussianNB(), naive_bayes_params),
   ("knn", KNeighborsClassifier(), knn_params),
("DT", DecisionTreeClassifier(), decisiontree_param),
 ("RF", RandomForestClassifier(), rf_params),
("XGB", XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgboost_params),
   ("gradientBoost", GradientBoostingClassifier(), gradient_params),
   ("AB", AdaBoostClassifier(), adaboost_param),
                   ]

In [89]:
results = []  # to store the results of each model


In [90]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1,
                                   error_score="raise"
                                   )
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_
    print(f"---------------- Best Params for {name} -------------------")
    print(model_param[name])

     # Make predictions
    y_train_pred = random.predict(x_train)
    y_test_pred = random.predict(x_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc




    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))

    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))



    print('----------------------------------')

    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    results.append({
        'Model': name,
        'Model Parameters': model_param[name],
        'Train Accuracy': model_train_accuracy,
        'Test Accuracy': model_test_accuracy,
        'Train F1 Score': model_train_f1,
        'Test F1 Score': model_test_f1,
        'Train Precision': model_train_precision,
        'Test Precision': model_test_precision,
        'Train Recall': model_train_recall,
        'Test Recall': model_test_recall,
        'Train Roc Auc Score': model_train_rocauc_score,
        'Test Roc Auc Score': model_test_rocauc_score
    })



    print('='*35)
    print('\n')


results_df = pd.DataFrame(results)
results_df.to_csv('model_performance_results.csv', index=False)


Fitting 3 folds for each of 35 candidates, totalling 105 fits




---------------- Best Params for LR -------------------
{'solver': 'saga', 'l1_ratio': 0.2, 'C': 100}
Model performance for Training set
- Accuracy: 0.9549
- F1 score: 0.9519
- Precision: 0.9487
- Recall: 1.0000
- Roc Auc Score: 0.8636
----------------------------------
Model performance for Test set
- Accuracy: 0.8824
- F1 score: 0.8773
- Precision: 0.8889
- Recall: 0.9600
- Roc Auc Score: 0.8133


Fitting 3 folds for each of 32 candidates, totalling 96 fits
---------------- Best Params for SVM -------------------
{'kernel': 'linear', 'gamma': 'scale', 'degree': 2}
Model performance for Training set
- Accuracy: 0.9323
- F1 score: 0.9249
- Precision: 0.9250
- Recall: 1.0000
- Roc Auc Score: 0.7955
----------------------------------
Model performance for Test set
- Accuracy: 0.8529
- F1 score: 0.8424
- Precision: 0.8571
- Recall: 0.9600
- Roc Auc Score: 0.7578


Fitting 3 folds for each of 20 candidates, totalling 60 fits




---------------- Best Params for NB -------------------
{'var_smoothing': np.float64(0.1)}
Model performance for Training set
- Accuracy: 0.7970
- F1 score: 0.8181
- Precision: 0.9565
- Recall: 0.7928
- Roc Auc Score: 0.8055
----------------------------------
Model performance for Test set
- Accuracy: 0.7647
- F1 score: 0.7759
- Precision: 0.9048
- Recall: 0.7600
- Roc Auc Score: 0.7689


Fitting 3 folds for each of 30 candidates, totalling 90 fits




---------------- Best Params for knn -------------------
{'weights': 'distance', 'n_neighbors': 7, 'metric': 'euclidean'}
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8824
- F1 score: 0.8699
- Precision: 0.8621
- Recall: 1.0000
- Roc Auc Score: 0.7778


Fitting 3 folds for each of 60 candidates, totalling 180 fits




---------------- Best Params for DT -------------------
{'splitter': 'best', 'max_features': 'log2', 'max_depth': 3, 'criterion': 'entropy'}
Model performance for Training set
- Accuracy: 0.9098
- F1 score: 0.9098
- Precision: 0.9459
- Recall: 0.9459
- Roc Auc Score: 0.8366
----------------------------------
Model performance for Test set
- Accuracy: 0.8529
- F1 score: 0.8501
- Precision: 0.8846
- Recall: 0.9200
- Roc Auc Score: 0.7933


Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for RF -------------------
{'n_estimators': 500, 'min_samples_split': 15, 'max_features': 8, 'max_depth': 5}
Model performance for Training set
- Accuracy: 0.9398
- F1 score: 0.9374
- Precision: 0.9478
- Recall: 0.9820
- Roc Auc Score: 0.8546
----------------------------------
Model performance for Test set
- Accuracy: 0.8529
- F1 score: 0.8553
- Precision: 0.9167
- Recall: 0.8800
- Roc Auc Score: 0.8289


Fitting 3 folds for each of 100 candidates, totalling 30

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


---------------- Best Params for XGB -------------------
{'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 0.3}
Model performance for Training set
- Accuracy: 0.9850
- F1 score: 0.9850
- Precision: 0.9910
- Recall: 0.9910
- Roc Auc Score: 0.9728
----------------------------------
Model performance for Test set
- Accuracy: 0.8824
- F1 score: 0.8773
- Precision: 0.8889
- Recall: 0.9600
- Roc Auc Score: 0.8133


Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for gradientBoost -------------------
{'n_estimators': 200, 'min_samples_split': 15, 'max_depth': 5, 'loss': 'exponential', 'criterion': 'squared_error'}
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8529
- F1 score: 0.8553
- Precision: 0.9167
- Recall: 0.8800
- Roc Auc Score: 0.8289


Fitting 3 



---------------- Best Params for AB -------------------
{'n_estimators': 50}
Model performance for Training set
- Accuracy: 0.9774
- F1 score: 0.9768
- Precision: 0.9737
- Recall: 1.0000
- Roc Auc Score: 0.9318
----------------------------------
Model performance for Test set
- Accuracy: 0.8529
- F1 score: 0.8501
- Precision: 0.8846
- Recall: 0.9200
- Roc Auc Score: 0.7933


