In [None]:
!pip install pandas numpy scikit-learn



In [None]:
#importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load 'clean_data_falcon9.csv' into the working DataFrame
df = pd.read_csv("clean_data_falcon9.csv")

In [None]:
# Show the dtype pandas inferred for each column of the loaded file
df.dtypes

FlightNumber        int64
Date               object
BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Outcome            object
Flights             int64
GridFins             bool
Reused               bool
Legs                 bool
LandingPad         object
Block             float64
ReusedCount         int64
Serial             object
Longitude         float64
Latitude          float64
dtype: object

In [None]:
# Distinct labels present in the Outcome column
set(df['Outcome'])

{'False ASDS',
 'False Ocean',
 'False RTLS',
 'None ASDS',
 'None None',
 'True ASDS',
 'True Ocean',
 'True RTLS'}

In [None]:
# Count how often each Outcome label occurs and print every label next to
# its position in the (most frequent first) ordering.
landing_outcomes = df["Outcome"].value_counts()
for i, outcome in enumerate(landing_outcomes.index):
    print(i, outcome)

0 True ASDS
1 True RTLS
2 None None
3 False ASDS
4 True Ocean
5 False Ocean
6 None ASDS
7 False RTLS


In [None]:
# Every outcome label that does not start with 'True' counts as a bad outcome.
# Selecting by label text instead of by position in the value_counts() ordering
# keeps the set stable when the counts (and therefore the ordering) change.
bad_outcomes = {
    outcome
    for outcome in df['Outcome'].unique()
    if not str(outcome).startswith('True')
}
bad_outcomes

{'False ASDS', 'False Ocean', 'False RTLS', 'None ASDS', 'None None'}

<h1>Preparing data to train models</h1>

In [None]:
# Binary target: Class is 0 when the row's Outcome is one of bad_outcomes,
# and 1 for every other row.
is_bad_outcome = df['Outcome'].isin(bad_outcomes)
df['Class'] = (~is_bad_outcome).astype('int64')

In [None]:
# Preview the first eight values of the newly created Class column
df[['Class']].head(8)

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
5,0
6,1
7,1


In [None]:
# Outcome has been replaced by the Class column, so remove it
df = df.drop(columns=['Outcome'])

In [None]:
# Column name -> dtype mapping after Outcome was dropped and Class was added
dict(df.dtypes)


{'FlightNumber': dtype('int64'),
 'Date': dtype('O'),
 'BoosterVersion': dtype('O'),
 'PayloadMass': dtype('float64'),
 'Orbit': dtype('O'),
 'LaunchSite': dtype('O'),
 'Flights': dtype('int64'),
 'GridFins': dtype('bool'),
 'Reused': dtype('bool'),
 'Legs': dtype('bool'),
 'LandingPad': dtype('O'),
 'Block': dtype('float64'),
 'ReusedCount': dtype('int64'),
 'Serial': dtype('O'),
 'Longitude': dtype('float64'),
 'Latitude': dtype('float64'),
 'Class': dtype('int64')}

In [None]:
# Remove the columns that are not used as model inputs
unused_columns = ['FlightNumber', 'Date', 'LandingPad']
df = df.drop(columns=unused_columns)

In [None]:
# Display the frame after the column drops (pandas truncates the rendering)
df

Unnamed: 0,BoosterVersion,PayloadMass,Orbit,LaunchSite,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,Falcon 9,8191.07911,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0003,-80.577366,28.561857,0
1,Falcon 9,525.00000,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0005,-80.577366,28.561857,0
2,Falcon 9,677.00000,ISS,CCSFS SLC 40,1,False,False,False,1.0,0,B0007,-80.577366,28.561857,0
3,Falcon 9,500.00000,PO,VAFB SLC 4E,1,False,False,False,1.0,0,B1003,-120.610829,34.632093,0
4,Falcon 9,3170.00000,GTO,CCSFS SLC 40,1,False,False,False,1.0,0,B1004,-80.577366,28.561857,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,Falcon 9,13260.00000,VLEO,KSC LC 39A,2,True,True,True,5.0,1,B1069,-80.603956,28.608058,1
163,Falcon 9,13260.00000,VLEO,VAFB SLC 4E,7,True,True,True,5.0,6,B1063,-120.610829,34.632093,1
164,Falcon 9,13260.00000,VLEO,CCSFS SLC 40,6,True,True,True,5.0,5,B1067,-80.577366,28.561857,1
165,Falcon 9,13260.00000,VLEO,CCSFS SLC 40,4,True,True,True,5.0,0,B1072,-80.577366,28.561857,1


In [None]:
# Number of distinct values in the Serial column
len(set(df['Serial']))

62

In [None]:
# Drop Serial by rebinding df, matching the other column drops in this
# notebook instead of mutating the frame in place.
df = df.drop(columns=['Serial'])

In [None]:
# Find every bool-typed column and cast those columns to integers
boolean_columns=df.select_dtypes(include=['bool']).columns
df[boolean_columns]=df[boolean_columns].astype(int)


In [None]:
# Check the dtypes after the boolean columns were cast to integers
df.dtypes

BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Flights             int64
GridFins            int64
Reused              int64
Legs                int64
Block             float64
ReusedCount         int64
Longitude         float64
Latitude          float64
Class               int64
dtype: object

In [None]:
# One-hot encode the categorical columns; drop="first" omits the first
# category of each column from the encoded output.
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['BoosterVersion', 'Orbit', 'LaunchSite']
OHE = OneHotEncoder(sparse_output=False, drop="first")
encoded = OHE.fit_transform(df[categorical_columns])

# Wrap the encoded array in a DataFrame that shares df's index so the two
# frames line up row by row when concatenated.
encoded_df = pd.DataFrame(
    data=encoded,
    index=df.index,
    columns=OHE.get_feature_names_out(categorical_columns),
)

# Replace the original categorical columns with their encoded counterparts
df = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

In [None]:
# List the columns after one-hot encoding
df.columns

Index(['PayloadMass', 'Flights', 'GridFins', 'Reused', 'Legs', 'Block',
       'ReusedCount', 'Longitude', 'Latitude', 'Class', 'Orbit_GEO',
       'Orbit_GTO', 'Orbit_HEO', 'Orbit_ISS', 'Orbit_LEO', 'Orbit_MEO',
       'Orbit_PO', 'Orbit_SO', 'Orbit_SSO', 'Orbit_TLI', 'Orbit_VLEO',
       'LaunchSite_KSC LC 39A', 'LaunchSite_VAFB SLC 4E'],
      dtype='object')

In [None]:
# Check the dtypes of the frame after one-hot encoding
df.dtypes

PayloadMass               float64
Flights                     int64
GridFins                    int64
Reused                      int64
Legs                        int64
Block                     float64
ReusedCount                 int64
Longitude                 float64
Latitude                  float64
Class                       int64
Orbit_GEO                 float64
Orbit_GTO                 float64
Orbit_HEO                 float64
Orbit_ISS                 float64
Orbit_LEO                 float64
Orbit_MEO                 float64
Orbit_PO                  float64
Orbit_SO                  float64
Orbit_SSO                 float64
Orbit_TLI                 float64
Orbit_VLEO                float64
LaunchSite_KSC LC 39A     float64
LaunchSite_VAFB SLC 4E    float64
dtype: object

In [None]:
# Split into training and testing sets: 20% of the rows are held out for
# testing, and the fixed random_state makes the split repeatable.
features = df.drop(columns='Class')
target = df['Class']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Row counts of the training and testing feature sets, one per line
print(len(x_train), len(x_test), sep='\n')

133
34


In [None]:
# Row counts of the training and testing targets, one per line
print(len(y_train), len(y_test), sep='\n')

133
34


<h1>Training Classification Models</h1>

In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
    --------------------------------------- 1.3/72.0 MB 5.1 MB/s eta 0:00:14
   - -------------------------------------- 2.6/72.0 MB 5.8 MB/s eta 0:00:13
   -- ------------------------------------- 3.9/72.0 MB 5.9 MB/s eta 0:00:12
   -- ------------------------------------- 5.2/72.0 MB 5.9 MB/s eta 0:00:12
   --- ------------------------------------ 6.6/72.0 MB 6.0 MB/s eta 0:00:11
   ---- ----------------------------------- 7.9/72.0 MB 6.0 MB/s eta 0:00:11
   ---- ----------------------------------- 8.9/72.0 MB 5.9 MB/s eta 0:00:11
   ----- ---------------------------------- 10.2/72.0 MB 6.0 MB/s eta 0:00:11
   ------ --------------------------------- 11.5/72.0 MB 6.0 MB/s eta 0:00:11
   ------- -------

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC as SVM
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve

In [None]:
## Hyperparameter search spaces
# One dictionary per model, mapping a constructor argument name to the list
# of candidate values the randomized search may sample.

# Random forest. Defined exactly once: a second, later definition used to
# overwrite this one with max_features="auto", a value current scikit-learn
# releases reject (which aborts the search because error_score="raise").
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, "sqrt", 8, "log2"],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

# Support vector classifier
svm_params = {
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "degree": [2, 3, 4, 5],
    "gamma": ['scale', 'auto']
}

# Gaussian naive Bayes: 20 log-spaced smoothing values from 1e-12 to 1e-1
naive_bayes_params = {
    "var_smoothing": np.logspace(-12, -1, 20)
}

# Logistic regression
LR_params = {
    "C": [0.1, 1, 10, 100, np.inf],
    "l1_ratio": [0.0, 0.2, 0.3, 0.5, 0.7, 0.8, 1.0],
    "solver": ['saga']
}

# AdaBoost
adaboost_param = {
    "n_estimators": [50, 60, 70, 80, 90]
}

# k-nearest neighbours
knn_params = {
    "n_neighbors": [3, 5, 7, 9, 11],
    "weights": ['uniform', 'distance'],
    "metric": ['euclidean', 'manhattan', 'minkowski']
}

# Decision tree
decisiontree_param = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2']
}

# XGBoost
xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}

# Gradient boosting
gradient_params = {"loss": ['log_loss', 'exponential'],
                   "criterion": ['friedman_mse', 'squared_error'],
                   "min_samples_split": [2, 8, 15, 20],
                   "n_estimators": [100, 200, 500],
                   "max_depth": [5, 8, 15, None, 10]}

In [None]:
# Models list for Hyperparameter tuning



# Each entry is (short name, unfitted estimator, hyperparameter search space).
randomcv_models = [
    ("LR", LogisticRegression(), LR_params),
    ("SVM", SVM(), svm_params),
    ("NB", GaussianNB(), naive_bayes_params),
    ("knn", KNeighborsClassifier(), knn_params),
    ("DT", DecisionTreeClassifier(), decisiontree_param),
    ("RF", RandomForestClassifier(), rf_params),
    # use_label_encoder is intentionally not passed: it is no longer an
    # XGBClassifier option in current XGBoost releases and would only
    # produce "parameter not used" warnings on every fit.
    ("XGB", XGBClassifier(eval_metric='logloss'), xgboost_params),
    ("gradientBoost", GradientBoostingClassifier(), gradient_params),
    ("AB", AdaBoostClassifier(), adaboost_param),
]

In [None]:
results = []  # collects one dict of parameters and metrics per tuned model


In [None]:
from sklearn.model_selection import RandomizedSearchCV


def _score_predictions(y_true, y_pred):
    """Compute the evaluation metrics for one set of hard class predictions.

    Returns a dict keyed by metric label. F1 is weighted across classes;
    precision and recall use the scikit-learn defaults. The ROC AUC is
    computed from predicted class labels rather than scores or probabilities,
    so for this binary target it equals the mean of the per-class recalls.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred, average='weighted'),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Roc Auc Score': roc_auc_score(y_true, y_pred),
    }


def _print_scores(heading, scores):
    """Print a heading followed by the metrics from _score_predictions."""
    print(heading)
    print('- Accuracy: {:.4f}'.format(scores['Accuracy']))
    print('- F1 score: {:.4f}'.format(scores['F1 Score']))
    print('- Precision: {:.4f}'.format(scores['Precision']))
    print('- Recall: {:.4f}'.format(scores['Recall']))
    print('- Roc Auc Score: {:.4f}'.format(scores['Roc Auc Score']))


# Best hyperparameters found for each model, keyed by the model's short name
model_param = {}
for name, model, params in randomcv_models:
    # n_iter is an upper bound on sampled candidates. random_state makes the
    # sampling of candidates repeatable between runs.
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1,
                                error_score="raise",
                                random_state=42,
                                )
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_
    print(f"---------------- Best Params for {name} -------------------")
    print(model_param[name])

    # Predict with the best estimator found by the search and score both splits
    train_scores = _score_predictions(y_train, random.predict(x_train))
    test_scores = _score_predictions(y_test, random.predict(x_test))

    # Label the report with the model being evaluated in this iteration
    # (previously this printed from unrelated `models` / `i` variables).
    print(name)

    _print_scores('Model performance for Training set', train_scores)
    print('----------------------------------')
    _print_scores('Model performance for Test set', test_scores)

    # One result row per model, with Train/Test columns interleaved per metric
    row = {'Model': name, 'Model Parameters': model_param[name]}
    for metric in train_scores:
        row[f'Train {metric}'] = train_scores[metric]
        row[f'Test {metric}'] = test_scores[metric]
    results.append(row)

    print('='*35)
    print('\n')


# Persist the collected metrics for all models to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv('model_performance_results.csv', index=False)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for gradientBoost -------------------
{'n_estimators': 500, 'min_samples_split': 15, 'max_depth': 5, 'loss': 'exponential', 'criterion': 'friedman_mse'}
Adaboost
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8529
- F1 score: 0.8553
- Precision: 0.9167
- Recall: 0.8800
- Roc Auc Score: 0.8289


