In [47]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.3-py3-none-win_amd64.whl (149.9 MB)
   ---------------------------------------- 0.0/149.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/149.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/149.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/149.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/149.9 MB 1.1 MB/s eta 0:02:18
   ---------------------------------------- 1.0/149.9 MB 1.2 MB/s eta 0:02:02
   ---------------------------------------- 1.3/149.9 MB 1.2 MB/s eta 0:02:04
   ---------------------------------------- 1.6/149.9 MB 1.3 MB/s eta 0:01:59
   ---------------------------------------- 1.8/149.9 MB 1.3 MB/s eta 0:01:55
    --------------------------------------- 2.4/149.9 MB 1.3 MB/s eta 0:01:50
    ------------------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [57]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import requests

# 2. Load SpaceX Launch Data from API
launch_url = "https://api.spacexdata.com/v4/launches"
response = requests.get(launch_url)

if response.status_code == 200:
    launches = pd.json_normalize(response.json())
    print(" Launch data loaded from API!")
else:
    print(" Failed to fetch launch data.")
    exit()

# 3. Feature Engineering
# Use a subset of relevant features
features = ['date_utc', 'success', 'rocket', 'launchpad', 'details']
df = launches[features].copy()

# Drop rows with missing target
df = df[df['success'].notna()]

# Convert target to binary (boolean to int)
df['success'] = df['success'].astype(int)

# Extract date features
df['date'] = pd.to_datetime(df['date_utc'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Use text length as proxy feature (details)
df['details_len'] = df['details'].fillna('').apply(len)

# Select features for modeling
X = df[['rocket', 'launchpad', 'year', 'month', 'day', 'details_len']]
y = df['success']

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Preprocessing Pipeline for Categorical Features
cat_features = ['rocket', 'launchpad']
num_features = ['year', 'month', 'day', 'details_len']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
], remainder='passthrough')

# 6. Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# 7. Train and Evaluate Models
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred)
    }

# 8. Display Evaluation Results
results_df = pd.DataFrame(results).T
print("\nModel Evaluation Summary:")
print(results_df)

# 9. Hyperparameter Tuning for Random Forest
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10, None]
}

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

print("\n Best Random Forest Parameters:")
print(grid_search.best_params_)

# Final Model Evaluation
y_pred_final = grid_search.predict(X_test)
final_f1 = f1_score(y_test, y_pred_final)
print(f"\n Final Model F1-score: {final_f1:.4f}")

 Launch data loaded from API!



Parameters: { "use_label_encoder" } are not used.





Model Evaluation Summary:
                     Accuracy  Precision    Recall  F1-score
Logistic Regression  1.000000        1.0  1.000000  1.000000
Decision Tree        0.973684        1.0  0.973684  0.986667
Random Forest        1.000000        1.0  1.000000  1.000000
XGBoost              1.000000        1.0  1.000000  1.000000

 Best Random Forest Parameters:
{'classifier__max_depth': 5, 'classifier__n_estimators': 50}

 Final Model F1-score: 1.0000
