In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# --- 1. Load the Data ---
# The CSV location is machine-specific, so it is not baked into the read call.
# Set the KOI_CSV_PATH environment variable to load the file from elsewhere;
# when it is unset, the original location below is used.
DEFAULT_CSV_PATH = r"C:\Users\Jonah\Desktop\Expo\q1_q17_dr25_sup_koi_2025.10.04_20.29.34.csv"
csv_path = os.environ.get("KOI_CSV_PATH", DEFAULT_CSV_PATH)

print("Loading CSV file...")
# Fail early with an actionable message rather than a bare pandas traceback.
if not os.path.isfile(csv_path):
    raise FileNotFoundError(
        f"KOI CSV not found at {csv_path!r}. "
        "Set the KOI_CSV_PATH environment variable to the file's location."
    )

# comment='#' tells pandas not to parse anything from a '#' to the end of the
# line; a line that starts with '#' is skipped entirely.
df = pd.read_csv(csv_path, comment='#')

# --- 2. Define the ONLY features that have data in your file ---
working_features = [
    'koi_period',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
]
# Label column the classifier is trained to predict.
TARGET_COLUMN = 'koi_disposition'
print(f"Using the only 4 available features: {working_features}")

# --- 3. Clean the dataset ---
# Check the schema up front so a wrong or renamed CSV is reported clearly
# instead of surfacing as an indexing error from pandas.
required_columns = working_features + [TARGET_COLUMN]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"CSV is missing required column(s): {missing_columns}")

# Keep only the rows where every feature and the label are present.
df_clean = df[required_columns].dropna()
if df_clean.empty:
    # Without this guard the failure would only show up later, during the split.
    raise ValueError(
        "No rows left after dropping missing values; "
        f"check that {required_columns} actually contain data."
    )

X = df_clean[working_features]
y = df_clean[TARGET_COLUMN]
print(f"Data cleaned. Using {X.shape[0]} rows.")

# --- 4. Split the data and train a simple model ---
# Tunable settings gathered in one place; the same seed drives both the
# train/test split and the random forest.
SEED = 42
HOLDOUT_FRACTION = 0.2
N_TREES = 100

print("Training a new, simpler model...")
# NOTE(review): the split is not stratified; if the label classes turn out to
# be imbalanced, consider passing stratify=y — confirm before changing, since
# it alters the reported accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=HOLDOUT_FRACTION,
    random_state=SEED,
)
# fit() returns the estimator itself, so building and training can be chained.
model = RandomForestClassifier(
    n_estimators=N_TREES,
    random_state=SEED,
).fit(X_train, y_train)

# --- 5. Check the new model's accuracy ---
# Score the model on the held-out rows only.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"✅ New 4-feature model trained with an accuracy of: {accuracy * 100:.2f}%")

# --- 6. Save the final, working model ---
# Written relative to the current working directory.
MODEL_FILENAME = 'exoplanet_model_final.joblib'
joblib.dump(model, MODEL_FILENAME)
print("\n🎉 SUCCESS! A working model has been saved as 'exoplanet_model_final.joblib'!")

Loading CSV file...
Using the only 4 available features: ['koi_period', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co']
Data cleaned. Using 9564 rows.
Training a new, simpler model...
✅ New 4-feature model trained with an accuracy of: 75.69%

🎉 SUCCESS! A working model has been saved as 'exoplanet_model_final.joblib'!
