In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [28]:
df = pd.read_csv('..\data\exoplanents_cleaned_new.csv')

  df = pd.read_csv('..\data\exoplanents_cleaned_new.csv')


In [29]:
df.sample(2)

Unnamed: 0,koi_disposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad
3001,CANDIDATE,0.78,0,0,0,0,21.086786,135.5636,0.702,0.978,499.6,2.08,651,42.44,11.0,6084,4.542,0.863
5643,FALSE POSITIVE,0.0,0,1,0,0,23.737953,147.006678,1.004,8.50818,95884.0,53.27,620,34.94,1221.2,5559,4.394,1.0


In [30]:
feature_cols = [
    'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
    'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff',
    'koi_slogg', 'koi_srad'
]

In [31]:
X = df[feature_cols]

In [43]:
X.columns.tolist()

['koi_score',
 'koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_fpflag_co',
 'koi_fpflag_ec',
 'koi_period',
 'koi_time0bk',
 'koi_impact',
 'koi_duration',
 'koi_depth',
 'koi_prad',
 'koi_teq',
 'koi_insol',
 'koi_model_snr',
 'koi_steff',
 'koi_slogg',
 'koi_srad']

In [32]:
y = df['koi_disposition'].apply(lambda x: 1 if x == 'CONFIRMED' else 0)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, 
)

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
models = {
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(probability=True),
    "RF":RandomForestClassifier(n_estimators=100, random_state=42)
}


In [36]:
results = []

In [37]:
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({
        'Model': name,
        'Accuracy': round(acc, 4),
        'F1 Score': round(f1, 4)
    })


In [38]:
results

[{'Model': 'KNN', 'Accuracy': 0.8897, 'F1 Score': 0.8295},
 {'Model': 'Naive Bayes', 'Accuracy': 0.9122, 'F1 Score': 0.8713},
 {'Model': 'Decision Tree', 'Accuracy': 0.9352, 'F1 Score': 0.8921},
 {'Model': 'SVM', 'Accuracy': 0.9005, 'F1 Score': 0.8568},
 {'Model': 'RF', 'Accuracy': 0.9532, 'F1 Score': 0.9231}]

In [39]:
import joblib
joblib.dump(models['RF'],'../models/Random_forest_exo.pkl')
joblib.dump(scaler,'../models/scaler.pkl')
joblib.dump(X.columns.tolist(),'../models/columns.pkl')

['../models/columns.pkl']