In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
data=pd.read_csv("../data/exoplanet_dataset.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 57 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   P_NAME                5599 non-null   object 
 1   P_DETECTION           5599 non-null   object 
 2   P_DISCOVERY_FACILITY  5599 non-null   object 
 3   P_YEAR                5599 non-null   int64  
 4   P_UPDATE              5599 non-null   object 
 5   P_MASS                5592 non-null   float64
 6   P_MASS_ORIGIN         5599 non-null   object 
 7   P_RADIUS              5592 non-null   float64
 8   P_PERIOD              5350 non-null   float64
 9   P_SEMI_MAJOR_AXIS     5595 non-null   float64
 10  P_ECCENTRICITY        4822 non-null   float64
 11  P_INCLINATION         4288 non-null   float64
 12  P_OMEGA               1659 non-null   float64
 13  S_NAME                5599 non-null   object 
 14  S_NAME_HD             971 non-null    object 
 15  S_NAME_HIP           

In [4]:

# #these the earth like planet
# clean = data[data['P_HABITABLE'] == 1]

In [5]:
useless_features = [
    'P_NAME', 'P_DETECTION', 'P_DISCOVERY_FACILITY', 'P_YEAR', 'P_UPDATE',
    'P_MASS_ORIGIN', 'S_NAME', 'S_NAME_HD', 'S_NAME_HIP', 'S_RA', 'S_DEC',
    'S_RA_STR', 'S_DEC_STR', 'S_RA_TXT', 'S_DEC_TXT', 'S_CONSTELLATION',
    'S_CONSTELLATION_ABR', 'S_CONSTELLATION_ENG'
]

In [9]:
clean_data = data.drop(columns=useless_features)

In [10]:
X = clean_data.drop(columns=['P_HABITABLE'])
y = clean_data['P_HABITABLE']

In [11]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

In [12]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   P_MASS             5592 non-null   float64
 1   P_RADIUS           5592 non-null   float64
 2   P_PERIOD           5350 non-null   float64
 3   P_SEMI_MAJOR_AXIS  5595 non-null   float64
 4   P_ECCENTRICITY     4822 non-null   float64
 5   P_INCLINATION      4288 non-null   float64
 6   P_OMEGA            1659 non-null   float64
 7   S_TYPE             2021 non-null   object 
 8   S_MAG              5380 non-null   float64
 9   S_DISTANCE         5578 non-null   float64
 10  S_TEMPERATURE      5380 non-null   float64
 11  S_MASS             5595 non-null   float64
 12  S_RADIUS           5367 non-null   float64
 13  S_METALLICITY      5166 non-null   float64
 14  S_AGE              4392 non-null   float64
 15  S_LOG_LUM          5364 non-null   float64
 16  S_LOG_G            5353 

In [13]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# hand j
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))

In [14]:
X_encoded.columns = encoder.get_feature_names_out(categorical_cols)

X_preprocessed = pd.concat([X[numerical_cols].reset_index(drop=True), X_encoded.reset_index(drop=True)], axis=1)

X_preprocessed.fillna(X_preprocessed.mean(), inplace=True)




In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [16]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

import joblib
joblib.dump(model, "habitability_model.pkl")

new_data = X_test.iloc[:5]
predictions = model.predict(new_data)
print("Predictions:", predictions)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1108
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         7

    accuracy                           1.00      1120
   macro avg       1.00      1.00      1.00      1120
weighted avg       1.00      1.00      1.00      1120

Predictions: [0 0 0 0 0]
