In [35]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder



In [36]:
# Load the exoplanet table; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/exoplanet_dataset.csv")

In [37]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 57 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   P_NAME                5599 non-null   object 
 1   P_DETECTION           5599 non-null   object 
 2   P_DISCOVERY_FACILITY  5599 non-null   object 
 3   P_YEAR                5599 non-null   int64  
 4   P_UPDATE              5599 non-null   object 
 5   P_MASS                5592 non-null   float64
 6   P_MASS_ORIGIN         5599 non-null   object 
 7   P_RADIUS              5592 non-null   float64
 8   P_PERIOD              5350 non-null   float64
 9   P_SEMI_MAJOR_AXIS     5595 non-null   float64
 10  P_ECCENTRICITY        4822 non-null   float64
 11  P_INCLINATION         4288 non-null   float64
 12  P_OMEGA               1659 non-null   float64
 13  S_NAME                5599 non-null   object 
 14  S_NAME_HD             971 non-null    object 
 15  S_NAME_HIP           

In [38]:

# # These are the Earth-like planets
# clean = data[data['P_HABITABLE'] == 1]

In [39]:
# Columns removed from the frame before modelling (see the `drop` call that
# consumes this list). Order is kept as originally written.
useless_features = [
    # Planet-level columns (P_ prefix)
    "P_NAME",
    "P_DETECTION",
    "P_DISCOVERY_FACILITY",
    "P_YEAR",
    "P_UPDATE",
    "P_MASS_ORIGIN",
    # Star-level name columns (S_NAME*)
    "S_NAME",
    "S_NAME_HD",
    "S_NAME_HIP",
    # Star-level RA / DEC columns in their numeric, _STR and _TXT variants
    "S_RA",
    "S_DEC",
    "S_RA_STR",
    "S_DEC_STR",
    "S_RA_TXT",
    "S_DEC_TXT",
    # Star-level constellation columns
    "S_CONSTELLATION",
    "S_CONSTELLATION_ABR",
    "S_CONSTELLATION_ENG",
]

In [40]:
# Build a new frame without the columns listed in `useless_features`;
# `data` itself is left untouched.
data_cleaned = data.drop(useless_features, axis="columns")

In [41]:
# Display the reduced frame (the rich repr truncates rows and columns).
data_cleaned

Unnamed: 0,P_MASS,P_RADIUS,P_PERIOD,P_SEMI_MAJOR_AXIS,P_ECCENTRICITY,P_INCLINATION,P_OMEGA,S_TYPE,S_MAG,S_DISTANCE,...,P_TYPE,S_TYPE_TEMP,S_LUMINOSITY,S_SNOW_LINE,S_ABIO_ZONE,S_TIDAL_LOCK,P_HABZONE_OPT,P_HABZONE_CON,P_TYPE_TEMP,P_HABITABLE
0,250.00000,13.90,,3.4000,,,,,,,...,Jovian,,,,,0.214133,0,0,,0
1,16.60000,2.90,31.884000,0.1994,0.000,86.920,,,15.48100,1144.8600,...,Neptunian,G,0.814704,2.437046,2.097783,0.316980,0,0,Hot,0
2,5.10000,2.11,6.883376,0.0678,0.000,89.820,,,14.97300,1073.7600,...,Superterran,G,1.096478,2.827247,1.756317,0.459559,0,0,Hot,0
3,12.20000,3.52,1.921036,0.0291,,,,,14.36400,402.9150,...,Neptunian,K,0.299226,1.476943,0.568374,0.443760,0,0,Hot,0
4,4.94000,2.07,11.119907,0.0911,0.000,87.750,,,13.83500,367.0000,...,Superterran,K,0.424620,1.759397,0.768502,0.386150,0,0,Hot,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5594,581.62598,13.40,3999.000000,5.1400,0.160,68.000,163.0,G2 V,7.11716,50.6078,...,Jovian,G,3.228494,4.851363,1.893917,0.541322,0,0,Cold,0
5595,17.20000,2.49,146.050000,0.5800,0.240,89.900,306.0,G2 V,6.46974,33.2673,...,Neptunian,G,2.471724,4.244864,2.520276,0.543834,0,0,Hot,0
5596,11.30000,3.90,29.334340,0.1800,0.042,90.179,-31.0,,15.72600,914.2210,...,Neptunian,G,0.579429,2.055246,1.249824,0.400198,0,0,Hot,0
5597,4.33000,1.92,8.411200,0.0763,0.120,,,K0 V,8.97000,47.2899,...,Superterran,K,0.537032,1.978626,0.787469,0.475336,0,0,Hot,0


In [42]:
# NOTE(review): no visible cell defines `X` or `y`, so on a fresh kernel
# (Restart & Run All) this cell failed with NameError. They are reconstructed
# here on the assumption that `P_HABITABLE` is the target and every other
# remaining column is a feature — confirm against the original (missing) cell.
TARGET_COLUMN = "P_HABITABLE"
X = data_cleaned.drop(columns=[TARGET_COLUMN])
y = data_cleaned[TARGET_COLUMN]

# Partition the feature columns by dtype: object columns are one-hot encoded
# further down, numeric columns are carried over as they are.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

In [44]:
# One-hot encode the object-typed feature columns.
# `sparse_output=False` returns a dense array; `handle_unknown='ignore'` makes
# categories not seen during fitting encode as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_matrix = encoder.fit_transform(X[categorical_cols])
X_encoded = pd.DataFrame(categorical_matrix)

In [45]:
# Label the one-hot columns with the names generated by the fitted encoder.
X_encoded.columns = encoder.get_feature_names_out(categorical_cols)

# Put numeric and encoded features side by side. Both parts get a fresh
# 0..n-1 index first so that the column-wise concat aligns rows by position.
numeric_part = X[numerical_cols].reset_index(drop=True)
encoded_part = X_encoded.reset_index(drop=True)
X_preprocessed = pd.concat([numeric_part, encoded_part], axis=1)

# Replace remaining missing values with the mean of their column.
# NOTE(review): the means are taken over all rows, i.e. before the
# train/test split performed later in the notebook.
X_preprocessed = X_preprocessed.fillna(X_preprocessed.mean())




In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the recorded classification report shows test supports of
# 1108 / 5 / 7, so the classes are heavily imbalanced — consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with default hyper-parameters, seeded for reproducibility.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [47]:
# --- Evaluate on the held-out split --------------------------------------
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Persist the fitted classifier ---------------------------------------
# `joblib` is imported in the notebook's import cell rather than here, so all
# dependencies are visible in one place.
# NOTE(review): only the model is written; the fitted `encoder` and the column
# means used for imputation are not saved — confirm whether consumers of the
# pickle need them to preprocess new data.
joblib.dump(model, "habitability_model.pkl")

# --- Sanity check: predict the first five held-out rows ------------------
new_data = X_test.iloc[:5]
predictions = model.predict(new_data)
print("Predictions:", predictions)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1108
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         7

    accuracy                           1.00      1120
   macro avg       1.00      1.00      1.00      1120
weighted avg       1.00      1.00      1.00      1120

Predictions: [0 0 0 0 0]
