In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
data=pd.read_csv("../data/exoplanet_dataset.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 57 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   P_NAME                5599 non-null   object 
 1   P_DETECTION           5599 non-null   object 
 2   P_DISCOVERY_FACILITY  5599 non-null   object 
 3   P_YEAR                5599 non-null   int64  
 4   P_UPDATE              5599 non-null   object 
 5   P_MASS                5592 non-null   float64
 6   P_MASS_ORIGIN         5599 non-null   object 
 7   P_RADIUS              5592 non-null   float64
 8   P_PERIOD              5350 non-null   float64
 9   P_SEMI_MAJOR_AXIS     5595 non-null   float64
 10  P_ECCENTRICITY        4822 non-null   float64
 11  P_INCLINATION         4288 non-null   float64
 12  P_OMEGA               1659 non-null   float64
 13  S_NAME                5599 non-null   object 
 14  S_NAME_HD             971 non-null    object 
 15  S_NAME_HIP           

In [4]:

# #these the earth like planet
# clean = data[data['P_HABITABLE'] == 1]

In [5]:
useless_features = [
    'P_NAME', 'P_DETECTION', 'P_DISCOVERY_FACILITY', 'P_YEAR', 'P_UPDATE',
    'P_MASS_ORIGIN', 'S_NAME', 'S_NAME_HD', 'S_NAME_HIP', 'S_RA', 'S_DEC',
    'S_RA_STR', 'S_DEC_STR', 'S_RA_TXT', 'S_DEC_TXT', 'S_CONSTELLATION',
    'S_CONSTELLATION_ABR', 'S_CONSTELLATION_ENG'
]

In [6]:
clean_data = data.drop(columns=useless_features)

In [7]:
X = clean_data.drop(columns=['P_HABITABLE'])
y = clean_data['P_HABITABLE']

In [8]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

In [9]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5599 entries, 0 to 5598
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   P_MASS             5592 non-null   float64
 1   P_RADIUS           5592 non-null   float64
 2   P_PERIOD           5350 non-null   float64
 3   P_SEMI_MAJOR_AXIS  5595 non-null   float64
 4   P_ECCENTRICITY     4822 non-null   float64
 5   P_INCLINATION      4288 non-null   float64
 6   P_OMEGA            1659 non-null   float64
 7   S_TYPE             2021 non-null   object 
 8   S_MAG              5380 non-null   float64
 9   S_DISTANCE         5578 non-null   float64
 10  S_TEMPERATURE      5380 non-null   float64
 11  S_MASS             5595 non-null   float64
 12  S_RADIUS           5367 non-null   float64
 13  S_METALLICITY      5166 non-null   float64
 14  S_AGE              4392 non-null   float64
 15  S_LOG_LUM          5364 non-null   float64
 16  S_LOG_G            5353 

In [10]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# hand j
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))

In [11]:
X_encoded.columns = encoder.get_feature_names_out(categorical_cols)

X_preprocessed = pd.concat([X[numerical_cols].reset_index(drop=True), X_encoded.reset_index(drop=True)], axis=1)

X_preprocessed.fillna(X_preprocessed.mean(), inplace=True)




In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [20]:
si = X_test.iloc[0]


In [21]:
si

P_MASS                3.780000
P_RADIUS              1.770000
P_PERIOD             14.667108
P_SEMI_MAJOR_AXIS     0.113000
P_ECCENTRICITY        0.000000
                       ...    
S_TYPE_TEMP_nan       0.000000
P_TYPE_TEMP_Cold      0.000000
P_TYPE_TEMP_Hot       1.000000
P_TYPE_TEMP_Warm      0.000000
P_TYPE_TEMP_nan       0.000000
Name: 4440, Length: 350, dtype: float64

In [25]:
y_pred = model.predict(X_train)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [30]:
y_pred = model.predict(X_train)

print("Acc:", accuracy_score(y_train, y_pred))



print("Classification Report:\n", classification_report(y_train, y_pred))



Acc: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4421
           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        34

    accuracy                           1.00      4479
   macro avg       1.00      1.00      1.00      4479
weighted avg       1.00      1.00      1.00      4479

Predictions: [0 0 0 0 0]


In [32]:
import joblib
joblib.dump(model, "model.pkl")



Predictions: [0 0 0 0 0 0 0 0 0]


In [35]:
new_data = X_test.iloc[470:490]
predictions = model.predict(new_data)
print("Predictions:", predictions)


Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [34]:
clean_data[clean_data['P_HABITABLE'] == 1]

Unnamed: 0,P_MASS,P_RADIUS,P_PERIOD,P_SEMI_MAJOR_AXIS,P_ECCENTRICITY,P_INCLINATION,P_OMEGA,S_TYPE,S_MAG,S_DISTANCE,...,P_TYPE,S_TYPE_TEMP,S_LUMINOSITY,S_SNOW_LINE,S_ABIO_ZONE,S_TIDAL_LOCK,P_HABZONE_OPT,P_HABZONE_CON,P_TYPE_TEMP,P_HABITABLE
480,0.818,0.953,27.80978,0.134,0.059,89.6,70.0,M2.5 V,13.151,31.1265,...,Terran,M,0.022909,0.408662,0.028274,0.286631,1,0,Warm,1
537,1.25,1.073,37.42396,0.1633,0.042,89.8,10.0,M2.5 V,13.151,31.1265,...,Terran,M,0.022909,0.408662,0.028274,0.286631,1,1,Warm,1
762,25.3,1.367,8.457463,0.03984,,89.287,,M6 V,18.0,32.4298,...,Terran,M,0.001442,0.102533,0.006541,0.245193,1,1,Warm,1
934,1.16,1.05,4.90634,0.0259,0.03,,338.0,M7.0 V,15.13,3.83078,...,Terran,M,0.000723,0.072588,0.008444,0.234823,1,0,Warm,1
935,1.05,1.02,11.416,0.0455,0.04,,301.0,M7.0 V,15.13,3.83078,...,Terran,M,0.000723,0.072588,0.008444,0.234823,1,1,Warm,1
1291,1.2,1.06,19.53527,0.0649,0.0,89.339,,,17.95,92.1913,...,Terran,M,0.005164,0.194028,0.015156,0.269791,1,0,Warm,1
1977,1.321,1.129,12.352446,0.04683,0.00208,89.742,191.34,,17.02,12.429889,...,Terran,M,0.000553,0.063513,0.003648,0.142824,1,1,Warm,1
2239,0.388,0.788,4.049219,0.02227,0.00837,89.896,-8.73,,17.02,12.429889,...,Subterran,M,0.000553,0.063513,0.003648,0.142824,1,0,Warm,1
2267,1.08,1.03,10.3465,0.0457,,,,M5.5 V,13.76,4.84867,...,Terran,M,0.001406,0.101243,0.00882,0.227551,1,1,Warm,1
2412,2.7,1.45,62.24,0.213,0.02,,28.6,M1.5 V,10.602,7.24396,...,Terran,M,0.013709,0.316129,0.021635,0.277482,1,1,Warm,1
