In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
## Load the dataset and discard columns/rows with too many missing values
df = pd.read_csv('./data/exoplanets-clean.csv')

# Number of missing values per column, as a plain dict keyed by column name.
null_col_vals = df.isnull().sum().to_dict()
NUM_ROWS = df.shape[0]

# A column is discarded once at least 30% of its rows are missing.
thresholdAmount = 0.3 * NUM_ROWS

# Columns at or above the threshold, in their original column order.
dropped = [key for key, null_count in null_col_vals.items() if null_count >= thresholdAmount]

# Remove the sparse columns first, then every row that still contains a
# missing value, so the surviving frame has no nulls at all.
df = df.drop(columns=dropped).dropna()

# Last expression of the cell: show the dtypes of the surviving columns.
df.dtypes

num_stars                    int64
single_planet_exosystem       bool
discoverymethod             object
disc_year                    int64
disc_facility               object
                            ...   
sy_kmagerr1                float64
sy_kmagerr2                float64
sy_gaiamag                 float64
sy_gaiamagerr1             float64
sy_gaiamagerr2             float64
Length: 63, dtype: object

In [3]:
# `single_planet_exosystem` is the label; every other column is a candidate feature.
target = df['single_planet_exosystem']
attr = df.drop(columns='single_planet_exosystem')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
attr_train, attr_test, target_train, target_test = train_test_split(
    attr,
    target,
    test_size=0.3,
    random_state=5,
)

# Only these columns reach the classifier: the ColumnTransformer below keeps
# its default `remainder`, which drops every column that is not listed.
categorical_columns = ['discoverymethod', 'disc_facility', 'pl_bmassprov', 'st_metratio']

# One-hot encode the categorical columns into a dense array. With
# handle_unknown='ignore', a category first seen at predict time encodes as
# all zeros instead of raising.
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', cat_encoder, categorical_columns)],
)

# NOTE(review): CategoricalNB is documented for ordinal-encoded category
# indices; here each one-hot indicator column is modelled as its own
# two-valued feature. Confirm this pairing is intended.
cnb = CategoricalNB()

# Chain preprocessing and the classifier so both are fit on the training split only.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', cnb),
    ]
)

# Train the model
model.fit(attr_train, target_train)

# Make predictions on the test set
target_pred = model.predict(attr_test)



In [4]:
# Preview the first five rows of the held-out feature frame.
attr_test.head(n=5)

Unnamed: 0,num_stars,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,...,sy_disterr2,sy_vmag,sy_vmagerr1,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2
2276,1,Transit,2016,Kepler,0,10.458434,5.3e-05,-5.3e-05,0.0,0.0845,...,-2.3055,14.665,0.08,-0.08,12.212,0.025,-0.025,14.494,0.000459,-0.000459
5255,1,Transit,2023,Transiting Exoplanet Survey Satellite (TESS),0,13.33668,1e-05,-1e-05,0.0,0.112,...,-7.223,13.251,0.08,-0.08,11.586,0.025,-0.025,13.0711,0.000337,-0.000337
3069,1,Transit,2014,Kepler,0,96.678988,0.000702,-0.000702,0.0,0.397,...,-10.082,14.876,0.195,-0.195,12.899,0.024,-0.024,14.5782,0.000273,-0.000273
2101,1,Transit,2016,Kepler,0,1.93156,2e-06,-2e-06,0.0,0.0299,...,-2.499,13.115,0.08,-0.08,11.362,0.02,-0.02,13.0239,0.000898,-0.000898
1343,1,Radial Velocity,2023,Calar Alto Observatory,0,36.116,0.027,-0.029,0.0,0.1417,...,-0.004455,11.311,0.026,-0.026,5.939,0.034,-0.034,9.87882,0.001708,-0.001708


In [5]:
# Score the held-out predictions: overall accuracy first, then the confusion
# matrix and the per-class precision/recall report. The metric functions come
# from the import cell at the top of the notebook.
accuracy = accuracy_score(target_test, target_pred)
print(f'Accuracy: {accuracy}')
print()
# Rows are the true labels, columns are the predicted labels.
print(confusion_matrix(target_test, target_pred))
print()
print('Classification Report')
print(classification_report(target_test, target_pred))

Accuracy: 0.654054054054054

[[112 291]
 [ 29 493]]

Classification Report
              precision    recall  f1-score   support

       False       0.79      0.28      0.41       403
        True       0.63      0.94      0.75       522

    accuracy                           0.65       925
   macro avg       0.71      0.61      0.58       925
weighted avg       0.70      0.65      0.61       925

