In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [32]:
## Load the cleaned exoplanet table and prepare it for modelling
df = pd.read_csv('./data/exoplanets-clean.csv')

# Row count of the raw table and the number of missing values in each column.
NUM_ROWS = len(df)
null_col_vals = df.isna().sum().to_dict()

# A column is discarded once at least 30% of its rows are missing.
thresholdAmount = 0.3 * NUM_ROWS
dropped = [
    col for col, n_missing in null_col_vals.items()
    if n_missing >= thresholdAmount
]

# Remove the sparse columns first, then every row that still contains a gap.
df = df.drop(columns=dropped).dropna()

# dummy encoding for categorical data (the first level of each category is
# dropped so the indicator columns are not perfectly collinear)
df = pd.get_dummies(df, drop_first=True)

df.head()

Unnamed: 0,num_stars,single_planet_exosystem,disc_year,pl_controv_flag,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,pl_orbsmaxlim,...,disc_facility_TrES,disc_facility_Transiting Exoplanet Survey Satellite (TESS),disc_facility_United Kingdom Infrared Telescope,disc_facility_W. M. Keck Observatory,disc_facility_WASP-South,disc_facility_XO,pl_bmassprov_Mass,pl_bmassprov_Msin(i)/sin(i),pl_bmassprov_Msini,st_metratio_[M/H]
30,3,True,2015,0,11688.0,6209.25,-3287.25,0.0,13.2,0.0,...,False,False,False,False,False,False,True,False,False,True
32,2,False,1996,0,14.6516,0.0001,-0.0001,0.0,0.1134,0.0,...,False,False,False,False,False,False,True,False,False,False
35,2,False,2004,0,0.736547,1e-06,-1e-06,0.0,0.01544,0.0,...,False,False,False,False,False,False,True,False,False,False
52,1,False,2020,0,8.46308,6e-05,-6e-05,0.0,0.0649,0.0,...,False,True,False,False,False,False,True,False,False,False
53,1,False,2021,0,18.85969,8e-05,-8e-05,0.0,0.1108,0.0,...,False,True,False,False,False,False,True,False,False,False


In [33]:
# Separate the label column (single_planet_exosystem) from the feature columns.
target = df['single_planet_exosystem']
attr = df.drop(columns=target.name)

# Sanity check: both parts must have the same number of rows.
for part in (attr, target):
    print(part.shape)
attr.head()

(3082, 101)
(3082,)


Unnamed: 0,num_stars,disc_year,pl_controv_flag,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,pl_orbsmaxlim,pl_rade,...,disc_facility_TrES,disc_facility_Transiting Exoplanet Survey Satellite (TESS),disc_facility_United Kingdom Infrared Telescope,disc_facility_W. M. Keck Observatory,disc_facility_WASP-South,disc_facility_XO,pl_bmassprov_Mass,pl_bmassprov_Msin(i)/sin(i),pl_bmassprov_Msini,st_metratio_[M/H]
30,3,2015,0,11688.0,6209.25,-3287.25,0.0,13.2,0.0,13.4,...,False,False,False,False,False,False,True,False,False,True
32,2,1996,0,14.6516,0.0001,-0.0001,0.0,0.1134,0.0,13.9,...,False,False,False,False,False,False,True,False,False,False
35,2,2004,0,0.736547,1e-06,-1e-06,0.0,0.01544,0.0,1.875,...,False,False,False,False,False,False,True,False,False,False
52,1,2020,0,8.46308,6e-05,-6e-05,0.0,0.0649,0.0,3.957,...,False,True,False,False,False,False,True,False,False,False
53,1,2021,0,18.85969,8e-05,-8e-05,0.0,0.1108,0.0,2.522,...,False,True,False,False,False,False,True,False,False,False


In [34]:
# Preprocess the dataset and divide into train and test

# Encode the target column as integer class labels.
le = LabelEncoder()
target = le.fit_transform(df['single_planet_exosystem'])

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used for training (data leakage) and
# make the reported test score optimistic.
attr_train, attr_test, target_train, target_test = train_test_split(
    attr, target, test_size=0.2, random_state=0)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
# `attr` itself is intentionally left unscaled so this cell can be re-run
# without compounding transformations.
sc = StandardScaler()
attr_train = sc.fit_transform(attr_train)
attr_test = sc.transform(attr_test)

In [None]:
# Apply LDA
# NOTE: this cell overwrites attr_train / attr_test with their LDA
# projections (the classifier cell below consumes them under those names),
# so re-run the preprocessing cell before executing this one a second time.
lda = LinearDiscriminantAnalysis()
attr_train = lda.fit_transform(attr_train, target_train)
attr_test = lda.transform(attr_test)

# plot the scatter plot
# Only the first discriminant (column 0) is plotted; with a two-class target
# LDA yields a single axis, so the points are laid out along y = 0 and
# coloured by their class label.
fig, ax = plt.subplots()
ax.scatter(attr_train[:, 0], np.zeros_like(attr_train[:, 0]),
           c=target_train, cmap='rainbow')
ax.set(title='Training data projected onto the first linear discriminant',
       xlabel='LD1', yticks=[]);

IndexError: index 1 is out of bounds for axis 1 with size 1

In [36]:
# Classification using Random Forest
# Shallow trees (depth 2) with a fixed seed so the run is repeatable;
# fit() returns the estimator itself, so construction and training are chained.
classifier = RandomForestClassifier(max_depth=2, random_state=0).fit(
    attr_train, target_train)

# Predicted class labels for the held-out rows.
target_pred = classifier.predict(attr_test)

In [37]:
# Evaluate the predictions on the held-out rows, then report the results.
accuracy = accuracy_score(target_test, target_pred)
# scikit-learn convention: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(target_test, target_pred)

print('Accuracy:', accuracy)
print('Confusion Matrix:', conf_mat)
# Render any figure that is still pending.
plt.show()

Accuracy: 0.8411669367909238
Confusion Matrix: [[225  68]
 [ 30 294]]
