<a href="https://colab.research.google.com/github/joehawkens/MachineLearning/blob/main/FINAL_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EXOPLANET PREDICTION MODEL
**GOAL: To predict if astronomical objects are exoplanets using data collected from the Kepler Space Observatory telescope.**

- Source: https://www.kaggle.com/datasets/nasa/kepler-exoplanet-search-results
- Data: https://raw.githubusercontent.com/joehawkens/MachineLearning/main/exoplanets.csv

# DATA CLEAN

### Checking the columns:
- Total Rows: 9,564

In [None]:
import pandas as pd

# Load the Kepler Objects of Interest table from the project repository.
exoplanet_data = pd.read_csv('https://raw.githubusercontent.com/joehawkens/MachineLearning/main/exoplanets.csv')

# Non-null count and dtype of every column, kept for ad-hoc inspection.
row_counts = exoplanet_data.count()
data_types = exoplanet_data.dtypes

# Convert the categorical target variable to numerical labels.
# `Series.map` builds an integer column directly; `replace` on a string column
# depends on implicit downcasting, which recent pandas releases deprecate.
exoplanet_data['koi_disposition'] = exoplanet_data['koi_disposition'].map({
    'CONFIRMED': 1,
    'FALSE POSITIVE': 0,
    'CANDIDATE': 2
})

# Class balance of the encoded target:
#   0 = FALSE POSITIVE - 5023 rows
#   1 = CONFIRMED      - 2293 rows
#   2 = CANDIDATE      - 2248 rows
# Open question: should CONFIRMED and CANDIDATE be combined into one class?
exoplanet_data['koi_disposition'].value_counts()

0    5023
1    2293
2    2248
Name: koi_disposition, dtype: int64

### Dropping unnecessary columns:
(KOI = Kepler Object of Interest)
- rowid - Identifier
- kepid - Identifier
- kepoi_name - Identifier
- kepler_name - Identifier
- koi_teq_err1 - no data
- koi_teq_err2 - no data
- koi_pdisposition - same as disposition
- koi_tce_plnt_num - Identifier
- koi_tce_delivname - Identifier

In [None]:
# Drop unnecessary columns: identifiers, the koi_teq error columns (listed
# above as holding no data) and koi_pdisposition (listed above as the same as
# the disposition). The target column koi_disposition is deliberately kept:
# it is not one of the unnecessary columns, and without it the cleaned frame
# cannot be used for modelling.
columns_to_drop = ['rowid', 'kepid', 'kepoi_name', 'kepler_name',
                   'koi_pdisposition', 'koi_teq_err1', 'koi_teq_err2',
                   'koi_tce_plnt_num', 'koi_tce_delivname']

exoplanets = exoplanet_data.drop(columns=columns_to_drop)



In [None]:
# Optional sanity check (disabled): non-null count for each column kept in `exoplanets`.
#print(exoplanets.count())

# DATA EXPLORATION - Do Not Run

In [18]:
# Tally the NaN entries in every column of the exploration frame.
# NOTE(review): `exploration_data` is created by the correlation-matrix cell
# that appears further down in this section, so that cell has to be executed
# before this one.
missing_values = exploration_data.isnull().sum()
print(missing_values)

# About 7,000 rows after dropping 'CANDIDATE' classification.

koi_disposition      0
koi_fpflag_nt        0
koi_fpflag_ss        0
koi_fpflag_co        0
koi_fpflag_ec        0
koi_period           0
koi_period_err1      0
koi_period_err2      0
koi_time0bk          0
koi_time0bk_err1     0
koi_time0bk_err2     0
koi_impact           0
koi_impact_err1      0
koi_impact_err2      0
koi_duration         0
koi_duration_err1    0
koi_duration_err2    0
koi_depth            0
koi_depth_err1       0
koi_depth_err2       0
koi_prad             0
koi_prad_err1        0
koi_prad_err2        0
koi_teq              0
koi_insol            0
koi_insol_err1       0
koi_insol_err2       0
koi_model_snr        0
koi_steff            0
koi_steff_err1       0
koi_steff_err2       0
koi_slogg            0
koi_slogg_err1       0
koi_slogg_err2       0
koi_srad             0
koi_srad_err1        0
koi_srad_err2        0
ra                   0
dec                  0
koi_kepmag           0
dtype: int64


In [3]:
import altair as alt
import pandas as pd

# Reload the raw table so this exploration section does not depend on frames
# built in the cleaning section.
exploration_data = pd.read_csv('https://raw.githubusercontent.com/joehawkens/MachineLearning/main/exoplanets.csv')
columns_to_drop = ['rowid', 'koi_score', 'kepid', 'kepoi_name', 'kepler_name', 'koi_teq_err1', 'koi_teq_err2', 'koi_pdisposition', 'koi_tce_plnt_num', 'koi_tce_delivname']

exploration_data = exploration_data.drop(columns_to_drop, axis=1)

# Encode the target as integers. `Series.map` yields an integer column
# directly; `replace` on a string column depends on implicit downcasting,
# which recent pandas releases deprecate, and `corr()` below needs a numeric
# target column.
exploration_data['koi_disposition'] = exploration_data['koi_disposition'].map({
    'CONFIRMED': 1,
    'FALSE POSITIVE': 0,
    'CANDIDATE': 2
})

# Keep only the two settled classes (0 = FALSE POSITIVE, 1 = CONFIRMED).
exploration_data = exploration_data[exploration_data['koi_disposition'] != 2]

# Pairwise correlation scores between all remaining columns.
correlation_matrix = exploration_data.corr()

# Reset the index so the row labels become a regular 'index' column.
correlation_matrix = correlation_matrix.reset_index()

# Convert the correlation matrix to long format: one row per (index, variable) pair.
correlation_long = correlation_matrix.melt(id_vars='index', var_name='variable', value_name='correlation')

# Create the correlation heatmap using Altair.
heatmap = alt.Chart(correlation_long).mark_rect().encode(
    x='index:O',
    y='variable:O',
    color='correlation:Q'
).properties(
    width=500,
    height=500,
    title='Correlation Matrix'
)

heatmap

Feature importance, estimated first with a gradient boosting classifier and then with a random forest classifier.

In [20]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

# Split the exploration frame into the feature matrix and the target vector.
X = exploration_data.drop('koi_disposition', axis=1)
y = exploration_data['koi_disposition']

# Impute missing values with each column's median before fitting:
# GradientBoostingClassifier raises on NaN inputs. Assigning the result
# (instead of inplace=True) keeps `exploration_data` itself untouched.
X = X.fillna(X.median())

# A fixed seed makes the importance ranking reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X, y)
importances = gb.feature_importances_

# One row per feature, most important first.
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print feature importances
print(feature_importances)


              Feature  Importance
28     koi_steff_err1    0.211354
2       koi_fpflag_co    0.210333
1       koi_fpflag_ss    0.176267
0       koi_fpflag_nt    0.136200
19           koi_prad    0.121460
29     koi_steff_err2    0.043636
21      koi_prad_err2    0.026263
3       koi_fpflag_ec    0.023460
14  koi_duration_err1    0.018743
15  koi_duration_err2    0.008637
26      koi_model_snr    0.004716
20      koi_prad_err1    0.003324
4          koi_period    0.003070
7         koi_time0bk    0.002724
24     koi_insol_err1    0.001741
32     koi_slogg_err2    0.001541
16          koi_depth    0.000874
34      koi_srad_err1    0.000794
37                dec    0.000495
31     koi_slogg_err1    0.000494
18     koi_depth_err2    0.000494
13       koi_duration    0.000470
36                 ra    0.000364
6     koi_period_err2    0.000347
17     koi_depth_err1    0.000327
27          koi_steff    0.000292
10         koi_impact    0.000273
23          koi_insol    0.000248
9    koi_time0

In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Split the exploration frame into the feature matrix and the target vector.
X = exploration_data.drop('koi_disposition', axis=1)
y = exploration_data['koi_disposition']

# Fill missing values with each column's median. This has to happen AFTER X is
# built: filling first fails on a fresh kernel (X does not exist yet), and the
# fill would be thrown away anyway once X is rebuilt from the unfilled frame.
X = X.fillna(X.median())

# A fixed seed makes the importance ranking reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_

# One row per feature, most important first.
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print feature importances
print(feature_importances)


              Feature  Importance
2       koi_fpflag_co    0.132614
28     koi_steff_err1    0.093164
1       koi_fpflag_ss    0.074577
29     koi_steff_err2    0.068958
0       koi_fpflag_nt    0.068450
20      koi_prad_err1    0.055784
19           koi_prad    0.055331
15  koi_duration_err2    0.045553
3       koi_fpflag_ec    0.038252
21      koi_prad_err2    0.035422
14  koi_duration_err1    0.030816
4          koi_period    0.023524
8    koi_time0bk_err1    0.022360
9    koi_time0bk_err2    0.021565
26      koi_model_snr    0.019050
23          koi_insol    0.017799
34      koi_srad_err1    0.017432
22            koi_teq    0.016214
10         koi_impact    0.015499
24     koi_insol_err1    0.013661
17     koi_depth_err1    0.012649
5     koi_period_err1    0.012591
18     koi_depth_err2    0.012166
16          koi_depth    0.012115
25     koi_insol_err2    0.011297
6     koi_period_err2    0.008819
32     koi_slogg_err2    0.007789
7         koi_time0bk    0.007604
13       koi_d

## TARGET FEATURE: - koi_disposition - classification of astronomical object.
## Features:

Positive Correlation:
- koi_score            ** 0.976616 (near-perfect correlation with the target; only use if the model performs poorly without it)
- koi_steff_err2       ** 0.378062
- koi_slogg_err2       ** 0.308141

Negative Correlation:
- koi_fpflag_co       ** -0.391005
- koi_fpflag_ss       ** -0.423874
- koi_steff_err1      ** -0.437383
- koi_fpflag_nt       ** -0.371595

Selected by feature importance rather than correlation:
- koi_prad ** about 5% importance in the random forest ranking

# MODEL

In [7]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Get the dataset:
model_data = pd.read_csv('https://raw.githubusercontent.com/joehawkens/MachineLearning/main/exoplanets.csv')
# NOTE(review): the feature notes above report a 0.98 correlation between
# koi_score and the target; confirm that including it is intended.
selected_features = ['koi_score', 'koi_steff_err2', 'koi_slogg_err2', 'koi_fpflag_co', 'koi_fpflag_ss', 'koi_steff_err1', 'koi_fpflag_nt', 'koi_prad']

# Clean the data:

# Fill missing numeric values with the column median. numeric_only=True is
# required because the frame still contains text columns at this point (the
# disposition labels are encoded only below); without it pandas warns on
# older versions and raises TypeError on pandas >= 2.0.
model_data = model_data.fillna(model_data.median(numeric_only=True))

# Encode the target as integers. `Series.map` yields an integer column
# directly, which scikit-learn accepts as class labels; `replace` on a string
# column depends on implicit downcasting that recent pandas releases deprecate.
model_data['koi_disposition'] = model_data['koi_disposition'].map({
    'CONFIRMED': 1,
    'FALSE POSITIVE': 0,
    'CANDIDATE': 2
})

# Keep only the two settled classes (0 = FALSE POSITIVE, 1 = CONFIRMED).
model_data = model_data[model_data['koi_disposition'] != 2]


# Only the selected features and target are in the df.
data = model_data[selected_features + ['koi_disposition']]


# Split the data into training and testing sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    data[selected_features],
    data['koi_disposition'],
    test_size=0.2,
    random_state=42,
)

# Create the gradient boost model; the fixed seed makes the reported accuracy
# reproducible from run to run.
gb = GradientBoostingClassifier(random_state=42)

# Train the model
gb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb.predict(X_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


  model_data = model_data.fillna(model_data.median())


Accuracy: 98.63%
