## Import the Necessary Stuff

In [95]:
import csv
import math
import os, os.path
import io
import sys
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from IPython.display import display

%matplotlib inline

### Streamline the reading of the CSV file (Thanks Dr. Munroe!)

In [3]:
csv_filename = "./Data/exoplanetarchive/candidates-20190613-TIDIED.csv"

# Build a lookup from each short column name (e.g. "koi_prad") to its
# human-readable label, parsed from the descriptive header lines of the CSV.
longnames = {}
with open(csv_filename, 'r') as f:
    lines = f.readlines()
    # Lines 8..22 (0-based) each describe one column as "<prefix><varname>: <label>".
    for line in lines[8:23]:
        # Split on the FIRST colon only: a label that itself contains a colon
        # would otherwise yield more than two parts and break the unpacking.
        varname, longname = line.strip().split(':', 1)
        # Drop the fixed 9-character line prefix (presumably "# COLUMN " -- confirm against the file).
        varname = varname[9:].strip()
        # Remove commas from the label and trim surrounding whitespace.
        longname = longname.replace(',', '').strip()
        print(varname, longname)
        longnames[varname] = longname

kepid KepID
kepoi_name KOI Name
koi_period Orbital Period [days]
koi_period_err1 Orbital Period Upper Unc. [days]
koi_prad Planetary Radius [Earth radii]
koi_prad_err1 Planetary Radius Upper Unc. [Earth radii]
koi_teq Equilibrium Temperature [K]
koi_insol Insolation Flux [Earth flux]
koi_insol_err1 Insolation Flux Upper Unc. [Earth flux]
koi_steff Stellar Effective Temperature [K]
koi_steff_err1 Stellar Effective Temperature Upper Unc. [K]
koi_slogg Stellar Surface Gravity [log10(cm/s**2)]
koi_slogg_err1 Stellar Surface Gravity Upper Unc. [log10(cm/s**2)]
koi_srad Stellar Radius [Solar radii]
koi_srad_err1 Stellar Radius Upper Unc. [Solar radii]


In [5]:
# Row 24 (0-based, as counted by pandas) supplies the column names; the rows
# before it are not read as data.
dataset = pd.read_csv(csv_filename, header=24)
dataset.head(n=5)

Unnamed: 0,kepid,kepoi_name,koi_period,koi_period_err1,koi_prad,koi_prad_err1,koi_teq,koi_insol,koi_insol_err1,koi_steff,koi_steff_err1,koi_slogg,koi_slogg_err1,koi_srad,koi_srad_err1
0,1025986,K07621.01,275.07365,0.03928,1.86,0.33,303.0,2.0,0.92,5604.0,84.0,4.226,0.208,1.187,0.207
1,1026957,K00958.01,21.761298,1.7e-05,2.53,0.16,486.0,13.18,3.0,4859.0,97.0,4.612,0.015,0.72,0.047
2,1161345,K00984.01,4.287467,1.5e-05,2.29,0.22,942.0,186.31,56.73,5296.0,105.0,4.576,0.015,0.815,0.08
3,1164301,K04921.01,95.688682,0.003599,28.41,1.13,1046.0,283.07,37.83,5040.0,60.0,2.681,0.029,13.056,0.52
4,1293379,K06253.01,27.556452,0.002928,1.4,0.54,531.0,18.86,22.4,5509.0,184.0,4.565,0.024,0.836,0.322


### Making the NEW dataset

In [7]:
# Work on an independent copy of the table with the identifier, orbital-period,
# stellar temperature/gravity and all uncertainty columns removed.
goldilocks_dataset = dataset.drop(
    columns=[
        "kepid",
        "koi_period", "koi_period_err1",
        "koi_prad_err1",
        "koi_insol_err1",   # koi_insol itself is deliberately kept
        "koi_steff", "koi_steff_err1",
        "koi_slogg", "koi_slogg_err1",
        "koi_srad_err1",    # koi_srad itself is deliberately kept
    ]
).copy()
goldilocks_dataset.head()

Unnamed: 0,kepoi_name,koi_prad,koi_teq,koi_insol,koi_srad
0,K07621.01,1.86,303.0,2.0,1.187
1,K00958.01,2.53,486.0,13.18,0.72
2,K00984.01,2.29,942.0,186.31,0.815
3,K04921.01,28.41,1046.0,283.07,13.056
4,K06253.01,1.4,531.0,18.86,0.836


#### Let's define the Goldilocks temperature function

In [44]:
def goldilocks_temp(t, t_min=273, t_max=373):
    """Classify a temperature as below, inside, or above a target range.

    Parameters
    ----------
    t : float
        Temperature to classify. The notebook passes ``koi_teq`` values,
        which are in kelvin.
    t_min : float, optional
        Lower bound of the target range (default 273; presumably the
        freezing point of water in kelvin).
    t_max : float, optional
        Upper bound of the target range (default 373; presumably the
        boiling point of water in kelvin).

    Returns
    -------
    int
        -1 if ``t`` is below ``t_min``, 1 if ``t`` is above ``t_max``,
        otherwise 0. Both bounds themselves count as inside the range.

    Notes
    -----
    NaN compares False against both bounds, so a missing temperature also
    returns 0; callers that care must filter missing values themselves.
    """
    if t < t_min:
        return -1
    if t > t_max:
        return 1
    return 0

#### Time to make sure that I understand the Pythonic way of making new columns, using the MAP function

In [84]:
# Label every row by applying goldilocks_temp element-wise to its equilibrium temperature.
goldilocks_dataset["gold_temp"] = goldilocks_dataset["koi_teq"].map(goldilocks_temp)

In [154]:
# Discard every row that has a missing value in any column.
goldilocks_dataset = goldilocks_dataset.dropna(axis="index", how="any")
goldilocks_dataset.head()

Unnamed: 0,kepoi_name,koi_prad,koi_teq,koi_insol,koi_srad,gold_temp
0,K07621.01,1.86,303.0,2.0,1.187,0
1,K00958.01,2.53,486.0,13.18,0.72,1
2,K00984.01,2.29,942.0,186.31,0.815,1
3,K04921.01,28.41,1046.0,283.07,13.056,1
4,K06253.01,1.4,531.0,18.86,0.836,1


In [155]:
# Target: the -1 / 0 / 1 temperature class.
goldY = goldilocks_dataset["gold_temp"]
# Features: everything except the KOI name, the temperature the label was
# derived from, and the label itself.
goldX = goldilocks_dataset.drop(columns=["kepoi_name", "koi_teq", "gold_temp"])

In [156]:
# Show the first and the last rows of the feature table, one after the other.
display(goldX.head(), goldX.tail())

Unnamed: 0,koi_prad,koi_insol,koi_srad
0,1.86,2.0,1.187
1,2.53,13.18,0.72
2,2.29,186.31,0.815
3,28.41,283.07,13.056
4,1.4,18.86,0.836


Unnamed: 0,koi_prad,koi_insol,koi_srad
2313,3.32,2.72,1.17
2314,0.89,29.06,1.076
2315,1.76,91.32,1.554
2316,1.62,5.85,0.79
2317,2.17,1.63,0.736


In [145]:
# Peek at the first five labels.
goldY.head(n=5)

0    0
1    1
2    1
3    1
4    1
Name: gold_temp, dtype: int64

In [148]:
# Take a separate copy of goldX's current column labels to experiment with.
testCols = goldX.columns.copy()

In [113]:
# Look up the long label for each column name in testCols, keeping their order.
newCols = [longnames[short_name] for short_name in testCols]
newCols

['Planetary Radius [Earth radii]',
 'Insolation Flux [Earth flux]',
 'Stellar Radius [Solar radii]']

Can I do the above two cells in one step?

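(NOTE: Before re-running the code below, re-run the 8th code cell — the one that defines `goldX` and `goldY` — so that the columns of `goldX` have their short names again.)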

In [158]:
# Replace the short archive column names with their human-readable labels.
# .get(x, x) leaves a name unchanged when it is not a key of `longnames`, so
# re-running this cell is harmless: on a second run the columns are already
# long labels and pass through, whereas longnames[x] would raise KeyError.
goldX.columns = [longnames.get(x, x) for x in goldX.columns]
goldX.head()

Unnamed: 0,Planetary Radius [Earth radii],Insolation Flux [Earth flux],Stellar Radius [Solar radii]
0,1.86,2.0,1.187
1,2.53,13.18,0.72
2,2.29,186.31,0.815
3,28.41,283.07,13.056
4,1.4,18.86,0.836


HUZZAH!

#### Let's split the data into the TEST and TRAIN data

In [103]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and test subsets; the fixed
# random_state makes the split the same on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    goldX,
    goldY,
    random_state=1,
)

#### Let's do a GAUSSIAN Naive-Bayes test

In [106]:
from sklearn.naive_bayes import GaussianNB

# Create the classifier and fit it on the training split in one expression;
# fit() returns the estimator itself, so `model` is the fitted GaussianNB.
model = GaussianNB().fit(Xtrain, ytrain)

# Predicted class labels for the held-out test features.
y_model = model.predict(Xtest)

In [107]:
from sklearn.metrics import accuracy_score

# Compare the true test labels against the model's predictions.
accuracy_score(y_true=ytest, y_pred=y_model)

# THIS GIVES ME THE ACCURACY SCORE OF THE ALGORITHM

0.9585492227979274