In [91]:
#Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import tensorflow as tf

# Import findspark and initialize. 
import findspark
findspark.init()

# Start Spark session
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Asteroid DataFrame").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [92]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/kat-project4-csv-site.click/dataset.csv"
spark.sparkContext.addFile(url)
spark_df = spark.read.csv(SparkFiles.get("dataset.csv"), sep=",", header=True)

#This takes about 3 minutes to load in.

In [93]:
# Show DataFrame
spark_df.show(10)

+--------+-------+--------------+----+-------+------+---+---+----+--------+------+--------------+--------+---------+---------+----------------+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+--------------------+----------------+-----------------+----------------+-------+------------+----------+----------+----------+---------+---------+---------+---------+----------+----------+---------+---------+-----+------+
|      id|  spkid|     full_name|pdes|   name|prefix|neo|pha|   H|diameter|albedo|diameter_sigma|orbit_id|    epoch|epoch_mjd|       epoch_cal|equinox|                 e|                a|                q|                i|               om|                w|               ma|               ad|                n|                  tp|          tp_cal|              per|           per_y|   moid|     moid_ld|   sigma_e|   sigma_a|   sigma_q|  sigma_i| sigma_om|  

In [94]:
#review all the column names
spark_df.columns

['id',
 'spkid',
 'full_name',
 'pdes',
 'name',
 'prefix',
 'neo',
 'pha',
 'H',
 'diameter',
 'albedo',
 'diameter_sigma',
 'orbit_id',
 'epoch',
 'epoch_mjd',
 'epoch_cal',
 'equinox',
 'e',
 'a',
 'q',
 'i',
 'om',
 'w',
 'ma',
 'ad',
 'n',
 'tp',
 'tp_cal',
 'per',
 'per_y',
 'moid',
 'moid_ld',
 'sigma_e',
 'sigma_a',
 'sigma_q',
 'sigma_i',
 'sigma_om',
 'sigma_w',
 'sigma_ma',
 'sigma_ad',
 'sigma_n',
 'sigma_tp',
 'sigma_per',
 'class',
 'rms']

In [95]:
#Make a copy of the spark DF for later use. 
spark_df_copy = spark_df

#Drop unnecessary columns that don't determine the hazardousness of an asteroid
spark_df = spark_df.drop('spkid','pdes','full_name','name','prefix','orbit_id','equinox')

spark_df.show(10)

+--------+---+---+----+--------+------+--------------+---------+---------+----------------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+--------------------+----------------+-----------------+----------------+-------+------------+----------+----------+----------+---------+---------+---------+---------+----------+----------+---------+---------+-----+------+
|      id|neo|pha|   H|diameter|albedo|diameter_sigma|    epoch|epoch_mjd|       epoch_cal|                 e|                a|                q|                i|               om|                w|               ma|               ad|                n|                  tp|          tp_cal|              per|           per_y|   moid|     moid_ld|   sigma_e|   sigma_a|   sigma_q|  sigma_i| sigma_om|  sigma_w| sigma_ma|  sigma_ad|   sigma_n| sigma_tp|sigma_per|class|   rms|
+--------+---+---+----+--------+------+-------

In [96]:
#drop rows with null values
spark_df = spark_df.na.drop("any")

In [97]:
#Convert the spark df to pandas
df = spark_df.select("*").toPandas()

In [98]:
#display the pandas DF
df.head()

Unnamed: 0,id,neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
0,a0000001,N,N,3.4,939.4,0.09,0.2,2458600.5,58600,20190427.0,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
1,a0000002,N,N,4.2,545.0,0.101,18.0,2459000.5,59000,20200531.0,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
2,a0000003,N,N,5.33,246.596,0.214,10.594,2459000.5,59000,20200531.0,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
3,a0000004,N,N,3.0,525.4,0.4228,0.2,2458600.5,58600,20190427.0,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
4,a0000005,N,N,6.9,106.699,0.274,3.14,2459000.5,59000,20200531.0,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


In [99]:
#check number of entries in the dataset
len(df.index)

131142

In [100]:
#set first column as the index
df = df.set_index('id')

In [101]:
#review datatypes
df.dtypes

neo               object
pha               object
H                 object
diameter          object
albedo            object
diameter_sigma    object
epoch             object
epoch_mjd         object
epoch_cal         object
e                 object
a                 object
q                 object
i                 object
om                object
w                 object
ma                object
ad                object
n                 object
tp                object
tp_cal            object
per               object
per_y             object
moid              object
moid_ld           object
sigma_e           object
sigma_a           object
sigma_q           object
sigma_i           object
sigma_om          object
sigma_w           object
sigma_ma          object
sigma_ad          object
sigma_n           object
sigma_tp          object
sigma_per         object
class             object
rms               object
dtype: object

In [102]:
df.columns

Index(['neo', 'pha', 'H', 'diameter', 'albedo', 'diameter_sigma', 'epoch',
       'epoch_mjd', 'epoch_cal', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'ad',
       'n', 'tp', 'tp_cal', 'per', 'per_y', 'moid', 'moid_ld', 'sigma_e',
       'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w', 'sigma_ma',
       'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per', 'class', 'rms'],
      dtype='object')

In [103]:
#convert numerical columns to floats
df = df.astype({'H':"float", 'diameter': "float", 'albedo':"float", 'diameter_sigma':"float", 'epoch':"float",
       'epoch_mjd':"float", 'epoch_cal':"float", 'e':"float", 'a':"float", 'q':"float", 'i':"float", 'om':"float", 'w':"float", 'ma':"float", 'ad':"float",
       'n':"float", 'tp':"float", 'tp_cal':"float", 'per':"float", 'per_y':"float", 'moid':"float", 'moid_ld':"float", 'sigma_e':"float",
       'sigma_a':"float", 'sigma_q':"float", 'sigma_i':"float", 'sigma_om':"float", 'sigma_w':"float", 'sigma_ma':"float",
       'sigma_ad':"float", 'sigma_n':"float", 'sigma_tp':"float", 'sigma_per':"float", 'rms':"float"})

In [104]:
#Encode neo and pha N and Y values as 0s and 1s

# Create a function to encode the N and Y values
def encode(value):
    if value =="Y":
        return 1
    else:
        return 0

# Call the encode_function on the neo and pha columns
df["neo"] = df["neo"].apply(encode)
df["pha"] = df["pha"].apply(encode)

#Review the df
df.head()

Unnamed: 0_level_0,neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a0000001,0,0,3.4,939.4,0.09,0.2,2458600.5,58600.0,20190427.0,0.076009,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
a0000002,0,0,4.2,545.0,0.101,18.0,2459000.5,59000.0,20200531.0,0.229972,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
a0000003,0,0,5.33,246.596,0.214,10.594,2459000.5,59000.0,20200531.0,0.256936,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
a0000004,0,0,3.0,525.4,0.4228,0.2,2458600.5,58600.0,20190427.0,0.088721,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
a0000005,0,0,6.9,106.699,0.274,3.14,2459000.5,59000.0,20200531.0,0.190913,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


In [105]:
df['class'].unique()

array(['MBA', 'OMB', 'MCA', 'AMO', 'IMB', 'TJN', 'APO', 'ATE', 'CEN',
       'AST', 'TNO'], dtype=object)

In [106]:
#Encode class values as numbers

# Create a function to encode the N and Y values
def encode2(value):
    if value =="MBA":
        return 0
    elif value =="OMB":
        return 1
    elif value =="MCA":
        return 2
    elif value =="AMO":
        return 3
    elif value =="IMB":
        return 4
    elif value =="TJN":
        return 5
    elif value =="APO":
        return 6
    elif value =="ATE":
        return 7
    elif value =="CEN":
        return 8
    elif value =="AST":
        return 9
    else:
        return 10

# Call the encode_function on the neo and pha columns
df["class"] = df["class"].apply(encode2)

#Review the df
df.head()

Unnamed: 0_level_0,neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a0000001,0,0,3.4,939.4,0.09,0.2,2458600.5,58600.0,20190427.0,0.076009,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,0,0.43301
a0000002,0,0,4.2,545.0,0.101,18.0,2459000.5,59000.0,20200531.0,0.229972,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,0,0.35936
a0000003,0,0,5.33,246.596,0.214,10.594,2459000.5,59000.0,20200531.0,0.256936,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,0,0.33848
a0000004,0,0,3.0,525.4,0.4228,0.2,2458600.5,58600.0,20190427.0,0.088721,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,0,0.3998
a0000005,0,0,6.9,106.699,0.274,3.14,2459000.5,59000.0,20200531.0,0.190913,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,0,0.52191


In [107]:
# Remove target (pha) from features data
y = df.pha.values
X = df.drop(columns="pha").values


# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [108]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

First Model

In [109]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=36, activation="relu", input_dim=36))
nn_model.add(tf.keras.layers.Dense(units=16, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=10)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1025/1025 - 2s - loss: 0.0104 - accuracy: 0.9986 - 2s/epoch - 2ms/step
Loss: 0.01043011900037527, Accuracy: 0.9986274838447571


Notes: Our first model was very accurate (99.86% accuracy). Future models will see if we can achieve the same accuaracy, with less data. 

Optimization Attempt #1

In [110]:
#Copy the processed df
df_no_sigma = df.copy()

#remove all "simgma" columns from the DF
df_no_sigma = df_no_sigma.drop(columns=['diameter_sigma','sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w','sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per'])
df_no_sigma.head()

Unnamed: 0_level_0,neo,pha,H,diameter,albedo,epoch,epoch_mjd,epoch_cal,e,a,...,ad,n,tp,tp_cal,per,per_y,moid,moid_ld,class,rms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a0000001,0,0,3.4,939.4,0.09,2458600.5,58600.0,20190427.0,0.076009,2.769165,...,2.979647,0.213885,2458239.0,20180430.0,1683.145703,4.608202,1.59478,620.640533,0,0.43301
a0000002,0,0,4.2,545.0,0.101,2459000.5,59000.0,20200531.0,0.229972,2.773841,...,3.411748,0.213345,2458321.0,20180720.0,1687.410992,4.61988,1.23429,480.348639,0,0.35936
a0000003,0,0,5.33,246.596,0.214,2459000.5,59000.0,20200531.0,0.256936,2.668285,...,3.353865,0.226129,2458446.0,20181120.0,1592.013769,4.358696,1.03429,402.514639,0,0.33848
a0000004,0,0,3.0,525.4,0.4228,2458600.5,58600.0,20190427.0,0.088721,2.361418,...,2.570926,0.271609,2458248.0,20180510.0,1325.432763,3.628837,1.13948,443.451432,0,0.3998
a0000005,0,0,6.9,106.699,0.274,2459000.5,59000.0,20200531.0,0.190913,2.574037,...,3.065455,0.238661,2458926.0,20200320.0,1508.414421,4.129814,1.09575,426.433027,0,0.52191


In [111]:
# Remove target (pha) from features data
y = df_no_sigma.pha.values
X = df_no_sigma.drop(columns="pha").values


# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [112]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [113]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=24, activation="relu", input_dim=24))
nn_model.add(tf.keras.layers.Dense(units=12, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=10)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1025/1025 - 2s - loss: 0.0129 - accuracy: 0.9986 - 2s/epoch - 2ms/step
Loss: 0.012885482050478458, Accuracy: 0.9986274838447571


Notes: Sigma values in the data set don't impact the accuracy of training in determining if an asteroid is a potential hazard. 

Optimization Attempt #2

In [114]:
#verifying that pha (potentialy hazardous asteroid) and neo (near earth object are not a total 1 to 1 relationship)
print(df['pha'].value_counts())

print(df['neo'].value_counts())

pha
0    130961
1       181
Name: count, dtype: int64
neo
0    130403
1       739
Name: count, dtype: int64


In [115]:
df_no_sigma.columns

Index(['neo', 'pha', 'H', 'diameter', 'albedo', 'epoch', 'epoch_mjd',
       'epoch_cal', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'ad', 'n', 'tp',
       'tp_cal', 'per', 'per_y', 'moid', 'moid_ld', 'class', 'rms'],
      dtype='object')

In [116]:
#copy df with sigma coplumns removed
df3 = df_no_sigma.copy()

#remove additional columns of data
df3 = df3.drop(columns=['albedo', 'epoch', "epoch_mjd",	"epoch_cal", "e", "a", "q", "i", "om", "w", "ma", "ad", "n", "tp", "tp_cal",	"per", "per_y", "moid_ld"])

#display the new df
df3.head()


Unnamed: 0_level_0,neo,pha,H,diameter,moid,class,rms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a0000001,0,0,3.4,939.4,1.59478,0,0.43301
a0000002,0,0,4.2,545.0,1.23429,0,0.35936
a0000003,0,0,5.33,246.596,1.03429,0,0.33848
a0000004,0,0,3.0,525.4,1.13948,0,0.3998
a0000005,0,0,6.9,106.699,1.09575,0,0.52191


In [117]:
# Remove target (pha) from features data
y = df3.pha.values
X = df3.drop(columns="pha").values


# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [118]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [119]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=6, activation="relu", input_dim=6))
nn_model.add(tf.keras.layers.Dense(units=3, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=10)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1025/1025 - 3s - loss: 0.0024 - accuracy: 0.9989 - 3s/epoch - 3ms/step
Loss: 0.002432210138067603, Accuracy: 0.9989019632339478


Less information about the asteroid actually increased the accuracy of the model from 99.86% to 99.90%

Optimization Attempt #3

In [120]:
df4 = df3.copy()

df4= df4.drop(columns=['moid', 'class', 'rms']) 

df4.head()

Unnamed: 0_level_0,neo,pha,H,diameter
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a0000001,0,0,3.4,939.4
a0000002,0,0,4.2,545.0
a0000003,0,0,5.33,246.596
a0000004,0,0,3.0,525.4
a0000005,0,0,6.9,106.699


In [121]:
# Remove target (pha) from features data
y = df4.pha.values
X = df4.drop(columns="pha").values


# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [122]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [123]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=3, activation="relu", input_dim=3))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=10)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1025/1025 - 2s - loss: 0.0039 - accuracy: 0.9982 - 2s/epoch - 2ms/step
Loss: 0.0038928284775465727, Accuracy: 0.9982004761695862


Accuracy went back down to our starting point. Removing moid, class, and rms did help determine which asteroids are hazardous and which are not.

Final Optimization Attempt

Df3 gave us the most accurate Model. In this final attempt, we will recreate the df only using these columns and removing null values from these columns. This will ideally give us more data to train the model with. 

In [124]:
#Take the copied spark DF and remove all but key columns from DF3

spark_df_copy = spark_df_copy.drop('spkid', 'full_name', 'pdes', 'name', 'prefix', 'albedo', 'diameter_sigma', 'orbit_id', 'epoch',
       'epoch_mjd', 'epoch_cal', 'equinox', 'e', 'a', 'q', 'i', 'om', 'w',
       'ma', 'ad', 'n', 'tp', 'tp_cal', 'per', 'per_y', 'moid_ld',
       'sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w',
       'sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per')

In [125]:
#drop null values from the DF
df5 = spark_df_copy.na.drop("any")

In [126]:
#Convert to a pandas DF
df5 = df5.select("*").toPandas()

#show the DF
df5.head()

Unnamed: 0,id,neo,pha,H,diameter,moid,class,rms
0,a0000001,N,N,3.4,939.4,1.59478,MBA,0.43301
1,a0000002,N,N,4.2,545.0,1.23429,MBA,0.35936
2,a0000003,N,N,5.33,246.596,1.03429,MBA,0.33848
3,a0000004,N,N,3.0,525.4,1.13948,MBA,0.3998
4,a0000005,N,N,6.9,106.699,1.09575,MBA,0.52191


In [127]:
#There are only a few 1000 more cases so this may not impact accuracy much
len(df5.index)

132045

In [128]:
#set first column as the index
df5 = df5.set_index('id')

In [129]:
#convert numerical columns to floats
df5 = df5.astype({'H':"float", 'diameter': "float", 'moid':"float", 'rms':"float"})

In [130]:
#Encode neo and pha N and Y values as 0s and 1s

# Call the encode_function on the neo and pha columns
df5["neo"] = df5["neo"].apply(encode)
df5["pha"] = df5["pha"].apply(encode)

#Review the df
df5.head()

Unnamed: 0_level_0,neo,pha,H,diameter,moid,class,rms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a0000001,0,0,3.4,939.4,1.59478,MBA,0.43301
a0000002,0,0,4.2,545.0,1.23429,MBA,0.35936
a0000003,0,0,5.33,246.596,1.03429,MBA,0.33848
a0000004,0,0,3.0,525.4,1.13948,MBA,0.3998
a0000005,0,0,6.9,106.699,1.09575,MBA,0.52191


In [131]:
#Encode class values as numbers

# Call the encode_function on the neo and pha columns
df5["class"] = df5["class"].apply(encode2)

#Review the df
df5.head()

Unnamed: 0_level_0,neo,pha,H,diameter,moid,class,rms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a0000001,0,0,3.4,939.4,1.59478,0,0.43301
a0000002,0,0,4.2,545.0,1.23429,0,0.35936
a0000003,0,0,5.33,246.596,1.03429,0,0.33848
a0000004,0,0,3.0,525.4,1.13948,0,0.3998
a0000005,0,0,6.9,106.699,1.09575,0,0.52191


In [132]:
# Remove target (pha) from features data
y = df5.pha.values
X = df5.drop(columns="pha").values


# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [133]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [134]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=6, activation="relu", input_dim=6))
nn_model.add(tf.keras.layers.Dense(units=3, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train, y_train, epochs=10)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1032/1032 - 3s - loss: 0.0020 - accuracy: 0.9992 - 3s/epoch - 3ms/step
Loss: 0.0019521883223205805, Accuracy: 0.999212384223938


Model Accuracy is back up to 99.9% with the 6 features from df3 and slightly more data for training.