In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\joaog\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import matplotlib.pyplot as plt


# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [6]:
# Set features. This will also be used as your x values.
# removing the error2 columns

selected_features = df[['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co','koi_fpflag_ec'
                        ,'koi_period', 'koi_period_err1',  'koi_time0bk', 'koi_time0bk_err1'
                        ,'koi_impact', 'koi_impact_err1',  'koi_duration', 'koi_duration_err1'
                        ,'koi_depth', 'koi_depth_err1',    'koi_prad', 'koi_prad_err1'
                        ,'koi_teq', 'koi_insol', 'koi_insol_err1'
                        ,'koi_model_snr', 'koi_tce_plnt_num',   'koi_steff', 'koi_steff_err1'
                        ,'koi_slogg', 'koi_slogg_err1',  'koi_srad', 'koi_srad_err1'
                        ,'ra', 'dec','koi_kepmag']]

selected_features.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_time0bk,koi_time0bk_err1,koi_impact,...,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_slogg,koi_slogg_err1,koi_srad,koi_srad_err1,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,162.51384,0.00352,0.586,...,2,5455,81,4.467,0.064,0.927,0.105,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,175.850252,0.000581,0.969,...,1,5853,158,4.544,0.044,0.868,0.233,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,170.307565,0.000115,1.276,...,1,5805,157,4.564,0.053,0.791,0.201,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,171.59555,0.00113,0.701,...,1,6031,169,4.438,0.07,1.046,0.334,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,172.97937,0.0019,0.762,...,2,6046,189,4.486,0.054,0.972,0.315,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
#using the selected_features
X = selected_features.drop(["koi_disposition"], axis=1)
y = selected_features["koi_disposition"].values.reshape(-1, 1)
print(X.shape, y.shape)

(6991, 30) (6991, 1)


In [8]:
#Label-encode data set
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y).reshape(-1,1).ravel()
encoded_y

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([1, 2, 2, ..., 0, 2, 2])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, encoded_y,train_size=0.25,random_state=42)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [10]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# SVC Model



In [11]:
from sklearn.svm import SVC 
from sklearn.metrics import classification_report

model_SVC = SVC(kernel='linear')
model_SVC.fit(X_train_scaled, y_train)
prediction_SVC = model_SVC.predict(X_test_scaled)

In [12]:
print(f"Training Data SVC Score: {model_SVC.score(X_train_scaled, y_train)}")
print(f"Testing Data SVC Score: {model_SVC.score(X_test_scaled, y_test)}\n")
print(classification_report(y_test, prediction_SVC,
                            target_names=['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']))

Training Data SVC Score: 0.8334287349742415
Testing Data SVC Score: 0.8363844393592678

                precision    recall  f1-score   support

     CANDIDATE       0.67      0.66      0.66      1244
     CONFIRMED       0.70      0.70      0.70      1396
FALSE POSITIVE       0.98      1.00      0.99      2604

      accuracy                           0.84      5244
     macro avg       0.79      0.78      0.78      5244
  weighted avg       0.83      0.84      0.84      5244



## Hyperparameter Tuning SVC Model

Use `GridSearchCV` to tune the model's parameters

In [13]:
from sklearn.model_selection import GridSearchCV
param_grid_SVC = {'C': [1, 5, 10, 50, 100],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid_SVC = GridSearchCV(model_SVC, param_grid_SVC, verbose=3)

In [14]:
# Train the model with GridSearch
grid_SVC.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.794, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.827, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.823, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.794, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.827, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.823, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.794, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.827, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    2.3s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50, 100],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [15]:
print(grid_SVC.best_params_)
print(grid_SVC.best_score_)

{'C': 100, 'gamma': 0.0001}
0.8752146536920435


In [16]:
bestSVC_model = SVC(C=100, kernel='linear', gamma=0.0001)

In [17]:
#Best SVC model results
bestSVC_model.fit(X_test_scaled, y_test)
print(f"Testing Data bestSVC_model Score: {bestSVC_model.score(X_test_scaled, y_test)}")

Testing Data bestSVC_model Score: 0.8905415713196033


# Deep Learning Model

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

deep_model = Sequential()
deep_model.add(Dense(units=6, activation='relu', input_dim=30))
deep_model.add(Dense(units=3, activation='softmax'))

In [19]:
deep_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 186       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 21        
Total params: 207
Trainable params: 207
Non-trainable params: 0
_________________________________________________________________


In [20]:
from keras.utils import to_categorical

# One-hot encoding y
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
print(y_test_categorical)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]]


Using TensorFlow backend.


In [21]:
# Compile and fit the model
deep_model.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])
deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 1747 samples
Epoch 1/100
1747/1747 - 0s - loss: 1.0183 - accuracy: 0.4282
Epoch 2/100
1747/1747 - 0s - loss: 0.9285 - accuracy: 0.5140
Epoch 3/100
1747/1747 - 0s - loss: 0.8647 - accuracy: 0.5152
Epoch 4/100
1747/1747 - 0s - loss: 0.8138 - accuracy: 0.5152
Epoch 5/100
1747/1747 - 0s - loss: 0.7586 - accuracy: 0.5272
Epoch 6/100
1747/1747 - 0s - loss: 0.6945 - accuracy: 0.6903
Epoch 7/100
1747/1747 - 0s - loss: 0.6269 - accuracy: 0.7624
Epoch 8/100
1747/1747 - 0s - loss: 0.5666 - accuracy: 0.7659
Epoch 9/100
1747/1747 - 0s - loss: 0.5169 - accuracy: 0.7665
Epoch 10/100
1747/1747 - 0s - loss: 0.4803 - accuracy: 0.7682
Epoch 11/100
1747/1747 - 0s - loss: 0.4532 - accuracy: 0.7716
Epoch 12/100
1747/1747 - 0s - loss: 0.4328 - accuracy: 0.7710
Epoch 13/100
1747/1747 - 0s - loss: 0.4179 - accuracy: 0.7934
Epoch 14/100
1747/1747 - 0s - loss: 0.4053 - accuracy: 0.7859
Epoch 15/100
1747/1747 - 0s - loss: 0.3960 - accuracy: 0.8088
Epoch 16/100
1747/1747 - 0s - loss: 0.3879 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x20003aa8d08>

In [22]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=3)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.31877187721880224, Accuracy: 0.8659420013427734


# Ramdon Forest Model

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)
# rf.fit(X_test_scaled, y_test)

print(f"Training RandonForest Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing RandonForest Data Score: {rf.score(X_test_scaled, y_test)}")

Training RandonForest Data Score: 0.9942759015455066
Testing RandonForest Data Score: 0.8686117467581999




In [24]:
param_grid_rf = {'n_estimators': [250, 300, 350],'max_depth': [125, 150, 175]}

grid_rf = GridSearchCV(rf, param_grid_rf, verbose=3)

In [25]:
grid_rf.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... max_depth=125, n_estimators=250, score=0.854, total=   0.7s
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=250, score=0.868, total=   0.7s
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=250, score=0.871, total=   0.7s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.859, total=   0.8s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.864, total=   0.8s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.869, total=   0.8s
[CV] max_depth=125, n_estimators=350 .................................
[CV] ..... max_depth=125, n_estimators=350, score=0.861, total=   1.0s
[CV] max_depth=125, n_estimators=350 .................................
[CV] ..... max_depth=125, n_estimators=350, score=0.863, total=   1.0s
[CV] max_depth=125, n_estimators=350 .................................
[CV] ..... max_depth=125, n_estimators=350, score=0.876, total=   1.0s
[CV] max_depth=150, n_estimators=250 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   22.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [26]:
print(grid_rf.best_params_)
print(grid_rf.best_score_)

{'max_depth': 150, 'n_estimators': 300}
0.86949055523755


In [27]:
prediction_rf = grid_rf.predict(X_test_scaled)
print(classification_report(y_test, prediction_rf,
                            target_names=['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']))

                precision    recall  f1-score   support

     CANDIDATE       0.79      0.77      0.78      1244
     CONFIRMED       0.83      0.80      0.81      1396
FALSE POSITIVE       0.97      1.00      0.98      2604

      accuracy                           0.89      5244
     macro avg       0.86      0.85      0.86      5244
  weighted avg       0.89      0.89      0.89      5244



In [28]:
best_rf = RandomForestClassifier(max_depth=150, n_estimators=250)


In [29]:
best_rf.fit(X_train_scaled, y_train)
print(f"Training Data best_RandomForest Score: {best_rf.score(X_train_scaled, y_train)}")
print(f"Testing Data best_RandomForest Score: {best_rf.score(X_test_scaled, y_test)}")

Training Data best_RandomForest Score: 1.0
Testing Data best_RandomForest Score: 0.8892067124332571


# Reducing the number of features (Top 21)

In [30]:
#using feature_imporntances_ to see the top21 most relevant features

top21_features = sorted(zip(best_rf.feature_importances_, df.columns), reverse=True)[:21]
top21_features

[(0.10651760310250921, 'koi_fpflag_ss'),
 (0.10215253076684883, 'koi_disposition'),
 (0.06746860017782709, 'koi_fpflag_nt'),
 (0.06241729204653372, 'koi_depth_err2'),
 (0.061284923994393094, 'koi_duration'),
 (0.04577805089237081, 'koi_impact'),
 (0.04356054189567794, 'koi_duration_err1'),
 (0.039109874735009494, 'koi_prad_err2'),
 (0.03641741470481821, 'koi_period_err2'),
 (0.03246086025253065, 'koi_time0bk'),
 (0.03243599294574473, 'koi_fpflag_co'),
 (0.032029660943530464, 'koi_fpflag_ec'),
 (0.0303212989281698, 'koi_period'),
 (0.030107239258624525, 'koi_impact_err1'),
 (0.028130196686651687, 'koi_time0bk_err2'),
 (0.023610370138148866, 'koi_period_err1'),
 (0.02287325780233609, 'koi_impact_err2'),
 (0.022500095329861202, 'koi_depth_err1'),
 (0.02247964809008997, 'koi_duration_err2'),
 (0.021340024380585708, 'koi_depth'),
 (0.01775599901316504, 'koi_insol_err2')]

In [31]:
# using the top21 features
features = []
for data in top21_features:
    features.append(data[1]) 

    
top21 = df[features]

X1 = top21.drop(["koi_disposition"], axis=1)
y1 = top21["koi_disposition"].values.reshape(-1, 1)
print(X1.shape, y1.shape)

(6991, 20) (6991, 1)


In [32]:
label_encoder = LabelEncoder()
label_encoder.fit(y1)
encoded_y1 = label_encoder.transform(y1).reshape(-1,1).ravel()
encoded_y1


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([1, 2, 2, ..., 0, 2, 2])

In [33]:
#split the data
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, encoded_y1,train_size=0.25, random_state=42)

In [34]:
X1_scaler = MinMaxScaler().fit(X1_train)
X1_train_scaled = X1_scaler.transform(X1_train)
X1_test_scaled = X1_scaler.transform(X1_test)

### SVC Top21

In [35]:
from sklearn.svm import SVC 
model_SVC_top21 = SVC(kernel='linear')
model_SVC_top21.fit(X1_train_scaled, y1_train)
predictions1 = model_SVC_top21.predict(X1_test)

In [36]:
print(f"Training SVC_top21 Data Score: {model_SVC_top21.score(X1_train_scaled, y1_train)}")
print(f"Testing SVC_top21 Data Score: {model_SVC_top21.score(X1_test_scaled, y1_test)}")

Training SVC_top21 Data Score: 0.7985117344018318
Testing SVC_top21 Data Score: 0.8037757437070938


In [37]:
# Train the model1 with GridSearch
grid_top21 = GridSearchCV(model_SVC_top21, param_grid_SVC, verbose=3)
grid_top21.fit(X1_train_scaled, y1_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.779, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.799, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.792, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.779, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.799, total=   0.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.792, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .................... C=1, gamma=0.005, score=0.779, total=   0.0s
[CV] C=1, gamma=0.005 ................................................
[CV] .................... C=1, gamma=0.005, score=0.799, total=   0.0s
[CV] C=1, gamma=0.005 ................................................
[CV] .................... C=1, gamma=0.005, score=0.792, total=   0.0s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.813, total=   0.0s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.844, total=   0.0s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.824, total=   0.0s
[CV] C=5, gamma=0.0005 ...............................................
[CV] ................... C=5, gamma=0.0005, score=0.813, total=   0.0s
[CV] C=5, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    1.2s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50, 100],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [38]:
print(grid_top21.best_params_)
print(grid_top21.best_score_)

{'C': 100, 'gamma': 0.0001}
0.8746422438465942


In [39]:
bestSVC_model_top21 = SVC(C=100, kernel='linear', gamma=0.0001)
#Best SVC model results
bestSVC_model_top21.fit(X1_test_scaled, y1_test)
print(f"Testing bestSVC_top21 Data Score:: {bestSVC_model_top21.score(X1_test_scaled, y1_test)}")

Testing bestSVC_top21 Data Score:: 0.8829138062547673


### Deep Learning Top21

In [40]:
deep_model_top21 = Sequential()
deep_model_top21.add(Dense(units=6, activation='relu', input_dim=20))
deep_model_top21.add(Dense(units=3, activation='softmax'))

In [41]:
# One-hot encoding y
y1_train_categorical = to_categorical(y1_train)
y1_test_categorical = to_categorical(y1_test)
print(y_test_categorical)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [42]:
# Compile and fit the model
deep_model_top21.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])
deep_model_top21.fit(
    X1_train_scaled,
    y1_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 1747 samples
Epoch 1/100
1747/1747 - 0s - loss: 1.1762 - accuracy: 0.2914
Epoch 2/100
1747/1747 - 0s - loss: 0.9907 - accuracy: 0.5495
Epoch 3/100
1747/1747 - 0s - loss: 0.9002 - accuracy: 0.6846
Epoch 4/100
1747/1747 - 0s - loss: 0.8158 - accuracy: 0.7401
Epoch 5/100
1747/1747 - 0s - loss: 0.7460 - accuracy: 0.7556
Epoch 6/100
1747/1747 - 0s - loss: 0.6858 - accuracy: 0.7596
Epoch 7/100
1747/1747 - 0s - loss: 0.6338 - accuracy: 0.7624
Epoch 8/100
1747/1747 - 0s - loss: 0.5906 - accuracy: 0.7653
Epoch 9/100
1747/1747 - 0s - loss: 0.5549 - accuracy: 0.7653
Epoch 10/100
1747/1747 - 0s - loss: 0.5265 - accuracy: 0.7653
Epoch 11/100
1747/1747 - 0s - loss: 0.5031 - accuracy: 0.7653
Epoch 12/100
1747/1747 - 0s - loss: 0.4844 - accuracy: 0.7653
Epoch 13/100
1747/1747 - 0s - loss: 0.4693 - accuracy: 0.7653
Epoch 14/100
1747/1747 - 0s - loss: 0.4563 - accuracy: 0.7653
Epoch 15/100
1747/1747 - 0s - loss: 0.4459 - accuracy: 0.7653
Epoch 16/100
1747/1747 - 0s - loss: 0.4365 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x200040a6c08>

In [43]:
model_loss, model_accuracy = deep_model_top21.evaluate(
    X1_test_scaled, y1_test_categorical, verbose=3)
print(
    f"Top 21 Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Top 21 Normal Neural Network - Loss: 0.36154555974344915, Accuracy: 0.845919132232666


### RandomForest Top21

In [44]:
# rf = RandomForestClassifier()
rf_top21 = rf.fit(X1_train_scaled, y1_train)
print(f"Training RandonForest Data Score: {rf_top21.score(X1_train_scaled, y1_train)}")
print(f"Testing RandonForest Data Score: {rf_top21.score(X1_test_scaled, y1_test)}")

Training RandonForest Data Score: 0.994848311390956
Testing RandonForest Data Score: 0.8535469107551488


In [45]:
grid_rf_top21 = GridSearchCV(rf_top21, param_grid_rf, verbose=3)

In [46]:
grid_rf_top21.fit(X1_train_scaled, y1_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... max_depth=125, n_estimators=250, score=0.868, total=   0.7s
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=250, score=0.883, total=   0.6s
[CV] max_depth=125, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=250, score=0.862, total=   0.6s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.873, total=   0.7s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.877, total=   0.7s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.867, total=   0.7s
[CV] max_depth=125, n_estimators=350 .................................
[CV] ..... max_depth=125, n_estimators=350, score=0.868, total=   0.8s
[CV] max_depth=125, n_estimators=350 .................................
[CV] ..... max_depth=125, n_estimators=350, score=0.883, total=   0.8s
[CV] max_depth=125, n_estimators=350 .................................
[CV] ..... max_depth=125, n_estimators=350, score=0.874, total=   0.8s
[CV] max_depth=150, n_estimators=250 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   18.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [47]:
print(grid_rf_top21.best_params_)
print(grid_rf_top21.best_score_)

{'max_depth': 150, 'n_estimators': 350}
0.8780767029192902


In [51]:
best_rf_top21 = RandomForestClassifier(max_depth = 150, n_estimators= 350)

In [53]:
best_rf_top21.fit(X1_train_scaled, y1_train)
print(f"Training Data best_RandomForest_top21 Score: {best_rf_top21.score(X1_train_scaled, y1_train)}")
print(f"Testing Data best_RandomForest_top21 Score: {best_rf_top21.score(X1_test_scaled, y1_test)}")

Training Data best_RandomForest_top21 Score: 1.0
Testing Data best_RandomForest_top21 Score: 0.8836765827612509


# Save the Model

In [54]:
# save the best model
import joblib
filename = 'best_model_top21.h5'
joblib.dump(best_rf_top21, filename)

['best_model_top21.h5']