In [1]:
################################
# Scientific imports
###
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

###
# General imports
###
import csv, math, io, os, os.path, sys, random, time
import pandas as pd
import seaborn as sb
from tqdm.notebook import tqdm, trange

###
# SciKitLearn Imports
###
import sklearn
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline

from IPython.display import display

###
# Plot defaults: large figure size, then seaborn's default theme
###

plt.rc("figure", figsize=(20, 9))
sb.set()

In [2]:
# Load the four saved NumPy arrays from the working directory.
# fluxarr supplies the feature matrix and isplanetarr the labels for the
# train/test split further down the notebook.
fitsarr, fluxarr, planetarr, isplanetarr = map(
    np.load,
    ("fitslist.npy", "fluxlist.npy", "planetlist.npy", "isplanetlist.npy"),
)

In [3]:
# Candidate SVC hyper-parameters: six values of C and six values of gamma.
# They are kept in their own lists, outside the grid, so the search space can
# be changed in one place later on.
svcC = [1, 5, 10, 25, 50, 75]
svcG = [0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.0075]

# The "svc__" prefix routes each parameter to the SVC step of the pipeline.
# Given as a dict of lists, a grid search tries every (C, gamma) combination
# (6 x 6 = 36 candidates), not six matched pairs.
param_grid = {
    'svc__C': svcC,
    'svc__gamma': svcG,
}

# Echo every candidate value, one "<parameter> :: <value>" line each
for name, candidates in param_grid.items():
    for value in candidates:
        print(f"{name} :: {value}")

svc__C :: 1
svc__C :: 5
svc__C :: 10
svc__C :: 25
svc__C :: 50
svc__C :: 75
svc__gamma :: 0.0001
svc__gamma :: 0.0005
svc__gamma :: 0.001
svc__gamma :: 0.0025
svc__gamma :: 0.005
svc__gamma :: 0.0075


In [4]:
# Sanity check: display the grid as (parameter name, candidate values) pairs
param_grid.items()

dict_items([('svc__C', [1, 5, 10, 25, 50, 75]), ('svc__gamma', [0.0001, 0.0005, 0.001, 0.0025, 0.005, 0.0075])])

In [5]:
# Build the PCA -> SVC pipeline that the grid search tunes.
# make_pipeline names each step after its lower-cased class name, which is why
# param_grid addresses the classifier's settings as svc__C and svc__gamma.
pca = PCA(svd_solver='randomized', n_components=2, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

# Sort data into Test and Train (train_test_split is imported in the first cell).
# The fixed random_state keeps the split identical between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(fluxarr, isplanetarr, random_state=42)

# IsPlanetArr is an array of TRUEs and FALSEs -- keep a 0/1 integer copy as well.
# NOTE(review): assumes isplanetarr is a boolean array. Nothing in this cell
# consumes the copy; the split above still uses the original labels.
isplanetarrbin = np.asarray(isplanetarr).astype(int)
#plt.hist(isplanetarrbin)

# Fit the PCA step on its own, purely to inspect the projection. PCA is
# unsupervised, so no labels are passed. This standalone fit does not feed the
# grid search: GridSearchCV clones the pipeline and refits PCA for every candidate.
pca.fit(Xtrain)

# Project the training samples onto the 2 whitened principal components
# (one row per sample, one column per component) and display the result.
pca.transform(Xtrain)

array([[ 0.74475579, -0.1849862 ],
       [-0.13956263,  0.03318039],
       [-0.16695046,  0.03208855],
       ...,
       [-0.16671307,  0.03203889],
       [-0.07404131,  0.12968034],
       [-0.09193118,  0.01484429]])

In [6]:
# So svc__C = 50, svc__gamma = 0.005 is best, which is the highest of each grid. Let's try more to see
from sklearn.model_selection import GridSearchCV
#param_grid = {'svc__C': [50],
#              'svc__gamma': [0.005]}
grid = GridSearchCV(model, param_grid)

%time grid.fit(Xtrain, ytrain)
print(grid.best_params_)

CPU times: user 2h 40min 5s, sys: 1h 1min 6s, total: 3h 41min 11s
Wall time: 35min 9s
{'svc__C': 50, 'svc__gamma': 0.005}


Need to find a way to output the True Positive Rate (TPR, aka Recall) and False Positive Rate (FPR) from my models, based on different svc__ parameters.  
Therefore, need to run my model at each set of params, and then record the TPR and FPR at each interval.  
This can be done by making a confusion matrix:  
``from sklearn.metrics import confusion_matrix``  
``mat = confusion_matrix(ytest, yfitnew)``  

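And using the following (scikit-learn puts the true labels on the rows and the predicted labels on the columns, with labels in sorted order, so index 0 = not-planet and index 1 = planet):  
True Negative (TN)  = mat[0][0]  
False Positive (FP) = mat[0][1]  
False Negative (FN) = mat[1][0]  
True Positive (TP)  = mat[1][1]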

We can work out the TPR and FPR values, $\mathrm{TPR} = TP / (TP + FN)$ and $\mathrm{FPR} = FP / (FP + TN)$, and record them somewhere.  
It's a tad convoluted, but it will work. I just need to work out how to loop through the values in my `param_grid`.