In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from scipy import ndimage, fft
from pathlib import Path
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler



In [0]:
from sklearn.svm import LinearSVC

In [0]:
df_train = pd.read_csv('exoTrain.csv', encoding = "ISO-8859-1")
df_dev = pd.read_csv('exoTest.csv', encoding = "ISO-8859-1")

In [4]:
df_train.shape

(5087, 3198)

In [0]:
df_train_x = df_train.drop('LABEL', axis=1)
df_train_y = df_train.LABEL
df_dev_x = df_dev.drop('LABEL', axis=1)
df_dev_y = df_dev.LABEL

In [0]:
class LightFluxProcessor:

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        self.fourier = fourier
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, X):
        return np.abs(fft(X, n=X.size))

    def process(self, df_train_x, df_dev_x):
        # Fourier transform
        if self.fourier:
            print("Applying Fourier...")
            df_train_x = df_train_x.apply(self.fourier_transform,axis=1)
            df_dev_x = df_dev_x.apply(self.fourier_transform,axis=1)

            
            df_train_x = df_train_x.iloc[:,:(df_train_x.shape[1]//2)].values
            df_dev_x = df_dev_x.iloc[:,:(df_dev_x.shape[1]//2)].values

        # Normalize
        if self.normalize:
            print("Normalizing...")
            df_train_x = pd.DataFrame(normalize(df_train_x))
            df_dev_x = pd.DataFrame(normalize(df_dev_x))

        # Gaussian filter to smooth out data
        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = ndimage.filters.gaussian_filter(df_train_x, sigma=10)
            df_dev_x = ndimage.filters.gaussian_filter(df_dev_x, sigma=10)

        if self.standardize:
            # Standardize X data
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_dev_x = std_scaler.transform(df_dev_x)

        print("Finished Processing!")
        return df_train_x, df_dev_x


In [0]:
def np_X_Y_from_df(df):
    df = shuffle(df)
    df_X = df.drop(['LABEL'], axis=1)
    X = np.array(df_X)
    Y_raw = np.array(df['LABEL']).reshape((len(df['LABEL']),1))
    Y = Y_raw == 2
    return X, Y

In [8]:
LFP = LightFluxProcessor(
    fourier=True,
    normalize=True,
    gaussian=True,
    standardize=True)
df_train_x, df_dev_x = LFP.process(df_train_x, df_dev_x)

Applying Fourier...
Normalizing...
Applying Gaussian Filter...
Standardizing...
Finished Processing!


In [0]:
# Rejoin X and Y
df_train_processed = pd.DataFrame(df_train_x).join(pd.DataFrame(df_train_y))
df_dev_processed = pd.DataFrame(df_dev_x).join(pd.DataFrame(df_dev_y))

In [0]:
X_train, Y_train = np_X_Y_from_df(df_train_processed)
X_dev, Y_dev = np_X_Y_from_df(df_dev_processed)

In [20]:
print("X_train.shape: ", X_train.shape)
print("Y_train.shape: ", Y_train.shape)
print("X_test.shape: ", X_dev.shape)
print("Y_test.shape: ", Y_dev.shape)

X_train.shape:  (5087, 1598)
Y_train.shape:  (5087, 1)
X_test.shape:  (570, 1598)
Y_test.shape:  (570, 1)


In [23]:
!pip install -U imblearn

Collecting imblearn
  Downloading https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
[?25l  Downloading https://files.pythonhosted.org/packages/c5/ea/f027ceb21114abe8189a2804640b2d5dd49a7a271c4814695482c5bc94d8/imbalanced_learn-0.4.2-py3-none-any.whl (166kB)
[K    100% |████████████████████████████████| 174kB 7.0MB/s 
[?25hCollecting scikit-learn>=0.20 (from imbalanced-learn->imblearn)
[?25l  Downloading https://files.pythonhosted.org/packages/0c/b2/05be9b6da9ae4a4c54f537be22e95833f722742a02b1e355fdc09363877c/scikit_learn-0.20.0-cp36-cp36m-manylinux1_x86_64.whl (5.3MB)
[K    100% |████████████████████████████████| 5.3MB 7.5MB/s 
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Found existing installation: scikit-learn 0.19.2
    Uninstalling scikit-learn-0.19.2:
      Successfully uninstalled scikit-learn-0.19.2
Successfully

In [0]:
from imblearn.over_sampling import SMOTE

In [25]:
model = LinearSVC()
X_train_sm, Y_train_sm = X_train, Y_train

# Train
print("Training...")
model.fit(X_train_sm, Y_train_sm)

train_outputs = model.predict(X_train_sm)
dev_outputs = model.predict(X_dev)
print("Finished Training!")

  y = column_or_1d(y, warn=True)


Training...


  y = column_or_1d(y, warn=True)


Finished Training!


In [26]:
X_train_sm.shape

(5087, 1598)

In [27]:
#Overfitting
s=model.predict(X_train_sm)
confusion_matrix(Y_train_sm,s)

array([[5050,    0],
       [   0,   37]])

In [0]:
#Now lets test on dataset

In [29]:
confusion_matrix(Y_dev, dev_outputs)

array([[561,   4],
       [  0,   5]])

In [19]:
accuracy_score(Y_dev, dev_outputs)

0.9929824561403509

In [0]:
#This accuracy doesn't tell anything because we have only 5 confirmed exoplanet

In [31]:
print(classification_report(Y_dev, dev_outputs))

             precision    recall  f1-score   support

      False       1.00      0.99      1.00       565
       True       0.56      1.00      0.71         5

avg / total       1.00      0.99      0.99       570



In [0]:
from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier(n_neighbors=3)

In [33]:

X_train_sm2, Y_train_sm2 = X_train, Y_train

# Train
print("Training...")
model2.fit(X_train_sm2, Y_train_sm2)

train_outputs = model2.predict(X_train_sm2)
dev_outputs = model2.predict(X_dev)
print("Finished Training!")

Training...


  


Finished Training!


In [34]:
confusion_matrix(Y_dev, dev_outputs)

array([[565,   0],
       [  5,   0]])

In [0]:
# Hence linearSVC give better results