# Import of required libraries 

In [29]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import RandomOverSampler


In [30]:

from collections import Counter

# Load of Dataset

In [31]:
train_data = pd.read_csv("../input/kepler-labelled-time-series-data/exoTrain.csv") 
test_data=pd.read_csv('../input/kepler-labelled-time-series-data/exoTest.csv')
train_data.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,2,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,2,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


So, it's clearly seems it contains 3197 Flux Values and 1 column for Label

Reading the description I found : 
*"Each star has a binary label of 2 or 1. 2 indicated that that the star is confirmed to have at least one exoplanet in orbit; some observations are in fact multi-planet systems"
*

In [32]:
train_data['LABEL'].value_counts()

1    5050
2      37
Name: LABEL, dtype: int64

In [33]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087 entries, 0 to 5086
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 124.1 MB


In [34]:
nulls = train_data.isnull().sum()
nulls[nulls > 0]

Series([], dtype: int64)

So, No Null values :)

# Split in datasets

In [35]:
x_train=train_data.drop('LABEL',axis=1)
y_train=train_data[['LABEL']]

In [36]:
os=RandomOverSampler(0.8)
x_train_ns,y_train_ns=os.fit_sample(x_train,y_train)

In [37]:
y_train_ns['LABEL'].value_counts()

1    5050
2    4040
Name: LABEL, dtype: int64

In [38]:
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({'LABEL': 1})
The number of classes after fit Counter({'LABEL': 1})


In [39]:
x_test=test_data.drop('LABEL',axis=1)
y_test=test_data[['LABEL']]

# Defining Model Function

In [40]:
def model(algo):
    algo_model = algo.fit(x_train_ns, y_train_ns)
    global y_prob, y_pred
    y_prob = algo.predict_proba(x_test)
    y_pred = algo_model.predict(x_test)

    print('Accuracy Score: {}\n\nConfusion Matrix:\n {}'
      .format(accuracy_score(y_test,y_pred), confusion_matrix(y_test,y_pred),roc_auc_score(y_test,y_pred)))

## Decision Tree

In [41]:
print('Decision Tree\n')
model(DecisionTreeClassifier(max_depth = 12))

Decision Tree

Accuracy Score: 0.9719298245614035

Confusion Matrix:
 [[553  12]
 [  4   1]]


In [42]:
print('Decision Tree\n')
model(DecisionTreeClassifier(max_depth = 6))

Decision Tree

Accuracy Score: 0.9473684210526315

Confusion Matrix:
 [[539  26]
 [  4   1]]


## Random Forest

In [43]:
print('Random Forest\n')
model(RandomForestClassifier())

Random Forest



  


Accuracy Score: 0.9912280701754386

Confusion Matrix:
 [[565   0]
 [  5   0]]


Great!! That's a very good accuracy

## XGBoost

In [52]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()
xgb_classifier.fit(x_train_ns, y_train_ns)
y_pred_xgb = xgb_classifier.predict(x_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(accuracy_xgb)

0.9912280701754386


In [45]:
xgb_params={
 "learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] 
}

In [46]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()

In [47]:
from sklearn.model_selection import RandomizedSearchCV

xgb_random_search = RandomizedSearchCV(xgb_classifier, param_distributions = xgb_params,
                                       scoring= 'roc_auc',
                                       n_jobs= -1, verbose= 3)

xgb_random_search.fit(x_train_ns, y_train_ns)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 24.1min finished
  return f(**kwargs)


RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_job...
                                           random_state=None, reg_alpha=None,
                                           reg_lambda=None,
                                           scale_pos_weight=No

In [48]:
xgb_random_search.best_params_

{'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 0.3,
 'colsample_bytree': 0.3}

In [50]:
tuned_xgb_classifier = XGBClassifier(min_child_weight = 5,
                                     max_depth = 3,
                                     learning_rate = 0.05,
                                     gamma = 0.3,
                                     colsample_bytree = 0.3)
tuned_xgb_classifier.fit(x_train_ns, y_train_ns)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [51]:
y_pred_tuned_xgb = tuned_xgb_classifier.predict(x_test)
accuracy_tuned_xgb = accuracy_score(y_test, y_pred_tuned_xgb)
print(accuracy_tuned_xgb)

0.9894736842105263


In [49]:
print("Done")

Done
