# Importing all required libraries and processing given dataset

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

import pandas as pd
import numpy as np

data = pd.read_csv('kepler-tess.csv').dropna()
#disposition as target variable
y = data.pop('disp')
X = data

#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#feature scaling
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
knn = KNeighborsClassifier(n_neighbors=23)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=8)
sgd = SGDClassifier()
svm = SVC(kernel='rbf', C=1.0, gamma=0.1, probability=True)

gbc = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=8)


model = VotingClassifier(
    estimators=[('xgb', xgb), ('gbc', gbc)],
    voting='hard'
)

# Model tuning through GridSearchCV

In [7]:
#XGBClassifier tuning
xgb_param_grid = {
    'n_estimators':[50, 100, 150, 200],
    'learning_rate':[0.1, 0.05, 0.01],
    'max_depth':[3, 5, 8, 10]
}

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    cv=5,              # 5-fold CV
    scoring='accuracy', # Metric to optimize
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
grid_predict = grid_search.predict(X_test)
print('After tuning: ', accuracy_score(y_test, grid_predict))

After tuning:  0.7839370078740158


In [3]:
#GradientBoostingClassifier tuning
gbc_param_grid = {
    'n_estimators': [10, 100, 200],
    'max_depth': [3, 5, 8],
    'learning_rate': [0.1, 0.05, 0.01]
}

grid_search = GridSearchCV(
    estimator=gbc,
    param_grid=gbc_param_grid,
    cv=5,              # 5-fold CV
    scoring='accuracy', # Metric to optimize
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
grid_predict = grid_search.predict(X_test)
print('After tuning: ', accuracy_score(y_test, grid_predict))

After tuning:  0.7899212598425197


# Studying the metrics of trained models

In [21]:
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print("XGB report", classification_report(y_test, xgb_pred))

XGB report               precision    recall  f1-score   support

           0       0.79      0.65      0.71      1312
           1       0.78      0.88      0.83      1863

    accuracy                           0.78      3175
   macro avg       0.78      0.76      0.77      3175
weighted avg       0.78      0.78      0.78      3175



In [23]:
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
print("Gradient Boosting Classifier report", classification_report(y_test, gbc_pred))

Gradient Boosting Classifier report               precision    recall  f1-score   support

           0       0.80      0.66      0.72      1312
           1       0.79      0.88      0.83      1863

    accuracy                           0.79      3175
   macro avg       0.79      0.77      0.78      3175
weighted avg       0.79      0.79      0.79      3175



In [22]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("RF report", classification_report(y_test, rf_pred))

RF report               precision    recall  f1-score   support

           0       0.80      0.65      0.72      1312
           1       0.78      0.89      0.83      1863

    accuracy                           0.79      3175
   macro avg       0.79      0.77      0.77      3175
weighted avg       0.79      0.79      0.78      3175



In [17]:
model.fit(X_train, y_train)
model_pred = model.predict(X_test)
print(f"Report:", classification_report(y_test, model_pred))

Report:               precision    recall  f1-score   support

           0       0.76      0.70      0.73      1312
           1       0.80      0.84      0.82      1863

    accuracy                           0.78      3175
   macro avg       0.78      0.77      0.77      3175
weighted avg       0.78      0.78      0.78      3175

