### **MODEL TRAINING**

In [104]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import classification_report

In [105]:
Kepler_data=pd.read_csv('Training_data.csv')

In [106]:
Kepler_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7325 entries, 0 to 7324
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   koi_period       7325 non-null   float64
 1   koi_time0bk      7325 non-null   float64
 2   koi_duration     7325 non-null   float64
 3   koi_depth        7325 non-null   float64
 4   ra               7325 non-null   float64
 5   dec              7325 non-null   float64
 6   koi_kepmag       7325 non-null   float64
 7   koi_disposition  7325 non-null   object 
dtypes: float64(7), object(1)
memory usage: 457.9+ KB


In [107]:
X=Kepler_data.drop(columns=['koi_disposition'])
y=Kepler_data['koi_disposition']

In [108]:
from sklearn.preprocessing import LabelEncoder

In [109]:
le=LabelEncoder()
le.fit(y)
y_trf=le.transform(y)

In [110]:
le.classes_

array(['CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [111]:
X_train,X_test,y_train,y_test=train_test_split(X,y_trf,test_size=0.2,random_state=42,stratify=y)

In [112]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((5860, 7), (1465, 7), (5860,), (1465,))

In [113]:
from sklearn.ensemble import RandomForestClassifier

In [114]:
rdf=RandomForestClassifier(n_estimators=200,criterion='gini',max_depth=None,bootstrap=True,oob_score=True)

In [115]:
rdf.fit(X_train,y_train)

In [116]:
y_predict=rdf.predict(X_test)

In [117]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       549
           1       0.91      0.88      0.90       916

    accuracy                           0.87      1465
   macro avg       0.86      0.87      0.86      1465
weighted avg       0.87      0.87      0.87      1465



In [118]:
cross_val_score(estimator=rdf,X=X,y=y,cv=10,scoring='accuracy').mean()

np.float64(0.8266469855895752)

In [119]:
rdf.feature_importances_

array([0.22927612, 0.13116923, 0.14858814, 0.2290262 , 0.09939285,
       0.08286412, 0.07968333])

The random forest couldnot give much of the accuracy thus we will go to other models.

In [120]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier as GBC

### **MAKING GRADIENT BOOSTING MODEL**

In [121]:
param_grid = {
    'n_estimators': [20,50,100,120,140,150,180,200],
    'learning_rate': [0.01, 0.1,0.001],
    'max_depth': [3, 5,None]
}

gbc = GBC(random_state=42)

gs = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    scoring=['accuracy','precision_macro','recall_macro','f1_macro'],
    refit='f1_macro',   # final model chosen by f1_macro
    cv=5,
    n_jobs=-1,
    verbose=1
)

# fit on training data (use X_train, y_train from your notebook)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best f1_macro (cv):", gs.best_score_)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}
Best f1_macro (cv): 0.8366395957233376


In [122]:
param_grid = {
    'n_estimators': [20,50,100,120,140,150,180,200],
    'criterion':['gini'],
    'max_depth': [3, 5,6,7,8,9,10,None],
    'min_samples_leaf':[1,2,3,4],
    'oob_score':[True,False],
    'bootstrap':[True,False],
    'max_samples':[0,0.1,0.5,0.8,1]
}

rfc = RandomForestClassifier(random_state=42)

gs2 = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring=['accuracy','precision_macro','recall_macro','f1_macro'],
    refit='f1_macro',   # final model chosen by f1_macro
    cv=5,
    n_jobs=-1,
    verbose=1
)

# fit on training data (use X_train, y_train from your notebook)
gs2.fit(X_train, y_train)

print("Best params:", gs2.best_params_)
print("Best f1_macro (cv):", gs2.best_score_)


Fitting 5 folds for each of 5120 candidates, totalling 25600 fits


15360 fits failed out of a total of 25600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_valid

Best params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_samples': 0.5, 'min_samples_leaf': 1, 'n_estimators': 150, 'oob_score': True}
Best f1_macro (cv): 0.8337660364906497


In [123]:
from xgboost import XGBClassifier
# X_train, X_test, y_train, y_test should already exist in the notebook
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200,500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# fit
gs_xgb.fit(X_train, y_train)

# results
print("Best params:", gs_xgb.best_params_)
print("Best f1_macro (cv):", gs_xgb.best_score_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Best f1_macro (cv): 0.8453032057984998


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [124]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin, clone

class OOFStackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_models, meta_model, n_splits=5, random_state=42):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_splits = n_splits
        self.random_state = random_state
        self.fitted_base_models = []  # will hold final trained clones of base models

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        oof_preds = np.zeros((n_samples, len(self.base_models)))

        # Out-of-fold predictions
        for i, model in enumerate(self.base_models):
            oof = np.zeros(n_samples)
            for train_idx, val_idx in skf.split(X, y):
                mdl_clone = clone(model)
                mdl_clone.fit(X[train_idx], y[train_idx])
                oof[val_idx] = mdl_clone.predict_proba(X[val_idx])[:, 1]
            oof_preds[:, i] = oof

        # Train meta-model on OOF predictions
        self.meta_model.fit(oof_preds, y)

        # Retrain base models on full dataset
        self.fitted_base_models = [clone(m).fit(X, y) for m in self.base_models]

        return self

    def predict_proba(self, X):
        X = np.asarray(X)
        # Get predictions from fully trained base models
        meta_features = np.column_stack([
            m.predict_proba(X)[:, 1] for m in self.fitted_base_models
        ])
        return self.meta_model.predict_proba(meta_features)

    def predict(self, X):
        return (self.predict_proba(X)[:, 1] > 0.5).astype(int)


In [141]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Define base learners
rf = RandomForestClassifier(bootstrap= True, criterion= 'gini', max_depth= None, 
                            max_samples= 0.5, min_samples_leaf= 1, n_estimators= 150, oob_score=True)

xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                    subsample=1, colsample_bytree=1,
                    eval_metric='logloss', use_label_encoder=False, random_state=42)

gb = GradientBoostingClassifier(learning_rate= 0.5, max_depth= 5, n_estimators= 150,
                                 subsample=1, random_state=42)


lgb = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                     subsample=0.8, colsample_bytree=0.8, random_state=42)

svc = SVC(C=2.0, kernel='rbf', probability=True, random_state=42)

base_models = [rf, xgb, gb, lgb, svc]

# Option 1: Logistic Regression as meta learner
meta_log = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=500, random_state=42)

# Option 2: Random Forest as meta learnern
meta_rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)

# Build stacking model (just swap meta model here)
stack_clf = OOFStackingClassifier(base_models=base_models, meta_model=meta_rf, n_splits=5)

# Fit and predict
stack_clf.fit(X_train, y_train)
y_pred = stack_clf.predict(X_test)
y_pred_proba = stack_clf.predict_proba(X_test)[:, 1]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 2932, number of negative: 1756
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 4688, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625427 -> initscore=0.512646
[LightGBM] [Info] Start training from score 0.512646




[LightGBM] [Info] Number of positive: 2932, number of negative: 1756
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 4688, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625427 -> initscore=0.512646
[LightGBM] [Info] Start training from score 0.512646




[LightGBM] [Info] Number of positive: 2932, number of negative: 1756
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 4688, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625427 -> initscore=0.512646
[LightGBM] [Info] Start training from score 0.512646




[LightGBM] [Info] Number of positive: 2932, number of negative: 1756
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 4688, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625427 -> initscore=0.512646
[LightGBM] [Info] Start training from score 0.512646
[LightGBM] [Info] Number of positive: 2932, number of negative: 1756
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 4688, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625427 -> initscore=0.512646
[LightGBM] [Info] Start training from score 0.512646


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3665, number of negative: 2195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000361 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 5860, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625427 -> initscore=0.512646
[LightGBM] [Info] Start training from score 0.512646




In [142]:
from sklearn.metrics import accuracy_score

In [143]:
accuracy_score(y_test,y_pred)

0.868259385665529

In [144]:
tess_df=pd.read_csv('testing_tess.csv')

In [145]:
X=tess_df.drop(columns=['koi_disposition'])
y=tess_df['koi_disposition']

In [146]:
X.shape,y.shape

((2527, 7), (2527,))

In [147]:
y_pred3=stack_clf.predict(X)



In [148]:
accuracy_score(y,y_pred3)

0.4665611396913336