### **MODEL TRAINING**

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import classification_report

In [12]:
Kepler_data=pd.read_csv('Combined.csv')

In [13]:
Kepler_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9716 entries, 0 to 9715
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   koi_period       9716 non-null   float64
 1   koi_duration     9716 non-null   float64
 2   koi_depth        9716 non-null   float64
 3   koi_model_snr    9716 non-null   float64
 4   koi_prad         9716 non-null   float64
 5   koi_steff        9716 non-null   float64
 6   koi_srad         9716 non-null   float64
 7   koi_disposition  9716 non-null   float64
dtypes: float64(8)
memory usage: 607.4 KB


In [14]:
X=Kepler_data.drop(columns=['koi_disposition'])
y=Kepler_data['koi_disposition']

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le=LabelEncoder()
le.fit(y)
y_trf=le.transform(y)

In [17]:
le.classes_

array([0., 1.])

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y_trf,test_size=0.2,random_state=42,stratify=y)

In [19]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((7772, 7), (1944, 7), (7772,), (1944,))

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rdf=RandomForestClassifier(n_estimators=200,criterion='gini',max_depth=None,bootstrap=True,oob_score=True)

In [22]:
rdf.fit(X_train,y_train)

In [23]:
y_predict=rdf.predict(X_test)

In [24]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       797
           1       0.93      0.91      0.92      1147

    accuracy                           0.90      1944
   macro avg       0.90      0.90      0.90      1944
weighted avg       0.90      0.90      0.90      1944



In [25]:
cross_val_score(estimator=rdf,X=X,y=y,cv=10,scoring='accuracy').mean()

np.float64(0.8817231609684981)

In [26]:
rdf.feature_importances_

array([0.19830332, 0.1107489 , 0.09277452, 0.17517658, 0.2212534 ,
       0.09771396, 0.10402933])

The random forest couldnot give much of the accuracy thus we will go to other models.

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier as GBC

### **MAKING GRADIENT BOOSTING MODEL**

In [28]:
param_grid = {
    'n_estimators': [100,120,150,180,200,500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5,None]
}

gbc = GBC(random_state=42)

gs = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    scoring=['accuracy','precision_macro','recall_macro','f1_macro'],
    refit='f1_macro',   # final model chosen by f1_macro
    cv=5,
    n_jobs=-1,
    verbose=1
)

# fit on training data (use X_train, y_train from your notebook)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best f1_macro (cv):", gs.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best f1_macro (cv): 0.8902639281333572


In [29]:
param_grid = {
    'n_estimators': [100,120,500,150,180,200],
    'criterion':['gini'],
    'max_depth': [3, 5,6,10,None],
    'min_samples_leaf':[1,2,4],
    'oob_score':[True,False],
    'bootstrap':[True,False],
    'max_samples':[0,0.1,0.5,0.8,1]
}

rfc = RandomForestClassifier(random_state=42)

gs2 = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring=['accuracy','precision_macro','recall_macro','f1_macro'],
    refit='f1_macro',   # final model chosen by f1_macro
    cv=5,
    n_jobs=-1,
    verbose=1
)

# fit on training data (use X_train, y_train from your notebook)
gs2.fit(X_train, y_train)

print("Best params:", gs2.best_params_)
print("Best f1_macro (cv):", gs2.best_score_)


Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


5400 fits failed out of a total of 9000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1800 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Acer\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validat

Best params: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_samples': 0.8, 'min_samples_leaf': 1, 'n_estimators': 500, 'oob_score': True}
Best f1_macro (cv): 0.8909114297899794


In [30]:
from xgboost import XGBClassifier
# X_train, X_test, y_train, y_test should already exist in the notebook
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200,500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

gs_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    refit='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# fit
gs_xgb.fit(X_train, y_train)

# results
print("Best params:", gs_xgb.best_params_)
print("Best f1_macro (cv):", gs_xgb.best_score_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best f1_macro (cv): 0.8976190541398129


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [31]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin, clone

class OOFStackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_models, meta_model, n_splits=5, random_state=42):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_splits = n_splits
        self.random_state = random_state
        self.fitted_base_models = []  # will hold final trained clones of base models

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        oof_preds = np.zeros((n_samples, len(self.base_models)))

        # Out-of-fold predictions
        for i, model in enumerate(self.base_models):
            oof = np.zeros(n_samples)
            for train_idx, val_idx in skf.split(X, y):
                mdl_clone = clone(model)
                mdl_clone.fit(X[train_idx], y[train_idx])
                oof[val_idx] = mdl_clone.predict_proba(X[val_idx])[:, 1]
            oof_preds[:, i] = oof

        # Train meta-model on OOF predictions
        self.meta_model.fit(oof_preds, y)

        # Retrain base models on full dataset
        self.fitted_base_models = [clone(m).fit(X, y) for m in self.base_models]

        return self

    def predict_proba(self, X):
        X = np.asarray(X)
        # Get predictions from fully trained base models
        meta_features = np.column_stack([
            m.predict_proba(X)[:, 1] for m in self.fitted_base_models
        ])
        return self.meta_model.predict_proba(meta_features)

    def predict(self, X):
        return (self.predict_proba(X)[:, 1] > 0.5).astype(int)


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Define base learners
rf = RandomForestClassifier(bootstrap= True, criterion= 'gini', max_depth= None, 
                            max_samples= 0.8, min_samples_leaf= 1, n_estimators= 500, oob_score=True)

xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                    subsample=1, colsample_bytree=1,
                    eval_metric='logloss', use_label_encoder=False, random_state=42)

gb = GradientBoostingClassifier(learning_rate= 0.1, max_depth= 3, n_estimators= 500,
                                 subsample=1, random_state=42)


lgb = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                     subsample=0.8, colsample_bytree=0.8, random_state=42)

svc = SVC(C=2.0, kernel='rbf', probability=True, random_state=42)

base_models = [rf, xgb, gb, lgb, svc]

# Option 1: Logistic Regression as meta learner
meta_log = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=500, random_state=42)

# Option 2: Random Forest as meta learnern
meta_rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)

# Build stacking model (just swap meta model here)
stack_clf = OOFStackingClassifier(base_models=base_models, meta_model=meta_rf, n_splits=5)

# Fit and predict
stack_clf.fit(X_train, y_train)
y_pred = stack_clf.predict(X_test)
y_pred_proba = stack_clf.predict_proba(X_test)[:, 1]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3668, number of negative: 2549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6217, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589995 -> initscore=0.363945
[LightGBM] [Info] Start training from score 0.363945




[LightGBM] [Info] Number of positive: 3668, number of negative: 2549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6217, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589995 -> initscore=0.363945
[LightGBM] [Info] Start training from score 0.363945




[LightGBM] [Info] Number of positive: 3668, number of negative: 2550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589900 -> initscore=0.363553
[LightGBM] [Info] Start training from score 0.363553




[LightGBM] [Info] Number of positive: 3668, number of negative: 2550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589900 -> initscore=0.363553
[LightGBM] [Info] Start training from score 0.363553




[LightGBM] [Info] Number of positive: 3668, number of negative: 2550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 6218, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589900 -> initscore=0.363553
[LightGBM] [Info] Start training from score 0.363553


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 4585, number of negative: 3187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 7772, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.589938 -> initscore=0.363710
[LightGBM] [Info] Start training from score 0.363710




In [33]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.897119341563786