# Enzyme Substrate Classification  

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e18/sample_submission.csv
/kaggle/input/playground-series-s3e18/train.csv
/kaggle/input/playground-series-s3e18/test.csv
/kaggle/input/ec-mixed-class/mixed_fcfp.csv
/kaggle/input/ec-mixed-class/mixed_desc.csv
/kaggle/input/ec-mixed-class/mixed_ecfp.csv


# Loading Libraries

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputClassifier

import os
import warnings
warnings.filterwarnings('ignore')
import math 
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV,RepeatedKFold, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.parallel import Parallel

# Helper Function to visualize feature importance
plt.rcParams['figure.figsize'] = (8, 4)


# Reading Data

In [3]:
train_df = pd.read_csv('/kaggle/input/playground-series-s3e18/train.csv',index_col="id")
test_df = pd.read_csv('/kaggle/input/playground-series-s3e18/test.csv', index_col="id")
submission_df = pd.read_csv('/kaggle/input/playground-series-s3e18/sample_submission.csv')

fcfp_df = pd.read_csv('/kaggle/input/ec-mixed-class/mixed_fcfp.csv')
desc_df = pd.read_csv('/kaggle/input/ec-mixed-class/mixed_desc.csv')
ecfp_df = pd.read_csv('/kaggle/input/ec-mixed-class/mixed_ecfp.csv')

In [4]:
#Check NANs
print("Nulls in train: ",train_df.isna().sum().max())
print("Nulls in test: ",test_df.isna().sum().max())

Nulls in train:  0
Nulls in test:  0


In [5]:
print(train_df.shape)
print(test_df.shape)
print(desc_df.shape)

(14838, 37)
(9893, 31)
(1039, 198)


In [6]:
target = ['EC1','EC2']

In [7]:
#Extract target columns in desc_df

split_values = desc_df['EC1_EC2_EC3_EC4_EC5_EC6'].str.split('_', expand=True)

desc_df['EC1'] = split_values[0].astype(int)
desc_df['EC2'] = split_values[1].astype(int)

In [8]:
# Drop Unnecessary Columns from Train

train_df.drop(['EC3','EC4','EC5','EC6'], axis=1,inplace=True)

In [9]:
#Combine train and desc_df

final_cols = train_df.columns
temp_df = desc_df[final_cols]
train_df=pd.concat([train_df.reset_index(drop=True),temp_df],keys=('train','original'),axis=0)

In [10]:
#Check NULLS
train_df.isna().sum().max()

0

In [11]:
to_consider = train_df.drop(columns = ['EC1','EC2'], axis = 1).columns.tolist()

train_dup = train_df.drop(columns = [ 'EC1','EC2'], axis = 1).drop_duplicates()
print('There are', train_df.shape[0]- train_dup.shape[0], 'rows that are duplicated within train')

test_dup = test_df.drop_duplicates()
print('There are', test_df.shape[0]- test_dup.shape[0], 'rows that are duplicated within test')

duplicates = pd.merge(train_dup, test_dup, on = to_consider)

print('There are', duplicates.shape[0], 'rows that appear in the train and test datasets.\n')

train_df=train_df.drop_duplicates()

There are 108 rows that are duplicated within train
There are 0 rows that are duplicated within test
There are 10 rows that appear in the train and test datasets.



## Outlier Removal

In [12]:
train_df.drop(train_df[train_df.FpDensityMorgan1<-100].index, axis=0,inplace=True)
train_df.drop(train_df[train_df.FpDensityMorgan1<-2.5].index, axis=0,inplace=True)

# Feature Engineering

In [13]:
feat = [c for c in train_df.columns if c not in target]

cat_cols = ['NumHeteroatoms','fr_COO','fr_COO2']

num_cols = [c for c in feat if c not in cat_cols]

In [14]:
def divide_with_check(a,b):
    result = np.where(b != 0, np.divide(a, b), 0)
    return result

def fe(df):
    
    df['Enzyme_Complexity'] = df['Chi1'] + df['Chi2v'] + df['Chi3v'] + df['Chi4n']
    df['Molecular_Weight_Ratio'] = divide_with_check(df['ExactMolWt'] , df['HeavyAtomMolWt'])
    df['EState_VSA_Ratio'] = divide_with_check(df['EState_VSA1'] , df['EState_VSA2'])
    df['Heteroatom_Proportion'] = divide_with_check(df['NumHeteroatoms'] , (df['NumHeteroatoms'] + df['HeavyAtomMolWt']))
    df['frCOO_Average']= (df['fr_COO']+df['fr_COO2'])/2
    
    df['Molecular_Complexity'] = df['BertzCT'] * df['ExactMolWt']
    df['Structural_Flexibility'] = df['Chi1'] * df['Chi2n']
    df['Functional_Specificity'] = df['Chi1n'] * df['Chi3v']
    df['Size_Related_Descriptors'] = df['ExactMolWt'] * df['FpDensityMorgan1']
    df['Topological_Patterns'] = df['FpDensityMorgan2'] * df['FpDensityMorgan3']
    df['Electronic_Structure'] = df['HallKierAlpha'] * df['MaxAbsEStateIndex']
    df['Atom_Weight_and_Charge'] = df['HeavyAtomMolWt'] * df['MinEStateIndex']
    df['Geometrical_Shape'] = df['Kappa3'] * df['NumHeteroatoms']
    df['Molecular_Surface_Properties'] = df['PEOE_VSA10'] * df['PEOE_VSA14']
    
    df['Chemical_Diversity'] = divide_with_check(df['EState_VSA1'], df['NumHeteroatoms'])
    df['Functional_Group_Diversity'] = df['PEOE_VSA6'] * df['PEOE_VSA7']
    df['Molecular_Size'] = df['ExactMolWt'] + df['HeavyAtomMolWt']
    df['Electronegativity_Difference'] = df['MaxAbsEStateIndex'] - df['MinEStateIndex']
    df['Ring_Density'] = divide_with_check(df['NumHeteroatoms'], df['SlogP_VSA3'])
    df['Steric_Effects'] = divide_with_check(df['SMR_VSA5'], df['SMR_VSA10'])
    df['Hydrophilic_Surface'] = df['VSA_EState9'] * df['EState_VSA2']
    df['Molecular_Polarity'] = divide_with_check(df['PEOE_VSA8'], df['EState_VSA1'])
    
    df['Ring_System_Diversity'] = divide_with_check(df['fr_COO'], df['fr_COO2'])
    df['Molecular_Flexibility'] = df['Chi2n'] * df['Chi3v']
    df['Electrostatic_Potential'] = df['EState_VSA1'] - df['EState_VSA2']
    df['Hydrophobicity_Index'] = divide_with_check(df['SMR_VSA10'] , df['SMR_VSA5'])
    df['Molecular_Conformation'] = df['Chi1'] + df['Chi4n'] - df['Chi2n']
    df['Functional_Group_Connectivity'] = df['PEOE_VSA14'] * df['PEOE_VSA10']
    df['Steric_Bulkiness'] = df['PEOE_VSA7'] - df['PEOE_VSA6']
    df['Aromaticity'] = df['FpDensityMorgan1'] + df['FpDensityMorgan2'] + df['FpDensityMorgan3']
    df['Hydrogen_Bonding_Potential'] = df['EState_VSA1'] * df['NumHeteroatoms']
    df['Molecular_Polarizability'] = divide_with_check(df['HallKierAlpha'] , df['Chi2v'])

fe(train_df)
fe(test_df)

In [15]:
train_df.columns

Index(['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n',
       'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2', 'EC1', 'EC2', 'Enzyme_Complexity',
       'Molecular_Weight_Ratio', 'EState_VSA_Ratio', 'Heteroatom_Proportion',
       'frCOO_Average', 'Molecular_Complexity', 'Structural_Flexibility',
       'Functional_Specificity', 'Size_Related_Descriptors',
       'Topological_Patterns', 'Electronic_Structure',
       'Atom_Weight_and_Charge', 'Geometrical_Shape',
       'Molecular_Surface_Properties', 'Chemical_Diversity',
       'Functional_Group_Diversity', 'Molecular_Size',
       'Electronegativity_Difference', 'Ring_Density', 'S

In [16]:
pca = PCA(n_components=3)
train_pca_features = pca.fit_transform(train_df[num_cols+cat_cols])
test_pca_features = pca.transform(test_df[num_cols+cat_cols])

# Create new column names for the PCA features
pca_columns = ['PCA_{}'.format(i+1) for i in range(3)]

# Create a new DataFrame with the original features and PCA features
train_df = pd.concat([train_df.reset_index(drop=True), pd.DataFrame(train_pca_features, columns=pca_columns)], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), pd.DataFrame(test_pca_features, columns=pca_columns)], axis=1)

In [17]:
def add_poly_feat(df):
    
    for feat in num_cols:
        df[f'{feat}Squared'] = df[feat] ** 2
        df[feat] = df[feat].apply(lambda x: math.log(x) if x > 0 else 0)
add_poly_feat(train_df)
add_poly_feat(test_df)

In [18]:
def generate_features(df):

    for c in cat_cols + num_cols:
        
        df[f'count_{c}'] = df.groupby(c)[c].transform('count')
        
    for c in cat_cols:
        for n in num_cols:
            df[f'mean_{n}_per_{c}'] = df.groupby(c)[n].transform('mean')
            df[f'sum_{n}_per_{c}'] = df.groupby(c)[n].transform('sum')
            
    return df

In [19]:
print(train_df.shape)
print(test_df.shape)

(15802, 96)
(9893, 94)


In [20]:
train_df.columns

Index(['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n',
       'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2', 'EC1', 'EC2', 'Enzyme_Complexity',
       'Molecular_Weight_Ratio', 'EState_VSA_Ratio', 'Heteroatom_Proportion',
       'frCOO_Average', 'Molecular_Complexity', 'Structural_Flexibility',
       'Functional_Specificity', 'Size_Related_Descriptors',
       'Topological_Patterns', 'Electronic_Structure',
       'Atom_Weight_and_Charge', 'Geometrical_Shape',
       'Molecular_Surface_Properties', 'Chemical_Diversity',
       'Functional_Group_Diversity', 'Molecular_Size',
       'Electronegativity_Difference', 'Ring_Density', 'S

In [21]:
test_df.columns

Index(['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n',
       'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
       'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
       'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
       'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
       'fr_COO', 'fr_COO2', 'Enzyme_Complexity', 'Molecular_Weight_Ratio',
       'EState_VSA_Ratio', 'Heteroatom_Proportion', 'frCOO_Average',
       'Molecular_Complexity', 'Structural_Flexibility',
       'Functional_Specificity', 'Size_Related_Descriptors',
       'Topological_Patterns', 'Electronic_Structure',
       'Atom_Weight_and_Charge', 'Geometrical_Shape',
       'Molecular_Surface_Properties', 'Chemical_Diversity',
       'Functional_Group_Diversity', 'Molecular_Size',
       'Electronegativity_Difference', 'Ring_Density', 'Steric_Effects'

# Data Preprocessing

In [22]:
X_train = train_df.drop(['EC1','EC2'],axis=1)
y_train_ec1 = train_df['EC1']
y_train_ec2 = train_df['EC2']
X_test = test_df

In [23]:
X_train.isna().sum().max()

0

In [24]:
X_test.isna().sum().max()

0

# Baseline

In [25]:
class Ensemble():
    def __init__(self):
        
        self.brf_params_ec1= {'n_estimators': 891, 
                              'max_depth': 15, 
                              'min_samples_split': 3, 
                              'min_samples_leaf': 8, 
                              'max_features': 'auto'}

        self.brf_params_ec2= {'n_estimators': 841,
                              'max_depth': 8,
                              'min_samples_split': 4,
                              'min_samples_leaf': 2,
                              'max_features': 'auto'}
            
        self.gb_params_ec1={'n_estimators': 505, 
                            'learning_rate': 0.010175876138186907,
                            'max_depth': 4, 
                            'min_samples_split': 3,
                            'min_samples_leaf': 3,
                            'max_features': 'sqrt'}

        self.gb_params_ec2 = {'subsample': 0.5,
                             'n_estimators': 600,
                             'max_leaf_nodes': 50,
                             'max_features': 'log2',
                             'learning_rate': 0.01}
        
        self.ada_params_ec1= {'n_estimators': 400, 
                             'learning_rate': 0.1}

        self.ada_params_ec2 = {'n_estimators': 400,
                              'learning_rate': 0.1}
            
        self.etc_params_ec1 =  {'n_estimators': 504,
                                'max_depth': 7,
                                'min_samples_split': 5,
                                'min_samples_leaf': 9, 
                                'max_features': 'auto'}

        self.etc_params_ec2 = {'n_estimators': 542, 
                               'max_depth': 3,
                               'min_samples_split': 10,
                               'min_samples_leaf': 2,
                               'max_features': 'sqrt'}    
        
        self.xgb_params_ec1 = {'n_estimators': 243, 
                               'max_depth': 5,
                               'min_samples_split': 9,
                               'min_samples_leaf': 7,
                               'subsample': 0.6134380629818089,
                               'max_features': 'log2',
                               'learning_rate': 0.012865193665403818,
                               'gamma': 0.09709595444007102, 
                               'colsample_bytree': 0.6385771736797623,
                               'reg_alpha': 0.754461833024866,
                               'reg_lambda': 0.9215567020261451
                                 }

        self.xgb_params_ec2 = {'n_estimators': 596, 
                               'max_depth': 3, 
                               'min_samples_split': 6,
                               'min_samples_leaf': 5, 
                               'subsample': 0.5770767952635681,
                               'max_features': None,
                               'learning_rate': 0.010211972762251683, 
                               'gamma': 0.6089697084387038, 
                               'colsample_bytree': 0.536169354835996, 
                               'reg_alpha': 0.8054294448608303, 
                               'reg_lambda': 0.835494772934265}

        self.lgbm_params_ec1 = { 'boosting_type': 'goss',
                                'num_leaves': 29, 
                                'learning_rate': 0.02901454142546342, 
                                'subsample': 0.9518618304221032, 
                                'colsample_bytree': 0.5874366889191849, 
                                'reg_alpha': 0.9402007367524896, 
                                'reg_lambda': 0.4597320144003674,
                                'min_child_samples': 8,
                                'cat_smooth': 1.7988340131970062,
                                'max_depth': 3
                 }

        self.lgbm_params_ec2 ={'boosting_type': 'gbdt', 
                               'num_leaves': 59, 
                               'learning_rate': 0.019864360680261133, 
                               'subsample': 0.7367262805335248,
                               'colsample_bytree': 0.7910442599840853, 
                               'reg_alpha': 0.3219393587141358, 
                               'reg_lambda': 0.732886517682904,
                               'min_child_samples': 7,
                               'cat_smooth': 0.9310654882173364,
                               'max_depth': 4
        }

        self.model_ec1 =[
                    ('xgb',xgb.XGBClassifier(**self.xgb_params_ec1, random_state=42, eval_metric="auc")),
                    ('brf', BalancedRandomForestClassifier(**self.brf_params_ec1,random_state=42)),
                    ('gb',GradientBoostingClassifier(**self.gb_params_ec1,random_state=42)),
                    #('lgbm',lgb.LGBMClassifier(**self.lgbm_params_ec1,random_state=42,metric='auc')),
                    #('etc',ExtraTreesClassifier(**self.etc_params_ec1,random_state=42)),
                        ]
        
        self.model_ec2=[
                    ('xgb',xgb.XGBClassifier(**self.xgb_params_ec2, random_state=42, eval_metric="auc")),
                    ('brf', BalancedRandomForestClassifier(**self.brf_params_ec2,random_state=42)),
                    ('gb',GradientBoostingClassifier(**self.gb_params_ec2,random_state=42)),
                    #('etc',ExtraTreesClassifier(**self.etc_params_ec2,random_state=42)),
                    ('lgbm',lgb.LGBMClassifier(**self.lgbm_params_ec2,random_state=42,metric='auc')),
                        ]
        
        self.model_table = pd.DataFrame(columns = ['Model Name', 'ROC AUC'])
        
        self.scaler = StandardScaler()
        
    def fit(self,X,y,target):
        feat = [c for c in X.columns if c not in target]
        self.cat_cols = cat_cols
        self.num_cols = [c for c in feat if c not in cat_cols]
        
        #X[self.num_cols] = self.scaler.fit_transform(X[self.num_cols])
        if(target == 'EC1'):
            for name,classifier in self.model_ec1:
                classifier.fit(X, y)
        else:
            for name,classifier in self.model_ec2:
                classifier.fit(X, y)

    def predict_proba(self,train,val,target):
        #train[self.num_cols] = self.scaler.transform(train[self.num_cols])
        #val[self.num_cols] = self.scaler.transform(val[self.num_cols])
        
        train_preds={}
        test_preds={}
        
        if(target == 'EC1'):
            for name,classifier in self.model_ec1:
                train_preds[name]=classifier.predict_proba(train)[:,1]
                test_preds[name]=classifier.predict_proba(val)[:,1]
        else:  
            for name,classifier in self.model_ec2:
                train_preds[name]=classifier.predict_proba(train)[:,1]
                test_preds[name]=classifier.predict_proba(val)[:,1]
            
        return train_preds,test_preds

In [26]:
def rfecv(X,y):

    print("Starting RFECV")
    # Create the classifier or model
    model = BalancedRandomForestClassifier(random_state=42)
    
    # Create the RFECV object with the desired model and scoring metric
    rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(4), scoring='roc_auc', 
                  min_features_to_select=50, verbose=3)

    # Fit the RFECV object to your data
    rfecv.fit(X, y)

    # Print the selected features
    selected_features = X.columns[rfecv.support_]
    print("Selected Features:")
    print(selected_features)

    unnecessary_features = [item for item in X.columns if item not in selected_features]
    print(unnecessary_features)
    return selected_features,unnecessary_features

In [27]:
def model_training(X,y,test,target):
    n_folds=10
    ensemble = Ensemble()
    model_list =[]
    # Get keys from ensemble
    if(target == 'EC1'):
        model_list = ensemble.model_ec1
    else:
        model_list = ensemble.model_ec2
    
    models = [name for name, models in model_list]
    
    # Set rows = models and columns = folds
    scores = {key: 0 for key in models}
    train_scores = {key: 0 for key in models}
    oof_preds = {key: np.zeros(len(X)) for key in models} 
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    for fold,(train_idx, valid_idx) in enumerate(skf.split(X,y)):
        print(f"FOLD {fold}")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        X_train = generate_features(X_train)
        X_val = generate_features(X_val)
        
        ensemble.fit(X_train, y_train,target)
        
        train_preds,val_preds = ensemble.predict_proba(X_train,X_val,target)
        keys = train_preds.keys()
        
        for model in keys:
            train_pred = train_preds[model]
            train_auc = roc_auc_score(y_train, train_pred)
            print(f"ROC SCORE FOR TRAIN : {model} is {train_auc}")

            val_pred = val_preds[model]
            val_auc = roc_auc_score(y_val, val_pred)
            print(f"ROC SCORE FOR VAL : {model} is {val_auc}")
            
            train_scores[model]+=train_auc/n_folds
            scores[model]+=val_auc/n_folds
            oof_preds[model][valid_idx] += val_pred
        
    for model in models:
        oof_preds[model] /= n_folds
        
    #Voting Ensemble
    
    # Convert oof_preds to numpy arrays
    oof_preds_arr = np.array(list(oof_preds.values())).T
    
    #Calculate weights for voting classifier
    weights = LogisticRegression(random_state = 42).fit(oof_preds_arr, y).coef_[0]
    weights_df = pd.DataFrame(weights, index=models, columns=['weight per model'])
    voting_clf = VotingClassifier(model_list, weights=weights, voting='soft')
    pipe = Pipeline([
            #('scaler',StandardScaler()),
            ('classifier',voting_clf) 
        ])
    
    print('='*30)
    #Scoring
    scores['Voting'] = 0 
    train_scores['Voting'] = 0 
    for fold,(train_idx, valid_idx) in enumerate(skf.split(X,y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]
        
        X_train = generate_features(X_train)
        X_val = generate_features(X_val)
        
        pipe.fit(X_train,y_train)
        train_preds = pipe.predict_proba(X_train)[:,1]
        val_preds = pipe.predict_proba(X_val)[:,1]

        train_auc = roc_auc_score(y_train, train_preds)
        val_auc = roc_auc_score(y_val, val_preds)
        
        print(f"ROC SCORE FOR TRAIN : Voting is {train_auc}")
        print(f"ROC SCORE FOR VAL : Voting is {val_auc}")
        
        train_scores['Voting']+=train_auc/n_folds
        scores['Voting']+=val_auc/n_folds
        
    print('='*30)
    #Final Test Prediction
    X = generate_features(X)
    test = generate_features(test)
    
    test_preds = pipe.fit(X,y).predict_proba(test)[:,1]
    
    return train_scores,scores, test_preds

In [28]:
scores_ec1 ={}
train_scores_ec1 ={}
test_preds_ec1 =[]
train_scores_ec1,scores_ec1, test_preds_ec1 = model_training(X_train,y_train_ec1,X_test,'EC1')

FOLD 0
Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

ROC SCORE FOR TRAIN : xgb is 0.7870389092300125
ROC SCORE FOR VAL : xgb is 0.7261890215414692
ROC SCORE FOR TRAIN : brf is 0.8947179018623348
ROC SCORE FOR VAL : brf is 0.7211899918779245
ROC SCORE FOR TRAIN : gb is 0.7643541097141121
ROC SCORE FOR VAL : gb is 0.7185646926908507
FOLD 1
Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

ROC SCORE FOR TRAIN : xgb is 0.7878292363037298
ROC SCORE FOR VAL : xgb is 0.7157300523981686
ROC SCORE FOR TRAIN : brf is 0.8956862230243157
ROC SCORE FOR VAL : brf is 0.7188288398369834
ROC SCORE FOR TRAIN : gb is 0.7650383956295981
ROC SCORE FOR VAL : gb is 0.7183266008754591
FOLD 2
Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

ROC SCORE FOR TRAIN : xgb is 0.7872181691040345
ROC SCORE FOR VAL : xgb is 0.7081172946191957
ROC SCORE FOR TRAIN : brf is 0.893426841374032
ROC SCORE FOR VAL

In [29]:
scores_ec1_df = pd.DataFrame(scores_ec1.items(), columns=['Model Name', 'Average Score'])

display(scores_ec1_df.sort_values(by='Average Score', ascending=False).style.background_gradient(cmap='summer_r'))

Unnamed: 0,Model Name,Average Score
1,brf,0.709818
3,Voting,0.709565
2,gb,0.708439
0,xgb,0.705271


## EC2

In [30]:
scores_ec2 ={}
train_scores_ec2 = {}
test_preds_ec2 =[]
train_scores_ec2,scores_ec2, test_preds_ec2 = model_training(X_train,y_train_ec2,X_test,'EC2')

FOLD 0
Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

ROC SCORE FOR TRAIN : xgb is 0.6964392472459346
ROC SCORE FOR VAL : xgb is 0.5932262621390285
ROC SCORE FOR TRAIN : brf is 0.7781431076970895
ROC SCORE FOR VAL : brf is 0.5936717084959664
ROC SCORE FOR TRAIN : gb is 0.665700978577852
ROC SCORE FOR VAL : gb is 0.6033041783360487
ROC SCORE FOR TRAIN : lgbm is 0.6842981698766384
ROC SCORE FOR VAL : lgbm is 0.6018497098446106
FOLD 1
Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

ROC SCORE FOR TRAIN : xgb is 0.6948133427449388
ROC SCORE FOR VAL : xgb is 0.5899813946162517
ROC SCORE FOR TRAIN : brf is 0.7759748295312333
ROC SCORE FOR VAL : brf is 0.5892184754413856
ROC SCORE FOR TRAIN : gb is 0.6675507635169924
ROC SCORE FOR VAL : gb is 0.5748952831906756
ROC SCORE FOR TRAIN : lgbm is 0.6819528918941686
ROC SCORE FOR VAL : lgbm is 0.5776885517825237
FOLD 2
Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

ROC SCORE FOR T

In [31]:
scores_ec2_df = pd.DataFrame(scores_ec2.items(), columns=['Model Name', 'Average Score'])

display(scores_ec2_df.sort_values(by='Average Score', ascending=False).style.background_gradient(cmap='summer_r'))

Unnamed: 0,Model Name,Average Score
4,Voting,0.595365
0,xgb,0.59429
1,brf,0.592164
3,lgbm,0.588628
2,gb,0.586106


# Submission

In [32]:
#Average Score

print("Average CV Score:",(scores_ec1['Voting'] + scores_ec2['Voting'])/2)

Average CV Score: 0.6524654034932933


In [33]:
print("Average Train CV Score:",(train_scores_ec1['Voting'] + train_scores_ec2['Voting'])/2)

Average Train CV Score: 0.7788523527893036


In [34]:
submission_df['EC1'] = test_preds_ec1
submission_df['EC2'] = test_preds_ec2
submission_df.to_csv('submission.csv',index=False)
submission_df

Unnamed: 0,id,EC1,EC2
0,14838,0.395151,0.678146
1,14839,0.762014,0.733241
2,14840,0.721508,0.663310
3,14841,0.639022,0.736650
4,14842,0.618447,0.669624
...,...,...,...
9888,24726,0.541766,0.677754
9889,24727,0.721040,0.747912
9890,24728,0.297667,0.760970
9891,24729,0.404276,0.757510
