In [1]:
import time as time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from catboost import CatBoostClassifier
from category_encoders import CatBoostEncoder, JamesSteinEncoder, WOEEncoder
from feature_engine.outliers import Winsorizer
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, LabelEncoder
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [2]:
# Read the three input files from the working directory.
# `sample` is read from 'submission.csv' (presumably the submission template —
# it is not referenced again in the visible cells).
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission.csv')
)

In [3]:
# Drop 'Payment Plan' and 'Accounts Delinquent' from both splits.
train = train.drop(columns=['Payment Plan', 'Accounts Delinquent'])
test = test.drop(columns=['Payment Plan', 'Accounts Delinquent'])

In [4]:
# Stack train on top of test so every transformation below is applied to both.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows repeat the train row labels, and any later label-aligned
# assignment or reindex on `df` can raise or misalign rows.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [5]:
# Replace the raw column headers with snake_case identifiers.
# NOTE: 'home_owener' and 'revolving_uitilities' are misspelt, but later cells
# refer to these exact names, so they are kept as they are.
df = df.rename(
    columns={
        'ID': 'id',
        'Loan Amount': 'loan_amount',
        'Funded Amount': 'funded_amount',
        'Funded Amount Investor': 'funded_amount_investor',
        'Term': 'term',
        'Batch Enrolled': 'batch_enrolled',
        'Interest Rate': 'interest_rate',
        'Grade': 'grade',
        'Sub Grade': 'sub_grade',
        'Employment Duration': 'emp_duration',
        'Home Ownership': 'home_owener',
        'Verification Status': 'verification_status',
        'Loan Title': 'loan_title',
        'Debit to Income': 'debit_to_income',
        'Delinquency - two years': 'delinquency_two_years',
        'Inquires - six months': 'inquires_six_months',
        'Open Account': 'open_account',
        'Public Record': 'public_record',
        'Revolving Balance': 'revolving_balance',
        'Revolving Utilities': 'revolving_uitilities',
        'Total Accounts': 'total_accounts',
        'Initial List Status': 'initial_list_status',
        'Total Received Interest': 'total_interest_received',
        'Total Received Late Fee': 'total_late_fee_received',
        'Recoveries': 'recoveries',
        'Collection Recovery Fee': 'collection_recovery_fee',
        'Collection 12 months Medical': 'collection_12_months_medical',
        'Application Type': 'application_type',
        'Last week Pay': 'last_week_pay',
        'Total Collection Amount': 'total_amount_collected',
        'Total Current Balance': 'total_current_balance',
        'Total Revolving Credit Limit': 'total_revolving_credit_limit',
    }
)

In [6]:
# Cap the right tail of six numeric columns with feature_engine's Winsorizer
# (gaussian capping, fold=1.5). The capper is fitted on the combined
# train+test frame and then applied to that same frame.
win = Winsorizer(
    capping_method='gaussian',
    tail='right',
    fold=1.5,
    variables=[
        'funded_amount_investor',
        'interest_rate',
        'home_owener',
        'total_interest_received',
        'total_current_balance',
        'total_revolving_credit_limit',
    ],
)
df = win.fit(df).transform(df)

In [None]:
# Rank every feature pair by absolute correlation and show the rows whose
# first feature is the target.
# Only numeric columns are correlated: `df` still holds string columns at this
# point and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped them, which select_dtypes reproduces).
df_all_corr = (
    df.select_dtypes(include='number')
      .corr()
      .abs()
      .unstack()
      .sort_values(kind="quicksort", ascending=False)
      .reset_index()
      .rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'})
)
df_all_corr[df_all_corr['Feature 1'] == 'Loan Status']

In [None]:
#df.grade= df.grade.map(df.grade.value_counts(normalize=True))

In [7]:
# For each row, count the non-null ids that share its value of the grouping
# column (i.e. the size of the group the row belongs to).
for new_col, group_key in (
    ('no_of_people_in_batch', 'batch_enrolled'),
    ('no_of_people_varified', 'verification_status'),
    ('no_of_open_accounts_by_users', 'open_account'),
):
    df[new_col] = df.groupby([group_key])['id'].transform('count')

In [8]:
# Arithmetic features derived from the raw columns, plus 0/1 encodings of the
# two binary string columns. Every expression reads only columns that existed
# before this cell, so they can all be built in a single assign().
df = df.assign(
    actual_fund=df['funded_amount_investor'] - df['funded_amount'],
    term_yearly=df['term'] / 12,
    loan_in_hand=df['loan_amount'] - df['funded_amount'],
    interest_to_be_paid=(df['funded_amount'] * df['interest_rate']) / 100,
    interest_to_be_paid_monthly=(df['funded_amount'] * df['interest_rate']) / (100 * df['term']),
    delinquency_in_days=df['delinquency_two_years'] * 30,
    inquires_in_two_years=df['inquires_six_months'] * 4,
    initial_list_status=df['initial_list_status'].map({'w': 0, 'f': 1}),
    received=df['total_interest_received'] + df['total_late_fee_received'],
    revolving_amount_remain_on_card=df['total_revolving_credit_limit'] - df['revolving_balance'],
    application_type=df['application_type'].map({'INDIVIDUAL': 0, 'JOINT': 1}),
)

In [9]:
# Number of ids per grade, per sub-grade and per (grade, sub-grade) pair,
# broadcast back onto every row of the group.
for new_col, group_keys in (
    ('graded_houses', ['grade']),
    ('sub_graded_houses', ['sub_grade']),
    ('good_graded_houses', ['grade', 'sub_grade']),
):
    df[new_col] = df.groupby(group_keys)['id'].transform('count')

In [10]:
# Ordinal-encode the grades: 'A'..'G' -> 0..6 and 'A1'..'G5' -> 0..34.
# Any value outside these mappings becomes NaN (Series.map behaviour).
df['grade'] = df['grade'].map(
    {letter: rank for rank, letter in enumerate('ABCDEFG')}
)
df['sub_grade'] = df['sub_grade'].map(
    {
        f'{letter}{step}': 5 * rank + (step - 1)
        for rank, letter in enumerate('ABCDEFG')
        for step in range(1, 6)
    }
)

In [11]:
# Frequency-encode batch_enrolled: each value is replaced by its share of rows.
batch_share = df['batch_enrolled'].value_counts(normalize=True)
df['batch_enrolled'] = df['batch_enrolled'].map(batch_share)

In [12]:
# Loan term in whole years: term divided by 12, rounded to the nearest integer.
# This overwrites the unrounded term_yearly computed in an earlier cell.
df['term_yearly'] = (df['term'] / 12).round()

In [13]:
# loan_amount * interest_rate spread over the term and over term_yearly.
# NOTE(review): interest_rate is not divided by 100 here, unlike
# interest_to_be_paid above — confirm this scale is intended.
loan_times_rate = df['loan_amount'] * df['interest_rate']
df['pay_monthly'] = loan_times_rate / df['term']
df['pay_yearly'] = loan_times_rate / df['term_yearly']

In [14]:
# Same two ratios as pay_monthly / pay_yearly, but based on
# funded_amount_investor instead of loan_amount.
funded_times_rate = df['funded_amount_investor'] * df['interest_rate']
df['pay_monthly_if_funded'] = funded_times_rate / df['term']
df['pay_yearly_if_funded'] = funded_times_rate / df['term_yearly']

In [15]:
# Frequency-encode term: each value is replaced by its share of rows.
term_share = df['term'].value_counts(normalize=True)
df['term'] = df['term'].map(term_share)

In [16]:
# Per-emp_duration standard deviation, mean and median of home_owener,
# broadcast back onto every row of the group.
home_by_emp = df.groupby(['emp_duration'])['home_owener']
df['emp_by_home_std'] = home_by_emp.transform('std')
df['emp_by_home_mean'] = home_by_emp.transform('mean')
df['emp_by_home_med'] = home_by_emp.transform('median')

In [17]:
# Frequency-encode emp_duration: each value is replaced by its share of rows.
emp_duration_share = df['emp_duration'].value_counts(normalize=True)
df['emp_duration'] = df['emp_duration'].map(emp_duration_share)

In [18]:
# Frequency-encode verification_status: each value is replaced by its share of rows.
verification_share = df['verification_status'].value_counts(normalize=True)
df['verification_status'] = df['verification_status'].map(verification_share)

In [19]:
#df.loan_title= df.loan_amount.map(df.loan_title.value_counts(normalize=True))

In [20]:
# Frequency-encode initial_list_status (already 0/1 at this point): each value
# is replaced by its share of rows.
list_status_share = df['initial_list_status'].value_counts(normalize=True)
df['initial_list_status'] = df['initial_list_status'].map(list_status_share)

In [21]:
# Frequency-encode application_type (already 0/1 at this point): each value is
# replaced by its share of rows.
application_type_share = df['application_type'].value_counts(normalize=True)
df['application_type'] = df['application_type'].map(application_type_share)

In [None]:
# Preview the first rows of the combined frame after the encoding cells above.
df.head()

In [None]:
# Sanity check: smallest total_interest_received in the combined frame.
df.total_interest_received.min()

In [None]:
# Sanity check: smallest revolving_uitilities in the combined frame.
df.revolving_uitilities.min()

In [22]:
# Split the combined frame back into its two parts by position. `df` was built
# as train followed by test, so the first rows are the training rows.
# 67463 is presumably the row count of train.csv — TODO confirm.
n_train_rows = 67463
train = df.iloc[:n_train_rows]
test = df.iloc[n_train_rows:]

In [None]:
# List every column of `train` whose dtype is not object.
[name for name, dtype in train.dtypes.items() if dtype != 'O']

In [23]:
# Columns fed to every model below (50 in total, order matters for the
# array-based models).
# NOTE(review): 'id' is included as a model feature — confirm that is intended.
cont_cols = [
    # renamed raw columns
    'id', 'loan_amount', 'funded_amount', 'funded_amount_investor', 'term',
    'batch_enrolled', 'interest_rate', 'grade', 'sub_grade', 'emp_duration',
    'home_owener', 'verification_status', 'debit_to_income',
    'delinquency_two_years', 'inquires_six_months', 'open_account',
    'public_record', 'revolving_balance', 'revolving_uitilities',
    'total_accounts', 'initial_list_status', 'total_interest_received',
    'total_late_fee_received', 'recoveries', 'collection_recovery_fee',
    'collection_12_months_medical', 'application_type', 'last_week_pay',
    'total_amount_collected', 'total_current_balance',
    'total_revolving_credit_limit',
    # engineered columns
    'actual_fund', 'term_yearly', 'loan_in_hand', 'interest_to_be_paid',
    'interest_to_be_paid_monthly', 'delinquency_in_days',
    'inquires_in_two_years', 'received', 'revolving_amount_remain_on_card',
    'graded_houses', 'sub_graded_houses', 'good_graded_houses',
    'pay_monthly', 'pay_yearly', 'pay_monthly_if_funded',
    'pay_yearly_if_funded', 'emp_by_home_std', 'emp_by_home_mean',
    'emp_by_home_med',
]

In [61]:
# Feature matrices for both splits and the training target.
X_train = train.loc[:, cont_cols]
X_test = test.loc[:, cont_cols]
y_train = train.loc[:, 'Loan Status']

In [None]:
# Hyperopt search space consumed by xgb_search.
# `hp` and `scope` are hyperopt names; they are not imported in the visible
# cells, so they must be brought into scope before this cell runs.
xgb_search_params = {
    # integer-valued parameters (quantised to steps of 1, then cast to int)
    'max_depth': scope.int(hp.quniform('max_depth', 3, 24, 1)),
    'learning_rate': hp.uniform('learning_rate', 1e-7, 0.1),
    'gamma': hp.uniform('gamma', 1, 10),
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 2, 700, 1)),
    # column / row sampling fractions
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.9),
    'subsample': hp.uniform('subsample', 0.001, 1.0),
    # L2 / L1 regularisation strengths
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'reg_alpha': hp.uniform('reg_alpha', 1e-5, 0.5),
}

def xgb_search(params):
    """Hyperopt objective: validation log loss of an XGBoost classifier.

    Splits the global X_train / y_train 75/25 (stratified, fixed seed), fits
    an XGBClassifier built from `params` with early stopping on the held-out
    part, and scores the held-out probabilities with log loss.

    Args:
        params: keyword arguments for XGBClassifier, sampled by hyperopt.

    Returns:
        dict with the log loss under "loss" and STATUS_OK under "status".
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
    )

    clf = XGBClassifier(
        **params,
        random_state=10,
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=100,
    )
    # The same held-out part drives early stopping and the reported score.
    clf.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=30,
        verbose=False,
    )

    # Last column of predict_proba = probability of the positive class.
    val_logloss = log_loss(y_val, clf.predict_proba(X_val)[:, -1])
    return {"loss": val_logloss, "status": STATUS_OK}

In [None]:
# Run 500 TPE evaluations of xgb_search over xgb_search_params; every
# evaluation is recorded in `trials`.
trials = Trials()

best_hyperparams_xgb = fmin(
    xgb_search,
    xgb_search_params,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
)

In [None]:
# Show the best parameter set returned by fmin.
best_hyperparams_xgb

In [None]:
# Fixed XGBoost parameters; the keys match the search space of objective_xgb
# (presumably pasted from a finished study — TODO confirm).
xgb_param = {
    "booster": "dart",
    "lambda": 1.7800060362493527e-08,
    "alpha": 2.315582418892958e-07,
    "max_depth": 2,
    "eta": 0.0808503042845254,
    "gamma": 0.2202760015065291,
    "grow_policy": "depthwise",
    # dart-only settings
    "sample_type": "uniform",
    "normalize_type": "forest",
    "rate_drop": 5.627374583340628e-07,
    "skip_drop": 1.5691739428897544e-06,
}

# Fixed LightGBM parameters used by the cross-validation loop below; the keys
# match the search space of `objective`.
lgbm_param = {
    "reg_lambda": 0.05990285576078542,
    "reg_alpha": 0.002761887597431145,
    "colsample_bytree": 0.8,
    "subsample": 0.4,
    "learning_rate": 0.03959160086546493,
    "max_depth": 4,
    "min_child_samples": 54,
    "num_leaves": 58,
}

In [None]:
# Stratified K-fold cross-validation of LightGBM with the fixed lgbm_param.
# Each fold's model is scored on its held-out part (log loss) and its
# test-set probabilities are averaged into `preds`.
subbed = []                       # held-out log loss of each fold
n_folds = 10                      # single source of truth for the fold count
preds = np.zeros(len(X_test))     # fold-averaged positive-class probability
f = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=10)
for fold, (train_idx, test_idx) in enumerate(f.split(X_train, y_train)):
    print('=============== Fold No:',fold+1,'===============')
    X_tr, X_tst = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_tst = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model = LGBMClassifier(metric='binary_logloss', n_estimators=5000, random_state=10, **lgbm_param)
    # NOTE(review): passing early_stopping_rounds / verbose to fit() depends on
    # the installed LightGBM version — confirm against the pinned release.
    model.fit(X_tr, y_tr, eval_set=[(X_tst, y_tst)], early_stopping_rounds=500, verbose=500)

    # Score the fold once and reuse the value for both the log and the summary.
    fold_loss = log_loss(y_tst, model.predict_proba(X_tst)[:, 1])
    print(fold_loss)
    subbed.append(fold_loss)

    preds += model.predict_proba(X_test)[:, 1] / n_folds

# The metric is log loss on the held-out folds (the old label said "RMSE train").
print('Mean CV log loss: ', np.mean(subbed))

In [None]:
# build a feature-importance table, most important feature first
def imp_df(column_names, importances):
    """Return a DataFrame of features ranked by importance.

    Args:
        column_names: feature names, one per importance value.
        importances: importance values aligned with `column_names`.

    Returns:
        DataFrame with columns 'feature' and 'feature_importance', sorted by
        'feature_importance' in descending order with a fresh 0..n-1 index.
    """
    table = pd.DataFrame({'feature': column_names,
                          'feature_importance': importances})
    ranked = table.sort_values('feature_importance', ascending=False)
    return ranked.reset_index(drop=True)

# plotting a feature importance dataframe (horizontal barchart)
def var_imp_plot(imp_df):
    """Draw a horizontal bar chart of feature importances.

    Args:
        imp_df: two-column DataFrame; the first column holds feature names and
            the second their importance values (column labels are ignored).

    The input frame is left untouched: the columns are relabelled on a copy,
    so the caller's DataFrame keeps its original column names.
    """
    plot_df = imp_df.copy()
    plot_df.columns = ['feature', 'feature_importance']
    plt.figure(figsize=(15,10))
    sns.barplot(x = 'feature_importance', y = 'feature', data = plot_df, orient = 'h', color = 'royalblue')

In [None]:
from rfpimp import permutation_importances

def r2(rf, X_train, y_train):
    """Metric callback for permutation_importances: log loss of `rf`.

    Despite its name this returns log loss, not R^2.

    Args:
        rf: fitted classifier exposing predict_proba.
        X_train: feature matrix to score.
        y_train: true labels for `X_train`.

    Returns:
        Log loss of the positive-class probabilities predicted by `rf`.
    """
    # Score the estimator that was passed in; the previous body read the
    # global `model` and silently ignored `rf`.
    # NOTE(review): log loss is lower-is-better; confirm the sign convention
    # permutation_importances expects from its metric.
    return log_loss(y_train, rf.predict_proba(X_train)[:,-1])

# Permutation importances of the most recently fitted `model` on the training
# data, scored with r2; the index is moved into a regular column.
perm_imp_rfpimp = (
    permutation_importances(model, X_train, y_train, r2)
    .reset_index(drop=False)
)


In [None]:
# Bar chart of the permutation importances computed above.
var_imp_plot(perm_imp_rfpimp)

In [65]:
# Write the fold-averaged probabilities as a single-column submission file.
sub = pd.DataFrame({'Loan Status': preds})
sub.to_csv('my_submission14.csv', index=False)

In [None]:
import xgboost as xgb
def objective_xgb(trial, data = X_train, target = y_train):
    """Optuna objective: validation log loss of an XGBoost classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: feature matrix to split into fit / validation parts.
        target: labels aligned with `data`.

    Returns:
        Log loss of the positive-class probabilities on the validation part.
    """
    # Stratify on the labels that were actually passed in (not the global
    # y_train) and fix the seed so every trial is scored on the same split,
    # as objective_log does.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.25, random_state=10, stratify=target)

    param = {
        "verbosity": 0,
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True)}

    # Tree parameters apply to both boosters offered above.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    # Dropout-related parameters only exist for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    gbm = XGBClassifier(**param, n_estimators=100, objective='binary:logistic', eval_metric='logloss')

    # NOTE(review): early_stopping_rounds equals n_estimators (100), so early
    # stopping can hardly ever trigger — confirm the intended patience.
    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose = 100)

    valid_proba = gbm.predict_proba(valid_x)[:,1]
    return log_loss(valid_y, valid_proba)

In [None]:
# Minimise objective_xgb over 100 Optuna trials and report the outcome.
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=100)

n_finished_xgb = len(study_xgb.trials)
best_xgb_trial = study_xgb.best_trial
print("Number of finished trials: {}".format(n_finished_xgb))
print("Best trial:", best_xgb_trial.params)

In [None]:
# Best parameter set found by the XGBoost study.
study_xgb.best_params

In [52]:
def objective_log(trial, data = X_train, target = y_train):
    """Optuna objective: hold-out log loss of a LogisticRegressionCV model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: feature matrix to split into fit / hold-out parts.
        target: labels aligned with `data`.

    Returns:
        Log loss of the positive-class probabilities on the 15% hold-out part.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.15, random_state=10, stratify=target)

    # Fit the scaler on the fit part only, then apply it to both parts.
    sc = RobustScaler()
    train_x = sc.fit_transform(train_x)
    test_x = sc.transform(test_x)

    param = {
        "tol": trial.suggest_float('tol', 1e-7, 1.0, log= True),
        # suggest_int takes integer bounds (the original passed 1.0 / 15.0).
        'Cs': trial.suggest_int('Cs', 1, 15),
        'max_iter': trial.suggest_int('max_iter', 100, 5000),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
    }

    clf = LogisticRegressionCV(**param, random_state=10, scoring='neg_log_loss')
    clf.fit(train_x, train_y)

    holdout_proba = clf.predict_proba(test_x)[:,1]
    return log_loss(test_y, holdout_proba)

In [53]:
# Minimise objective_log over 100 Optuna trials and report the outcome.
study_log = optuna.create_study(direction="minimize")
study_log.optimize(objective_log, n_trials=100)

n_finished_log = len(study_log.trials)
best_log_trial = study_log.best_trial
print("Number of finished trials: {}".format(n_finished_log))
print("Best trial:", best_log_trial.params)

[32m[I 2021-12-07 12:46:42,861][0m A new study created in memory with name: no-name-4253ff81-6486-4d7a-ad63-dd76fb86e78e[0m
[32m[I 2021-12-07 12:47:08,702][0m Trial 0 finished with value: 0.3075670371158694 and parameters: {'tol': 2.3663413794121485e-07, 'Cs': 13, 'max_iter': 2849, 'penalty': 'l2'}. Best is trial 0 with value: 0.3075670371158694.[0m
[32m[I 2021-12-07 12:47:36,540][0m Trial 1 finished with value: 0.30758547668034103 and parameters: {'tol': 5.146336855655604e-07, 'Cs': 11, 'max_iter': 4944, 'penalty': 'l2'}. Best is trial 0 with value: 0.3075670371158694.[0m
[32m[I 2021-12-07 12:48:04,048][0m Trial 2 finished with value: 0.30770870706131814 and parameters: {'tol': 1.6006692674135188e-05, 'Cs': 3, 'max_iter': 3469, 'penalty': 'l2'}. Best is trial 0 with value: 0.3075670371158694.[0m
[32m[I 2021-12-07 12:48:30,987][0m Trial 3 finished with value: 0.3075825768163608 and parameters: {'tol': 7.477784244700018e-07, 'Cs': 6, 'max_iter': 3896, 'penalty': 'l2'}. Bes

[32m[I 2021-12-07 13:04:38,415][0m Trial 36 finished with value: 0.30756834821155765 and parameters: {'tol': 0.2544699252544379, 'Cs': 12, 'max_iter': 684, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:05:16,658][0m Trial 37 finished with value: 0.30758547668034103 and parameters: {'tol': 0.974360005521117, 'Cs': 11, 'max_iter': 919, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:06:00,219][0m Trial 38 finished with value: 0.3075632486769288 and parameters: {'tol': 0.10481168797609357, 'Cs': 14, 'max_iter': 3333, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:06:37,157][0m Trial 39 finished with value: 0.3075670371158694 and parameters: {'tol': 0.125190343612174, 'Cs': 13, 'max_iter': 3207, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:07:08,603][0m Trial 40 finished with value: 0.30770870706131814 an

[32m[I 2021-12-07 13:25:54,726][0m Trial 72 finished with value: 0.3075670371158694 and parameters: {'tol': 0.2584905884179722, 'Cs': 13, 'max_iter': 1110, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:26:23,575][0m Trial 73 finished with value: 0.3076379776163745 and parameters: {'tol': 0.7227952132865528, 'Cs': 9, 'max_iter': 608, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:26:50,557][0m Trial 74 finished with value: 0.3075632486769288 and parameters: {'tol': 6.2427948100151214e-06, 'Cs': 14, 'max_iter': 218, 'penalty': 'l2'}. Best is trial 8 with value: 0.3075632486769288.[0m
[32m[I 2021-12-07 13:27:16,993][0m Trial 75 finished with value: 0.3075564645920559 and parameters: {'tol': 0.10374701392653762, 'Cs': 13, 'max_iter': 155, 'penalty': 'l2'}. Best is trial 75 with value: 0.3075564645920559.[0m
[32m[I 2021-12-07 13:27:52,085][0m Trial 76 finished with value: 0.30759454393720465

Number of finished trials: 100
Best trial: {'tol': 0.10374701392653762, 'Cs': 13, 'max_iter': 155, 'penalty': 'l2'}


In [55]:
# Best parameter set found by the logistic-regression study.
study_log.best_params

{'tol': 0.10374701392653762, 'Cs': 13, 'max_iter': 155, 'penalty': 'l2'}

In [None]:
def objective(trial,data=X_train,target=y_train):
    """Optuna objective: hold-out log loss of a LightGBM classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: feature matrix to split into fit / hold-out parts.
        target: labels aligned with `data`.

    Returns:
        Log loss of the positive-class probabilities on the 15% hold-out part.
    """
    # Stratified like the other objectives and the final K-fold loop, so the
    # hold-out part keeps the class balance of `target`.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.15, random_state=42, stratify=target)

    # suggest_float(..., log=True) samples the same log-uniform ranges as the
    # older suggest_loguniform and matches the API used by the other objectives.
    param = {
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,7,9,11,13,15,17,20]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'num_leaves': trial.suggest_int('num_leaves', 15, 120),
    }
    model = LGBMClassifier(**param, n_estimators=5000, metric= 'binary_logloss', random_state=10)

    # NOTE(review): passing early_stopping_rounds / verbose to fit() depends on
    # the installed LightGBM version — confirm against the pinned release.
    model.fit(train_x,train_y,eval_set=[(test_x, test_y)], early_stopping_rounds=300, verbose=500)

    holdout_proba = model.predict_proba(test_x)[:,1]
    return log_loss(test_y, holdout_proba)

In [None]:
# Minimise the LightGBM objective over 100 Optuna trials and report the outcome.
study_lgbm = optuna.create_study(direction="minimize")
study_lgbm.optimize(objective, n_trials=100)

n_finished_lgbm = len(study_lgbm.trials)
best_lgbm_trial = study_lgbm.best_trial
print("Number of finished trials: {}".format(n_finished_lgbm))
print("Best trial:", best_lgbm_trial.params)

In [None]:
# Best parameter set found by the LightGBM study.
study_lgbm.best_params

In [57]:
# Robust-scale the features (scaler fitted on the training rows only).
# The results are wrapped back into DataFrames with the original columns and
# index: RobustScaler.transform returns a bare ndarray, which would break the
# later cells that still use X_train.iloc[...] and X_test.index.
sc = RobustScaler()
sc.fit(X_train)
X_train = pd.DataFrame(sc.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
def my_model(n_features=50):
    """Build a small two-branch dense network for binary classification.

    Args:
        n_features: width of the input vector. Defaults to 50, the value that
            was previously hard-coded.

    Returns:
        An uncompiled Keras Model mapping (n_features,) inputs to a single
        sigmoid output.
    """
    # shape must be a tuple; `(50)` is just the integer 50.
    inputs = layers.Input(shape=(n_features,))
    # The first Dense layer has to be applied to `inputs`; the original kept
    # the bare layer object and then fed it to the next layer.
    first = layers.Dense(128, activation='relu')(inputs)

    # Branch 1: straight from the first hidden layer.
    second = layers.Dense(64, activation='relu')(first)
    # Branch 2: dropout on the first hidden layer, then another Dense.
    dropout = layers.Dropout(0.2)(first)
    third = layers.Dense(64, activation='relu')(dropout)
    # NOTE(review): the original also built a Dropout on `third` whose output
    # was never used; it is omitted here. If regularising the second branch was
    # intended, concatenate that dropout output instead of `third`.

    # Concatenate is a layer class; the lowercase `layers.concatenate` is a
    # function that needs the tensors as its argument and cannot be called empty.
    conc = layers.Concatenate()([second, third])
    final = layers.Dense(1, activation='sigmoid')(conc)
    model = Model(inputs, final)
    return model

In [None]:
# Loss object and training callbacks for the Keras models.
# NOTE(review): both callbacks watch 'val_binary_crossentropy'; that key only
# exists if the model is compiled with a metric of that name — the compile
# call is not visible here, confirm.
# NOTE(review): both callbacks use patience=5 — confirm the learning-rate
# reduction is meant to share the early-stopping patience.
cce = tf.keras.losses.BinaryCrossentropy()

es = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_crossentropy',
    mode='min',
    min_delta=1e-05,
    patience=5,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_binary_crossentropy',
    mode='min',
    factor=0.7,
    patience=5,
    verbose=0,
)

In [64]:
# Stratified K-fold cross-validation of a random forest.
# Each fold's model is scored on its held-out part (log loss) and its
# test-set probabilities are averaged into `preds`.
subbed = []                        # held-out log loss of each fold
n_folds = 10                       # single source of truth for the fold count

# Work on plain arrays so the loop runs whether X_train / X_test are still
# DataFrames or were already replaced by ndarrays in the scaling cell
# (ndarrays have no .iloc / .index).
X_all = np.asarray(X_train)
X_holdout = np.asarray(X_test)
y_all = np.asarray(y_train)

preds = np.zeros(len(X_holdout))   # fold-averaged positive-class probability
f = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=10)
for fold, (train_idx, test_idx) in enumerate(f.split(X_all, y_all)):
    print('=============== Fold No:',fold+1,'===============')
    X_tr, X_tst = X_all[train_idx], X_all[test_idx]
    y_tr, y_tst = y_all[train_idx], y_all[test_idx]

    model = RandomForestClassifier(random_state=10, n_estimators=500)
    model.fit(X_tr, y_tr)

    # Score the fold once and reuse the value for both the log and the summary.
    fold_loss = log_loss(y_tst, model.predict_proba(X_tst)[:, 1])
    print(fold_loss)
    subbed.append(fold_loss)

    preds += model.predict_proba(X_holdout)[:, 1] / n_folds

# The metric is log loss on the held-out folds (the old label said "RMSE train").
print('Mean CV log loss: ', np.mean(subbed))

0.31206345447112666
0.316764690722545
0.31931428502381176
0.31137947721280373
0.31767239512738604
0.31650907321564636
0.31643262879992473
0.31561264601610395
0.3149506778784241
0.31667088761697193
RMSE train:  0.3157370216084744


In [None]:
def conv_model(n_features=83, vocab_size=354, embed_dim=7):
    """Build an embedding + 1x1-convolution network with dense skip connections.

    Args:
        n_features: length of the input vector (default 83, the value that was
            previously hard-coded).
        vocab_size: `input_dim` of the Embedding layer (default 354).
        embed_dim: `output_dim` of the Embedding layer (default 7).

    Returns:
        An uncompiled Keras Model with a single sigmoid output.

    NOTE(review): an Embedding layer looks inputs up as integer indices, while
    the feature matrix built in this notebook has 50 scaled float columns —
    confirm what data this model is meant to receive.
    """
    # shape must be a tuple; `(83)` is just the integer 83.
    conv_inputs = layers.Input(shape=(n_features,))
    #----------- Embedding layers ----------------------#
    embed = layers.Embedding(input_dim=vocab_size,
                             output_dim=embed_dim,
                             embeddings_regularizer='l2')(conv_inputs)
    #----------- Convolution layers ----------------------#
    # Kernel size 1: mixes the embedding channels of each position separately.
    embed = layers.Conv1D(18, 1, activation='relu')(embed)
    embed = layers.Flatten()(embed)
    hidden = layers.Dropout(0.3)(embed)

    #----------- Dense blocks with skip connections ----------------------#
    hidden = tfa.layers.WeightNormalization(
                layers.Dense(
                units=32,
                activation='selu',
                kernel_initializer="lecun_normal"))(hidden)

    # Each following block sees the concatenation of all earlier outputs.
    output = layers.Dropout(0.3)(layers.Concatenate()([embed, hidden]))
    output = tfa.layers.WeightNormalization(
    layers.Dense(
                units=32,
                activation='relu',
                kernel_initializer="lecun_normal"))(output)
    output = layers.Dropout(0.4)(layers.Concatenate()([embed, hidden, output]))
    output = tfa.layers.WeightNormalization(
    layers.Dense(
                units=32,
                activation='elu',
                kernel_initializer="lecun_normal"))(output)

    #----------- Final layer -----------------------#
    conv_outputs = layers.Dense(
                units=1,
                activation='sigmoid',
                kernel_initializer="lecun_normal")(output)

    #----------- Model instantiation  ---------------#
    model = Model(conv_inputs, conv_outputs)

    return model