In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold 
from sklearn.ensemble import ExtraTreesRegressor
import datetime
from sklearn.preprocessing import StandardScaler
import keras.backend as K
import timeit
import lightgbm as lgb 
from sklearn.model_selection import StratifiedKFold 


In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

train = reduce_mem_usage(pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/train.csv',parse_dates=['first_active_month']))
test = reduce_mem_usage(pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/test.csv',parse_dates=['first_active_month']))
mer = reduce_mem_usage(pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/merchants.csv'))
nmt = reduce_mem_usage(pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/new_merchant_transactions.csv',parse_dates=['purchase_date']))
ht = reduce_mem_usage(pd.read_csv('/kaggle/input/elo-merchant-category-recommendation/historical_transactions.csv',parse_dates=['purchase_date']))

print("Shape of train set                 : ",train.shape)
print("Shape of test set                  : ",test.shape)
print("Shape of historical_transactions   : ",ht.shape)
print("Shape of merchants                 : ",mer.shape)
print("Shape of new_merchant_transactions : ",nmt.shape)

def cleaning(df):
  scaler = StandardScaler()
  for col in ['authorized_flag', 'category_1']:
    df[col] = df[col].map({'Y':1, 'N':0})  
    df[col] = df[col].apply(pd.to_numeric, errors='coerce')
  for col in ['installments']:
    df[col] = df[col].map({-1:14, 0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7,8:8,9:9,10:10,11:11,12:12,999:13})
    df[col] = df[col].apply(pd.to_numeric, errors='coerce')
  for col in ['category_3']:
    df[col] = df[col].map({'A':1, 'B':2,'C':3})
    df[col] = df[col].apply(pd.to_numeric, errors='coerce')
  for col in ['category_2']:
    df[col] = df[col].apply(pd.to_numeric, errors='coerce')   
  for col in ['purchase_amount']:        
    df[col] = scaler.fit_transform(df[[col]])      
  return df

ht_copy = cleaning(ht.copy())
nmt_copy = cleaning(nmt.copy())


# Missing values handling
for df in [ht_copy, nmt_copy]: # Filling with most common value
  df['category_2'].fillna(1,inplace=True)
  df['category_3'].fillna(1,inplace=True)
  df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
  # Purchase date - year, month, week, hour
  df['purchase_date'] = pd.to_datetime(df['purchase_date'])
  df['year'] = df['purchase_date'].dt.year
  df['weekofyear'] = df['purchase_date'].dt.weekofyear
  df['month'] = df['purchase_date'].dt.month
  df['dayofweek'] = df['purchase_date'].dt.dayofweek
  df['weekend'] = (df.purchase_date.dt.weekday >=5).astype(int)
  df['hour'] = df['purchase_date'].dt.hour

from datetime import datetime
# Here we are trying to calculate recency, frequency, monetary and age.
# Recency is how many days back did customer perform a last transaction.
# Frequency is how many transactions are performed in time period from dataset.
# Monetary is how much was spent in all the transactions.

hist = ht_copy[['card_id','purchase_date','purchase_amount']]
hist = hist.sort_values(by=['card_id', 'purchase_date'], ascending=[True, True])
print(hist.head())

z = hist.groupby('card_id')['purchase_date'].max().reset_index()
q = hist.groupby('card_id')['purchase_date'].min().reset_index()

z.columns = ['card_id', 'Max']
q.columns = ['card_id', 'Min']

## Extracting current timestamp
now = datetime.now()
curr_date = now.strftime("%m-%d-%Y, %H:%M:%S")
curr_date = pd.to_datetime(curr_date)

rec = pd.merge(z,q,how = 'left',on = 'card_id')
rec['Min'] = pd.to_datetime(rec['Min'])
rec['Max'] = pd.to_datetime(rec['Max'])

## Recency value 
rec['Recency'] = (curr_date - rec['Max']).astype('timedelta64[D]') ## current date - most recent date

## Age value
rec['Age'] = (rec['Max'] - rec['Min']).astype('timedelta64[D]') ## Age of customer, MAX - MIN

rec = rec[['card_id','Age','Recency']]


## Frequency
freq = hist.groupby('card_id').size().reset_index()
freq.columns = ['card_id', 'Frequency']

## Monetary
mon = hist.groupby('card_id')['purchase_amount'].sum().reset_index()
mon.columns = ['card_id', 'Monetary']

final = pd.merge(freq,mon,how = 'left', on = 'card_id')
final = pd.merge(final,rec,how = 'left', on = 'card_id')

final['AvOrderValue'] = final['Monetary']/final['Frequency'] ## AOV - Average order value (i.e) total_purchase_amt/total_trans
final['AgeRecencyRatio'] = final['Age']/final['Recency'] ## 

print("No. of null columns in CustomerLifeValue: \n",final.isnull().sum())
final.head()


ht_copy['purchase_date'] = pd.to_datetime(ht_copy['purchase_date'])
ht_copy['month_diff'] = ((datetime.today() - ht_copy['purchase_date']).dt.days)//30
ht_copy['month_diff'] += ht_copy['month_lag']

nmt_copy['purchase_date'] = pd.to_datetime(nmt_copy['purchase_date'])
nmt_copy['month_diff'] = ((datetime.today() - nmt_copy['purchase_date']).dt.days)//30
nmt_copy['month_diff'] += nmt_copy['month_lag']

hc = ht_copy.isnull().sum()
nc = nmt_copy.isnull().sum()


Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)
Mem. usage decreased to 30.32 Mb (46.0% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Shape of train set                 :  (201917, 6)
Shape of test set                  :  (123623, 5)
Shape of historical_transactions   :  (29112361, 14)
Shape of merchants                 :  (334696, 22)
Shape of new_merchant_transactions :  (1963031, 14)


  sqr = np.multiply(arr, arr, out=arr)


                  card_id       purchase_date  purchase_amount
19095896  C_ID_00007093c1 2017-02-14 14:00:43     6.895150e-07
19095775  C_ID_00007093c1 2017-02-14 15:47:45    -4.484254e-04
19095845  C_ID_00007093c1 2017-02-16 15:37:58    -5.420467e-04
19095866  C_ID_00007093c1 2017-02-20 12:19:01    -3.275201e-04
19095808  C_ID_00007093c1 2017-03-03 00:24:15    -4.631373e-04
No. of null columns in CustomerLifeValue: 
 card_id            0
Frequency          0
Monetary           0
Age                0
Recency            0
AvOrderValue       0
AgeRecencyRatio    0
dtype: int64


In [3]:
def aggregate_transactions_small(history):
    
    history.loc[:, 'purchase_date'] = pd.DatetimeIndex(history['purchase_date']).\
                                      astype(np.int64) * 1e-9
    
    agg_func = {    
        
    'month': ['mean','nunique'],     
    'month_diff': ['mean', 'min', 'max','nunique'],     
    'year': ['nunique', 'mean'],
    'category_1' : ['sum',  'nunique'],
    'category_2' : ['sum', 'nunique'],
    'category_3' : ['sum',  'nunique'],
    'purchase_amount': ['sum', 'mean', 'max', 'min'],
    'installments': ['sum', 'mean', 'max', 'min'],     
    'authorized_flag': ['nunique'],
    'month_lag': ['mean']  
    }
    
    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)
    
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))
    
    agg_history = pd.merge(df, agg_history, on='card_id', how='left')    
    return agg_history
  
history = aggregate_transactions_small(ht_copy)
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]

new = aggregate_transactions_small(nmt_copy)
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]

# Merge all dataframes based on card_id
train = pd.merge(train, history, on='card_id', how='left')
test = pd.merge(test, history, on='card_id', how='left')

train = pd.merge(train, new, on='card_id', how='left')
test = pd.merge(test, new, on='card_id', how='left')

train = pd.merge(train, final, on='card_id', how='left')
test = pd.merge(test, final, on='card_id', how='left')

train.drop(['first_active_month','card_id'], inplace=True, axis=1)
test.drop(['first_active_month','card_id'], inplace=True, axis=1)
print(train.columns)
print("Train/Test Shape: ",train.shape,test.shape)
train.dropna(inplace=True)
print(train.shape)
y = train['target']
print("y shape: ",y.shape)
print("Before drop - X shape: ",train.shape)
X = train.drop(['target'], axis=1)
print("After drop - X shape: ",X.shape)
print("Nulls in X: ",(X.isnull().sum() > 0 ).sum())
print("Nulls in y: ",(y.isnull().sum() > 0 ).sum())

X_train_all, X_test_com, y_train_all, y_test_com = train_test_split(X, y, test_size=0.20, random_state=42)
print("X_train_all, X_test_com, y_train_all, y_test_com shape:",X_train_all.shape, X_test_com.shape, y_train_all.shape, y_test_com.shape)

X_train_com, X_val_com, y_train_com, y_val_com = train_test_split(X_train_all, y_train_all, test_size=0.20, random_state=42)
print("X_train_com, X_val_com, y_train_com, y_val_com shape:",X_train_com.shape, X_val_com.shape, y_train_com.shape, y_val_com.shape)

#definind the rmse metric
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

Index(['feature_1', 'feature_2', 'feature_3', 'target',
       'hist_transactions_count', 'hist_month_mean', 'hist_month_nunique',
       'hist_month_diff_mean', 'hist_month_diff_min', 'hist_month_diff_max',
       'hist_month_diff_nunique', 'hist_year_nunique', 'hist_year_mean',
       'hist_category_1_sum', 'hist_category_1_nunique', 'hist_category_2_sum',
       'hist_category_2_nunique', 'hist_category_3_sum',
       'hist_category_3_nunique', 'hist_purchase_amount_sum',
       'hist_purchase_amount_mean', 'hist_purchase_amount_max',
       'hist_purchase_amount_min', 'hist_installments_sum',
       'hist_installments_mean', 'hist_installments_max',
       'hist_installments_min', 'hist_authorized_flag_nunique',
       'hist_month_lag_mean', 'new_transactions_count', 'new_month_mean',
       'new_month_nunique', 'new_month_diff_mean', 'new_month_diff_min',
       'new_month_diff_max', 'new_month_diff_nunique', 'new_year_nunique',
       'new_year_mean', 'new_category_1_sum', 'new_c


#  Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() 
regressor.fit(X_train_all, y_train_all) #training the algorithm

y_pred_linear = regressor.predict(X_test_com)
print("RMSE for Bagging Regressor::{:.3f}".format(rmse(y_test_com, y_pred_linear)))

RMSE for Bagging Regressor::3.485


# Ridge Regression

In [5]:

from sklearn.linear_model import Ridge 
ridge = Ridge()

parameters={'alpha': [1e-3,1e-2]} 
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_root_mean_squared_error',cv=5) 
ridge_regressor.fit(X_train_all, y_train_all) 
y_pred_ridge = regressor.predict(X_test_com)
print("RMSE for Bagging Regressor::{:.3f}".format(rmse(y_test_com, y_pred_ridge)))

RMSE for Bagging Regressor::3.485


# Lasso Regression

In [6]:
from sklearn.linear_model import Lasso 

lasso = Lasso() 
parameters={'alpha':[1e-3], 'max_iter':[5000]} 
lasso_regressor=GridSearchCV(lasso,parameters,scoring='neg_root_mean_squared_error',cv=5)
lasso_regressor.fit(X_train_all, y_train_all) 
y_pred_lasso = lasso_regressor.predict(X_test_com) 
print("RMSE for Lasso Regressor::{:.3f}".format(rmse(y_test_com, y_pred_lasso)))

  positive)
  positive)
  positive)
  positive)
  positive)


RMSE for Lasso Regressor::3.492


  positive)


# DecisionTreeRegressor

In [7]:
from sklearn.tree import DecisionTreeRegressor 
treeRegressor = DecisionTreeRegressor()

param_grid = {"criterion": ["mse"], "max_depth": [5], "min_samples_split": [8], "max_leaf_nodes": [15], "max_features": [25], "min_impurity_decrease":[0.1] } 
grid_decision = GridSearchCV(treeRegressor, param_grid, cv=3,verbose=1,n_jobs=-1) 
grid_decision.fit(X_train_all, y_train_all) 
y_pred_decision = grid_decision.predict(X_test_com)
print("RMSE for Decision Tree Regressor::{:.3f}".format(rmse(y_test_com, y_pred_decision)))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.6s finished


RMSE for Decision Tree Regressor::3.475


# RandomForestRegressor

In [8]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor 
param_grid = {"criterion": ["mse"], 'n_estimators': [1000], "max_depth": [5], "max_leaf_nodes" : [5], "min_samples_split":[8] , "max_features": [25], "min_impurity_decrease":[0.1] } 
forestRegressor = RandomForestRegressor(random_state = 10)

grid_forest = GridSearchCV(forestRegressor, param_grid, cv=3, verbose=1,n_jobs=-1) 
grid_forest.fit(X_train_all, y_train_all)
y_pred_forest = grid_forest.predict(X_test_com)
print("RMSE for Random Forest Regressor::{:.3f}".format(rmse(y_test_com, y_pred_decision)))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.6min finished


RMSE for Random Forest Regressor::3.475


# ExtraTreesRegressor 

In [9]:
from sklearn.ensemble import ExtraTreesRegressor

extraRegressor = ExtraTreesRegressor(random_state = 10) 
param_grid = {"criterion": ["mse"], 'n_estimators': [500], "max_depth": [5], "max_leaf_nodes" : [5], "min_samples_leaf":[2], "min_samples_split":[2] , "max_features": [25], "min_impurity_decrease":[0.1] }

grid_extra = GridSearchCV(extraRegressor, param_grid, cv=3, verbose=1,n_jobs=-1) 
grid_extra.fit(X_train_all, y_train_all) 
y_pred_extra = grid_extra.predict(X_test_com)
print("RMSE for Extra Trees Regressor::{:.3f}".format(rmse(y_test_com, y_pred_extra)))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   26.6s finished


RMSE for Extra Trees Regressor::3.525


# AdaboostRegressor

In [10]:
from sklearn.ensemble import AdaBoostRegressor 
param = { 'n_estimators':[50], 'learning_rate':[1e-2], 'loss':['exponential'] } 
adaRegressor = AdaBoostRegressor(random_state = 10)

grid_ada = GridSearchCV(adaRegressor, param, cv = 3, n_jobs = -1, verbose=1) 
grid_ada.fit(X_train_all, y_train_all) 
y_pred_ada = grid_ada.predict(X_test_com)
print("RMSE for Ada Boost Regressor::{:.3f}".format(rmse(y_test_com, y_pred_ada)))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.1min finished


RMSE for Ada Boost Regressor::3.471


# GradientBoostingRegressor

In [11]:
from sklearn.ensemble import GradientBoostingRegressor 
param_grid = {'n_estimators': [100]} 
gbRegressor = GradientBoostingRegressor(random_state = 10)

grid_gb = GridSearchCV(gbRegressor, param_grid, cv=3, verbose=1,n_jobs=-1) 
grid_gb.fit(X_train_all, y_train_all) 
y_pred_gb = grid_gb.predict(X_test_com)
print("RMSE for Gradient Boost Regressor::{:.3f}".format(rmse(y_test_com, y_pred_gb)))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.7min finished


RMSE for Gradient Boost Regressor::3.425


# XGBRegressor

In [12]:
import timeit
from xgboost import XGBRegressor
start = timeit.default_timer()
xgb = XGBRegressor() 
parameters = { 'gamma': [8], 'eval_metric' :['rmse'],'eta': [0.5], 'colsample_bytree':[0.3], 'min_child_weight': [3], 'max_depth' :[3], 'max_features':[5],'subsample': [0.7],'tree_method':['auto'], 'reg_alpha':[1000], "criterion": ["mse"],'n_estimators': [1000] ,'seed':[11] }

grid_xgb = GridSearchCV(xgb, parameters, cv = 6, n_jobs = -1, verbose=1) 
grid_xgb.fit(X_train_all, y_train_all) 
y_pred_xgb = grid_xgb.predict(X_test_com)

stop = timeit.default_timer()
execution_time = (stop - start)/60 
print("Ensemble Executed in {} minutes".format(str(execution_time)))
print("RMSE for XG Boost Regressor::{:.3f}".format(rmse(y_test_com, y_pred_gb)))

Fitting 6 folds for each of 1 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  5.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  5.0min finished


Parameters: { criterion, max_features } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Ensemble Executed in 6.062381015150004 minutes
RMSE for XG Boost Regressor::3.425


# Ensemble

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
print("X_train, X_test, y_train, y_test shape:",X_train.shape, X_test.shape, y_train.shape, y_test.shape)

SEED = 10 

#Using Decision Tree as a base model
def get_model():
    dt = DecisionTreeRegressor(criterion = 'mse', max_depth = 5, max_features = 50, max_leaf_nodes = 15, min_impurity_decrease = 0.1, random_state=SEED)
    return dt

from random import choices 

def get_samples(D1_train, D2_train, n_estimators): 
    print("\nCalculating Samples with replacement...") 
    samples_train_appender = [] 
    samples_test_appender = [] 
    all_indexes = D1_train.index 
    population_size = D1_train.shape[0]
    sample_size = round(population_size/n_estimators)
    bumpedup_sample_size = int(sample_size * 1.85)

    for s in range(n_estimators): 
        samples_index = choices(all_indexes, k = sample_size)
        sample_train_df = D1_train[D1_train.index.isin(samples_index)] 
        sample_test_df = D2_train[D2_train.index.isin(samples_index)]
        samples_train_appender.append(sample_train_df) 
        samples_test_appender.append(sample_test_df)

    all_train_samples = pd.concat(samples_train_appender,ignore_index=True) 
    all_test_samples = pd.concat(samples_test_appender,ignore_index=True) 
    print("Samples calculation Done.")
    return all_train_samples, all_test_samples, sample_size

#Calculate RMSE
def evaluate_model(y_pred, y_actual): 
    print("Evaluating Score...\n") 
    mse = rmse(y_actual, y_pred) 
    print('RMSE %.3f' % (np.sqrt(mse)))

def train_predict(n_estimators, all_train_samples, all_test_samples, D1_test,D2_test, sample_size): 
    """Fit models in list on training set and return preds""" 
    P = np.zeros((D2_test.shape[0], n_estimators)) 
    P = pd.DataFrame(P) 
    models_list = []

    print("Sample size: ", sample_size) 
    cols = list() 
    print("Base models {} - fitting and predicting ...".format(n_estimators))

    for i in range(0,n_estimators):
        j = sample_size * i 
        k = sample_size * (i + 1) 
        x_data = all_train_samples[j:k] 
        y_data = all_test_samples[j:k] 
        model = get_model()

        model.fit(x_data, y_data)  
        models_list.append(model)           
        pred = (model.predict(D1_test))        
        P.iloc[:,i] = pred
        cols.append(i)
    P.columns = cols 
    print("Base models done.") 
    return P, models_list

def custom_ensemble(X_train,y_train,n_estimators): 
    print("Preparing Custom Ensemble...")

    #Split X_train into D1,D2 (50-50) 
    D1_train, D1_test, D2_train, D2_test = train_test_split(X_train, y_train, test_size=0.5, random_state=42) 
    print("D1_train, D1_test, D2_train, D2_test: ",D1_train.shape, D1_test.shape, D2_train.shape, D2_test.shape)

    #Get Samples 
    all_samples_train, all_samples_test, sample_size = get_samples(D1_train, D2_train, n_estimators)

    #Get predictions 
    P, models_list = train_predict(n_estimators, all_samples_train, all_samples_test,D1_test, D2_test, sample_size) 
    print("Custom Ensemble Done.") 
    return P, models_list, D2_test

def super_train_predict(n_estimators, models_list, test_set): 
    """Fit models in list on training set and return preds""" 
    meta_pred = np.zeros((test_set.shape[0] , n_estimators)) 
    meta_pred = pd.DataFrame(meta_pred)

    print("Predicting {} models from metalearner...".format(n_estimators)) 
    cols = list() 
    for i in range(0,n_estimators):
        model = models_list[i]
        pred = (model.predict(test_set))
        meta_pred.iloc[:,i] = pred 
        cols.append(i)
    
    meta_pred.columns = cols
    print("Meta Learner prediction Done.") 
    return meta_pred


start = timeit.default_timer()
xgb = XGBRegressor() 
parameters = { 'gamma': [8], 'eval_metric' :['rmse'],'eta': [0.5], 'colsample_bytree':[0.3], 'min_child_weight': [3], 'max_depth' :[3], 'max_features':[5],'subsample': [0.7],'tree_method':['auto'], 'reg_alpha':[1000], "criterion": ["mse"],'n_estimators': [1000] ,'seed':[11] }

meta_learner = GridSearchCV(xgb, parameters, cv = 6, n_jobs = -1, verbose=1)

n_estimators = 100 
print("Fitting models to meta-learner.") 
P, models_list, D2_test = custom_ensemble(X_train, y_train, n_estimators) 
meta_learner.fit(P, D2_test)

# Ensemble final prediction and evaluation
meta_pred = super_train_predict(n_estimators, models_list, X_test) # X_test brought from first split 
pred_final = meta_learner.predict(meta_pred)

rmse = evaluate_model(pred_final,y_test)

stop = timeit.default_timer() 
execution_time = (stop - start)/60

print("Ensemble Executed in {} minutes".format(str(execution_time)))


X_train, X_test, y_train, y_test shape: (143988, 59) (35998, 59) (143988,) (35998,)
Fitting models to meta-learner.
Preparing Custom Ensemble...
D1_train, D1_test, D2_train, D2_test:  (71994, 59) (71994, 59) (71994,) (71994,)

Calculating Samples with replacement...
Samples calculation Done.
Sample size:  720
Base models 100 - fitting and predicting ...
Base models done.
Custom Ensemble Done.
Fitting 6 folds for each of 1 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.9min finished


Parameters: { criterion, max_features } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Predicting 100 models from metalearner...
Meta Learner prediction Done.
Evaluating Score...

RMSE 1.859
Ensemble Executed in 3.5083673163833358 minutes


# Neural Networks

In [15]:
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(X, y, test_size=0.25, random_state=42) 
print("X_train, y_train, X_val, y_val Shape: ",X_nn_train.shape, y_nn_train.shape, X_nn_val.shape, y_nn_val.shape)

test_min = test_df[important_cols]
test_min.replace({np.inf: 0, -np.inf: 0}, inplace=True)
imputer_test = SimpleImputer(missing_values=np.nan,strategy='median')
imputer_test = imputer_test.fit(test_min)
test_nn = imputer_test.transform(test_min)
print("test_nn.shape: ",test_nn.shape)

# Scaling data
sc = StandardScaler()
X_nn_train = sc.fit_transform(X_nn_train)
test_nn = sc.transform(test_nn)

import keras.backend as K
from keras.layers import Dense, BatchNormalization, Dropout, Input
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential

#definind the rmse metric
def rmse(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

fh_neurons = 1024 #first hidden layer
drop_rate = 0.7

#the model is just a sequence of fully connected layers, batch normalization and dropout using RELUs as activation functions
nmodel = Sequential()
nmodel.add(Dense(fh_neurons, input_dim=X_nn_train.shape[1], activation='elu'))
nmodel.add(BatchNormalization())
nmodel.add(Dropout(drop_rate))
nmodel.add(Dense(fh_neurons*2, activation='relu'))
nmodel.add(BatchNormalization())
nmodel.add(Dropout(drop_rate))
nmodel.add(Dense(fh_neurons*2, activation='relu'))
nmodel.add(Dense(fh_neurons, activation='relu'))
nmodel.add(Dense(1, activation='linear'))

nmodel.compile(optimizer='adam',loss=rmse)
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 5)
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)

nmodel.fit(X_nn_train, y_nn_train, validation_data = (X_nn_val, y_nn_val), epochs=15, batch_size=256, callbacks = [early_stopping, checkpointer])
pred_nn = nmodel.predict(test_nn)
print("RMSE for NN::{:.3f}".format(rmse(y_te, y_pred_nn)))

Epoch 1/15
Epoch 00001: val_loss improved from inf to 1591.00891, saving model to weights.hdf5
Epoch 2/15
Epoch 00002: val_loss improved from 1591.00891 to 40.64721, saving model to weights.hdf5
Epoch 3/15
Epoch 00003: val_loss improved from 40.64721 to 1.62873, saving model to weights.hdf5
Epoch 4/15
Epoch 00004: val_loss improved from 1.62873 to 1.56361, saving model to weights.hdf5
Epoch 5/15
Epoch 00005: val_loss did not improve from 1.56361
Epoch 6/15
Epoch 00006: val_loss improved from 1.56361 to 1.53994, saving model to weights.hdf5
Epoch 7/15
Epoch 00007: val_loss did not improve from 1.53994
Epoch 8/15
Epoch 00008: val_loss did not improve from 1.53994
Epoch 9/15
Epoch 00009: val_loss did not improve from 1.53994
Epoch 10/15
Epoch 00010: val_loss did not improve from 1.53994
Epoch 11/15
Epoch 00011: val_loss did not improve from 1.53994 
Early Stopping... 
RMSE for NN::{:.3f}: 3.4278


In [3]:
from prettytable import PrettyTable
print("\n =========== RMSE of Various Models ==========\n")

af = PrettyTable()

af.field_names = ["S No.","Model name", "RMSE"]

af.add_row(["1","Linear Regression", 3.485])
af.add_row(["2","Ridge Regression", 3.485])
af.add_row(["3","Lasso Regression", 3.492])
af.add_row(["4","Decision Tree Regressor", 3.475])
af.add_row(["5","Random Forest Regressor", 3.475])
af.add_row(["6","Extra Trees Regressor", 3.525])
af.add_row(["7","XGBoost Regressor", 3.425])
af.add_row(["8","AdaBoost Regressor", 3.471])
af.add_row(["9","GradientBoost Regressor", 3.425])
af.add_row(["10","Ensemble", 1.859])
af.add_row(["11","Neural Networks", 3.4278])
print(af)
print("Ensemble score is lowest at nearly half of other scores !!!")



+-------+-------------------------+--------+
| S No. |        Model name       |  RMSE  |
+-------+-------------------------+--------+
|   1   |    Linear Regression    | 3.485  |
|   2   |     Ridge Regression    | 3.485  |
|   3   |     Lasso Regression    | 3.492  |
|   4   | Decision Tree Regressor | 3.475  |
|   5   | Random Forest Regressor | 3.475  |
|   6   |  Extra Trees Regressor  | 3.525  |
|   7   |    XGBoost Regressor    | 3.425  |
|   8   |    AdaBoost Regressor   | 3.471  |
|   9   | GradientBoost Regressor | 3.425  |
|   10  |         Ensemble        | 1.859  |
|   11  |     Neural Networks     | 3.4278 |
+-------+-------------------------+--------+
Ensemble score is lowest at nearly half of other scores !!!
