In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [14]:
# Load train.csv from the project folder into `data`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(
    filepath_or_buffer=r"D:\Data Science\Projects\Analytics vidya\Black Friday Sales Prediction\train.csv"
)


In [3]:
# All input files live under one project folder; the stacked train/test
# frames sit in its `ensemble` sub-folder.
project_dir = r"D:\Data Science\Projects\Analytics vidya\Black Friday Sales Prediction"
ensemble_dir = project_dir + r"\ensemble"

# Frames used as input for the second-level models below.
train = pd.read_csv(ensemble_dir + r"\ensemble_train.csv")
test = pd.read_csv(ensemble_dir + r"\ensemble_test.csv")

# Submission template.
sample_sub = pd.read_csv(project_dir + r"\sample_submission_V9Inaty.csv")

In [39]:
# Preview the stacked training frame (the notebook renders a truncated head/tail view).
train

Unnamed: 0,Purchase,User_ID,lgb,cb,xgb
0,8370,0,11163.782224,12176.196858,11086.746094
1,15200,0,16383.360569,16565.548807,14189.988281
2,1422,0,1384.388960,1420.149736,1653.466187
3,1057,0,1762.852569,1328.322984,1769.646729
4,7969,1,8141.290906,8037.352183,8271.673828
...,...,...,...,...,...
550063,368,5883,230.738345,57.602734,1107.480835
550064,371,5885,591.494319,1314.820736,513.187134
550065,137,5886,323.080429,641.406065,441.502075
550066,365,5888,225.670613,130.310366,-198.016510


In [40]:
# Name of the column the cross-validation helpers read as the regression target.
target = "Purchase"

# Columns of `train` / `test` passed to the models as inputs.
# Presumably the out-of-fold predictions of the LightGBM, CatBoost and XGBoost
# base models — confirm against the notebook that produced the ensemble files.
features = [
    "lgb",
    "cb",
    "xgb",
]

# Models

**1. LGBM** : 
        
            {'max_depth': 10, 'n_estimators': 498, 'learning_rate': 0.10086566531362942, 'num_leaves': 2424, 'colsample_bytree': 0.594087419202551, 'min_child_samples': 98, 'reg_alpha': 4, 'reg_lambda': 3}
            
**2. XGBOOST** : 
            
            {'max_depth': 7, 'n_estimators': 383, 'learning_rate': 0.2049999461881386, 'colsample_bytree': 0.8082989108818205, 'reg_alpha': 9, 'reg_lambda': 9}
            
            
**3. CATBOOST**:

             {'max_depth': 16, 'n_estimators': 822, 'learning_rate': 0.34579951469275394, 'rsm': 0.255416100953266, 'reg_lambda': 860}
               

In [37]:
def boosting_cross_val(regressor, train, test, features, name,
                       n_splits = 5, random_state = 42, target_name = None):
    """Cross-validate a boosting regressor with early stopping on each fold.

    The target is binned into (up to) 10 quantile buckets so that
    ``StratifiedKFold`` can balance the target distribution across folds.
    For every fold the regressor is refit on the training part, early-stopped
    on the validation part, and used to predict both the validation part and
    the full test set.

    Parameters
    ----------
    regressor : estimator
        Object whose ``fit`` accepts ``eval_set``, ``early_stopping_rounds``
        and ``verbose`` and which exposes ``predict``. The same instance is
        refit on every fold.
    train, test : pandas.DataFrame
        Frames containing the ``features`` columns; ``train`` must also
        contain the target column.
    features : list of str
        Names of the input columns.
    name : str
        Model tag. ``'cat'`` skips standard scaling; any other value scales
        the features with a ``StandardScaler`` fitted on the fold's training
        rows only.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed for the fold shuffle so that repeated runs use the same folds.
        Pass ``None`` to get a different split on every call.
    target_name : str or None, default None
        Target column name. ``None`` falls back to the notebook-level
        ``target`` variable.

    Returns
    -------
    oofs : numpy.ndarray
        Out-of-fold predictions, one per row of ``train``.
    preds : numpy.ndarray
        Test predictions averaged over the folds, one per row of ``test``.

    Notes
    -----
    The validation fold is used both for early stopping and for scoring, so
    the printed RMSE values are slightly optimistic.
    """
    label = target if target_name is None else target_name
    target_col = train[label]

    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))

    # Seeded shuffle: without a random_state the folds (and every score
    # printed below) change on each run.
    folds = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)
    stratified_target = pd.qcut(target_col, 10, labels = False, duplicates = 'drop')

    # Loop-invariant column selections.
    X_all = train[features]
    X_test_raw = test[features]

    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n=========================Fold{index+1}============================')

        ## Training / validation / test sets for this fold
        X_trn, y_trn = X_all.iloc[trn_idx], target_col.iloc[trn_idx]
        X_val, y_val = X_all.iloc[val_idx], target_col.iloc[val_idx]
        X_test = X_test_raw

        if name != 'cat':
            ###### Scaling Data ######
            # Fit on the training rows only so no validation/test statistics leak in.
            scaler = StandardScaler()
            scaler.fit(X_trn)

            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)

        ############ Fitting And Predicting #############
        # NOTE(review): recent xgboost / lightgbm releases no longer accept
        # early_stopping_rounds / verbose in fit() — confirm against the installed versions.
        regressor.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)

        val_preds = regressor.predict(X_val)
        test_preds = regressor.predict(X_test)

        fold_score = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n RMSE score for Validation set is : {fold_score}')

        oofs[val_idx] = val_preds
        preds += test_preds / n_splits

    oofs_score = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\nRMSE score for oofs is {oofs_score}')

    return oofs, preds

In [38]:
def cross_val(regressor, train, test, features,
              n_splits = 5, random_state = 42, target_name = None):
    """Cross-validate a plain scikit-learn style regressor on stratified folds.

    The target is binned into (up to) 10 quantile buckets so that
    ``StratifiedKFold`` can balance the target distribution across folds.
    For every fold the features are standardised with a ``StandardScaler``
    fitted on the training rows, the regressor is refit, and predictions are
    made for the validation rows and the full test set.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is refit on every fold.
    train, test : pandas.DataFrame
        Frames containing the ``features`` columns; ``train`` must also
        contain the target column.
    features : list of str
        Names of the input columns.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed for the fold shuffle so that repeated runs use the same folds.
        Pass ``None`` to get a different split on every call.
    target_name : str or None, default None
        Target column name. ``None`` falls back to the notebook-level
        ``target`` variable.

    Returns
    -------
    oofs : numpy.ndarray
        Out-of-fold predictions, one per row of ``train``.
    preds : numpy.ndarray
        Test predictions averaged over the folds, one per row of ``test``.
    """
    label = target if target_name is None else target_name
    target_col = train[label]

    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))

    # Seeded shuffle: without a random_state the folds (and every score
    # printed below) change on each run.
    folds = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)
    stratified_target = pd.qcut(target_col, 10, labels = False, duplicates = 'drop')

    # Loop-invariant column selections.
    X_all = train[features]
    X_test_raw = test[features]

    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n=========================Fold{index+1}============================')

        ## Training / validation sets for this fold
        X_trn, y_trn = X_all.iloc[trn_idx], target_col.iloc[trn_idx]
        X_val, y_val = X_all.iloc[val_idx], target_col.iloc[val_idx]

        ###### Scaling Data ######
        # Fit on the training rows only so no validation/test statistics leak in.
        scaler = StandardScaler()
        scaler.fit(X_trn)

        X_trn = scaler.transform(X_trn)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test_raw)

        ############ Fitting And Predicting #############
        regressor.fit(X_trn, y_trn)

        val_preds = regressor.predict(X_val)
        test_preds = regressor.predict(X_test)

        fold_score = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n RMSE score for Validation set is : {fold_score}')

        oofs[val_idx] = val_preds
        preds += test_preds / n_splits

    oofs_score = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\nRMSE score for oofs is {oofs_score}')

    return oofs, preds

---
# Ensembling

In [43]:
# Linear meta-learner over the `features` columns.
# `normalize=True` is intentionally not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, and it is redundant here because
# cross_val already standardises the features on every fold (ordinary least
# squares predictions do not depend on feature scaling).
lr = LinearRegression()

lr_oofs, lr_preds = cross_val(lr, train, test, features)



 RMSE score for Validation set is : 2405.050160302663


 RMSE score for Validation set is : 2417.7733254354785


 RMSE score for Validation set is : 2411.0560513485398


 RMSE score for Validation set is : 2396.9527964545587


 RMSE score for Validation set is : 2396.7570574116253


RMSE score for oofs is 2405.531677933634


In [44]:
# LightGBM meta-learner with library-default hyperparameters.
lgb = LGBMRegressor()

# Any tag other than 'cat' makes boosting_cross_val standard-scale the inputs.
lgb_oofs, lgb_preds = boosting_cross_val(
    regressor=lgb,
    train=train,
    test=test,
    features=features,
    name='lgb',
)



 RMSE score for Validation set is : 2406.821556218717


 RMSE score for Validation set is : 2417.9619358003347


 RMSE score for Validation set is : 2398.02200576747


 RMSE score for Validation set is : 2412.0027712346955


 RMSE score for Validation set is : 2398.941044065629


RMSE score for oofs is 2406.7619358811307


In [71]:
# XGBoost meta-learner; only the learning rate is set explicitly.
xgb = XGBRegressor(learning_rate=0.1)

# Any tag other than 'cat' makes boosting_cross_val standard-scale the inputs.
xgb_oofs, xgb_preds = boosting_cross_val(
    regressor=xgb,
    train=train,
    test=test,
    features=features,
    name='xgb',
)



 RMSE score for Validation set is : 2408.4025694053453


 RMSE score for Validation set is : 2411.460861766103


 RMSE score for Validation set is : 2402.7315119951786


 RMSE score for Validation set is : 2403.421695496742


 RMSE score for Validation set is : 2410.2805773550626


RMSE score for oofs is 2407.262074913859


In [50]:
%%time
# CatBoost meta-learner with library-default hyperparameters.
cat = CatBoostRegressor()

# The 'cat' tag tells boosting_cross_val to skip feature scaling.
cat_oofs, cat_preds = boosting_cross_val(
    regressor=cat,
    train=train,
    test=test,
    features=features,
    name='cat',
)



 RMSE score for Validation set is : 2403.746829211243


 RMSE score for Validation set is : 2425.8563458407507


 RMSE score for Validation set is : 2399.4299203702903


 RMSE score for Validation set is : 2410.1875173102003


 RMSE score for Validation set is : 2397.8009156670923


RMSE score for oofs is 2407.425810761386
Wall time: 1min 2s
