In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
train = pd.read_csv(r"C:\Users\sunil\Projects\Analytics vidya\Black Friday Sales Prediction\train.csv")
test = pd.read_csv(r"C:\Users\sunil\Projects\Analytics vidya\Black Friday Sales Prediction\test.csv")
sample_sub = pd.read_csv(r"C:\Users\sunil\Projects\Analytics vidya\Black Friday Sales Prediction\sample_submission_V9Inaty.csv")

In [4]:
df = pd.concat([train, test], axis = 0).reset_index(drop=True)
df.shape

(783667, 12)

In [6]:
df

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969.0
...,...,...,...,...,...,...,...,...,...,...,...,...
783662,1006036,P00118942,F,26-35,15,B,4+,1,8,,,
783663,1006036,P00254642,F,26-35,15,B,4+,1,5,8.0,,
783664,1006036,P00031842,F,26-35,15,B,4+,1,1,5.0,12.0,
783665,1006037,P00124742,F,46-50,1,C,4+,0,10,16.0,,


In [4]:
df = df.fillna(0)
df.isnull().sum().sum()

0

In [5]:
df['Gender'].replace( { 'M' : 1, 'F' : 0}, inplace=True)
df['Age'].replace( {'0-17': 0, '18-25':1, '26-35':2, '36-45': 3, '46-50': 4, '51-55': 5, '55+': 6}, inplace=True)
df['Stay_In_Current_City_Years'].replace( {'0': 0, '1': 1, '2': 2, '3': 3, '4+': 4}, inplace=True)

In [6]:
le = LabelEncoder()
scaler = StandardScaler()
##### User_ID
df['User_ID'] = df['User_ID'] - 1000000
df['User_ID'] = le.fit_transform(df['User_ID'])

##### Product_ID
df['Product_ID'] = df['Product_ID'].str.replace('P00', '')
df['Product_ID'] = le.fit_transform(df['Product_ID'])

##### City Category
df['City_Category'] = le.fit_transform(df['City_Category'])

In [7]:
# Changing Datatype

df['User_ID'] = df['User_ID'].astype('int16')
df['Product_ID'] = df['Product_ID'].astype('int16')
df['Gender'] = df['Gender'].astype('uint8')
df['Age'] = df['Age'].astype('uint8')
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype('uint8')
df['Marital_Status'] = df['Marital_Status'].astype('uint8')
df['Product_Category_1'] = df['Product_Category_1'].astype('int8')
df['Product_Category_2'] = df['Product_Category_2'].astype('int16')
df['Product_Category_3'] = df['Product_Category_3'].astype('int16')
df['Occupation'] = df['Product_Category_3'].astype('int8')



df['Purchase'] = df["Purchase"].astype('int32')

In [8]:
train_proc, test_proc = df[:train.shape[0]], df[train.shape[0]:].reset_index(drop = True)

target = 'Purchase'

features = [col for col in train_proc.columns if col not in [target]]

In [9]:
trn, val = train_test_split(train_proc, test_size = 0.2, random_state = 1999)

##### Input for model
X_trn, X_val = trn[features], val[features]

##### Target column
y_trn, y_val = trn[target], val[target]

##### Features for test data that we will be predicting
X_test = test_proc[features]

# Cross Validation

In [10]:
def boosting_cross_val(regressor, train, test, features, name):
    N_splits = 5
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True)
    stratified_target = pd.qcut( train[target], 10, labels = False, duplicates = 'drop')
    
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n=========================Fold{index+1}============================')
        
        ####### Getting Train, Validation and Test sets.
        
        ## Training Set
        X_trn, y_trn = train[features].iloc[trn_idx], target_col.iloc[trn_idx]
        
        ## Validation Set
        X_val, y_val = train[features].iloc[val_idx], target_col.iloc[val_idx]
        
        ## Test Set
        X_test = test[features]
        
        if name != 'cat':
            ###### Scaling Data ######
            scaler = StandardScaler()
            _ = scaler.fit(X_trn)

            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
        
        
        ############ Fitting And Predicting #############
        _ = regressor.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
        
        ## Predicting
        val_preds = regressor.predict(X_val)
        test_preds = regressor.predict(X_test)
        
        fold_score = np.sqrt( mean_squared_error(y_val, val_preds))
        print(f'\n RMSE score for Validation set is : {fold_score}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    oofs_score = np.sqrt( mean_squared_error(target_col, oofs))
    print(f'\n\nRMSE score for oofs is {oofs_score}')
    
    return oofs, preds

---
# Feature Engineering

In [11]:
def helper(df, column_list):
    for col in column_list:
        train_proc[col] = df[: train.shape[0]][col].reset_index(drop = True)
        test_proc[col] = df[train_proc.shape[0]:][col].reset_index(drop = True)
        features = [col for col in train_proc.columns if col not in [target]]
    return features

### 1. Features from EDA

In [12]:
df = pd.concat([train, test], axis = 0).reset_index(drop=True)

In [13]:
# Does user belongs to working class or not
df['is_working'] = df['Age'].apply(lambda x: 1 if (x!='0-17' and x!='55+' and x!='51-55' and x!='46-50') else 0)

# Does particular product_cat_1 is popular or not
df['is_popular'] = df['Product_Category_1'].apply(lambda x: 1 if x in [1,5,8] else 0)

col_ls = ['is_working', 'is_popular', ]

features = helper(df, col_ls)

In [14]:
female_prod_ls = list(set(df[ df['Gender'] == 'F']['Product_ID'].to_list()))
male_prod_ls = list(set(df[ df['Gender'] == 'M']['Product_ID'].to_list()))

female_products = list(np.setdiff1d(female_prod_ls, male_prod_ls))
male_products = list(np.setdiff1d( male_prod_ls, female_prod_ls))

# Does only females are buying particluar product_id
df['is_female_product'] = df['Product_ID'].apply(lambda x: 1 if x in female_products else 0)

# Does only males are buying particluar product_id
df['is_male_product'] = df['Product_ID'].apply(lambda x: 1 if x in male_products else 0)

In [15]:
col_ls = ['is_female_product', 'is_male_product']

features = helper(df, col_ls)

---
### 2. Grouping Features

In [16]:
# Number of unique product id's per user
df['Users_unique_products'] = df.groupby('User_ID')['Product_ID'].transform('nunique')

# Number of unique users per product_id
df['Products_unique_users'] = df.groupby('Product_ID')['User_ID'].transform('nunique')

# Number of unique products per city
df['Citys_unique_products'] = df.groupby('City_Category')['Product_ID'].transform('nunique')

# Number of unique products per Age_group
df['Age_unique_products'] = df.groupby('Age')['Product_ID'].transform('nunique')

# Number of unique Users per Age_group
df['Age_unique_users'] = df.groupby('Age')['User_ID'].transform('nunique')

# Number of unique Products per Occupation
df['Occupation_unique_products'] = df.groupby('Occupation')['Product_ID'].transform('nunique')

# Number of unique Users per Occupation
df['Occupation_unique_users'] = df.groupby('Occupation')['User_ID'].transform('nunique')

In [17]:
df['Users_Mean_Purchase'] = df.groupby('User_ID')['Purchase'].transform('mean')
df['Users_Median_Purchase'] = df.groupby('User_ID')['Purchase'].transform('median')
#df['Products_Mean_Purchase'] = df.groupby('Product_ID')['Purchase'].transform('mean')
#df['Products_Median_Purchase'] = df.groupby('Product_ID')['Purchase'].transform('median')

df['Product_Category_1_Mean_Purchase'] = df.groupby('Product_Category_1')['Purchase'].transform('mean')
df['Product_Category_1_Median_Purchase'] = df.groupby('Product_Category_1')['Purchase'].transform('median')



In [18]:
col_ls = ['Users_unique_products', 'Products_unique_users', 'Citys_unique_products', 'Age_unique_products', 'Age_unique_users',
 'Occupation_unique_products', 'Occupation_unique_users','Users_Mean_Purchase', 'Users_Median_Purchase',
         'Product_Category_1_Mean_Purchase', 'Product_Category_1_Median_Purchase']

features = helper(df, col_ls)

In [19]:
df['Users_Total_Purchase'] = df.groupby('User_ID')['Purchase'].transform('sum')
#df['Products_Total_Purchase'] = df.groupby('Product_ID')['Purchase'].transform('sum')

df['Users_Minimum_Purchase'] = df.groupby('User_ID')['Purchase'].transform('min')
#df['Products_Minimum_Purchase'] = df.groupby('Product_ID')['Purchase'].transform('min')

df['Users_Maximum_Purchase'] = df.groupby('User_ID')['Purchase'].transform('max')
#df['Products_Maximum_Purchase'] = df.groupby('Product_ID')['Purchase'].transform('max')


In [20]:
col_ls = ['Users_Total_Purchase', 'Users_Minimum_Purchase', 
         'Users_Maximum_Purchase']

features = helper(df, col_ls)
#####

#### Imputing Products_Mean_Purchase and Products_Median_Purchase by Product_Category_1_Mean_Purchase and Product_Category_1_Median_Purchase respectively

In [22]:
#test_proc.loc[ test_proc['Products_Mean_Purchase'].isna(), 'Products_Mean_Purchase'] = test_proc[ test_proc['Products_Mean_Purchase'].isna()]['Product_Category_1_Mean_Purchase'].apply(lambda x: x)

#test_proc.loc[ test_proc['Products_Median_Purchase'].isna(), 'Products_Median_Purchase'] = test_proc[ test_proc['Products_Median_Purchase'].isna()]['Product_Category_1_Median_Purchase'].apply(lambda x: x)

In [23]:
#test_proc.loc[ test_proc['Products_Minimum_Purchase'].isna(), 'Products_Minimum_Purchase'] = test_proc[ test_proc['Products_Minimum_Purchase'].isna()]['Products_Total_Purchase'].apply(lambda x: x)

#test_proc.loc[ test_proc['Products_Maximum_Purchase'].isna(), 'Products_Maximum_Purchase'] = test_proc[ test_proc['Products_Maximum_Purchase'].isna()]['Products_Total_Purchase'].apply(lambda x: x)

---
# Changing Datatype

In [30]:
uint8_cols = ['is_working', 'is_popular', 'is_female_product', 'is_male_product', 'City_Category', ]

int_8_cols = ['Product_Category_2', 'Product_Category_3']

int16_cols = ['Users_unique_products', 'Products_unique_users', 'Citys_unique_products', 'Age_unique_products', 'Age_unique_users',
             'Occupation_unique_products', 'Occupation_unique_users', 'Users_Mean_Purchase', 'Users_Median_Purchase',
            'Product_Category_1_Mean_Purchase', 'Product_Category_1_Median_Purchase', 
             'Users_Total_Purchase', 'Users_Minimum_Purchase', 
         'Users_Maximum_Purchase']

In [31]:
train_proc[uint8_cols] = train_proc[uint8_cols].astype('uint8')
test_proc[uint8_cols] = test_proc[uint8_cols].astype('uint8')

In [32]:
train_proc[int_8_cols] = train_proc[int_8_cols].astype('int8')
test_proc[int_8_cols] = test_proc[int_8_cols].astype('int8')

In [33]:
train_proc[int16_cols] = train_proc[int16_cols].astype('int16')
test_proc[int16_cols] = test_proc[int16_cols].astype('int16')

---
## Splitting into train, val and test

In [34]:
traget = 'Purchase'

features = [col for col in train_proc.columns if col not in [target]]

In [35]:
trn, val = train_test_split(train_proc, test_size = 0.2, random_state = 1999)

##### Input for model
X_trn, X_val = trn[features], val[features]

##### Target column
y_trn, y_val = trn[target], val[target]

##### Features for test data that we will be predicting
X_test = test_proc[features]

### 1. LGBM

In [38]:
%%time
lgb = LGBMRegressor(random_state=1999)
lgb.fit(X_trn, y_trn)
preds = lgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'RMSE score is: {rmse}')


RMSE score is: 2557.4681108774544
Wall time: 3.22 s


#### Hyperparameter Tuning

In [39]:
import optuna
from optuna.samplers import TPESampler

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 30)
    n_estimators = trial.suggest_int("n_estimators", 1, 500)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 1)
    num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 0.9),
    min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_int("reg_alpha", 1, 10)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 10)
    model = LGBMRegressor(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth,
        num_leaves=num_leaves,
        colsample_bytree = colsample_bytree,
        min_child_samples=min_child_samples,
        random_state=1999
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)

lgb_params = study.best_params
lgb_params['random_state'] = 0
lgb = LGBMRegressor(**lgb_params)
lgb.fit(X_trn, y_trn)
preds = lgb.predict(X_val)
print('Optimized LightGBM roc_auc_score', np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-08 22:15:23,869] A new study created in memory with name: no-name-b5b05bc8-405f-45ff-aa73-fc287e8f4231
[I 2020-10-08 22:15:29,290] Trial 0 finished with value: 2587.8838894652417 and parameters: {'max_depth': 14, 'n_estimators': 48, 'learning_rate': 0.7436704297351775, 'num_leaves': 4933, 'colsample_bytree': 0.7863564940982054, 'min_child_samples': 106, 'reg_alpha': 10, 'reg_lambda': 4}. Best is trial 0 with value: 2587.8838894652417.
[I 2020-10-08 22:16:28,195] Trial 1 finished with value: 2786.042032412337 and parameters: {'max_depth': 23, 'n_estimators': 243, 'learning_rate': 0.49382849013642327, 'num_leaves': 3470, 'colsample_bytree': 0.40675321506062223, 'min_child_samples': 42, 'reg_alpha': 8, 'reg_lambda': 9}. Best is trial 0 with value: 2587.8838894652417.
[I 2020-10-08 22:16:44,267] Trial 2 finished with value: 2774.001957023837 and parameters: {'max_depth': 19, 'n_estimators': 166, 'learning_rate': 0.8524708871836397, 'num_leaves': 2165, 'colsample_bytree': 0.39459

[I 2020-10-08 22:27:22,842] Trial 48 finished with value: 2427.0009325318297 and parameters: {'max_depth': 24, 'n_estimators': 185, 'learning_rate': 0.10119675514520346, 'num_leaves': 2501, 'colsample_bytree': 0.4259991748714801, 'min_child_samples': 137, 'reg_alpha': 7, 'reg_lambda': 4}. Best is trial 38 with value: 2422.378370778711.
[I 2020-10-08 22:27:44,359] Trial 49 finished with value: 2677.106171022139 and parameters: {'max_depth': 26, 'n_estimators': 182, 'learning_rate': 0.750886614080417, 'num_leaves': 2505, 'colsample_bytree': 0.19168932339557335, 'min_child_samples': 147, 'reg_alpha': 7, 'reg_lambda': 2}. Best is trial 38 with value: 2422.378370778711.


Optimized LightGBM roc_auc_score 2422.405397386889


In [40]:
%%time
lgb = lgb
lgb.fit(X_trn, y_trn)
preds = lgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'RMSE score is: {rmse}')

RMSE score is: 2422.405397386889
Wall time: 14.9 s


In [41]:
%%time
lgb_fe_oofs, lgb_fe_preds = boosting_cross_val(lgb, train_proc, test_proc, features, 'lgb')
sample_sub['Purchase'] = lgb_fe_preds
sample_sub.to_csv(r"D:\Data Science\Projects\Analytics vidya\Black Friday Sales Prediction\final\LGBM_Tuned.csv", index = False)



 RMSE score for Validation set is : 2429.030850296995


 RMSE score for Validation set is : 2431.0283203052


 RMSE score for Validation set is : 2420.0737577032637


 RMSE score for Validation set is : 2420.836016872785


 RMSE score for Validation set is : 2432.1053030762155


RMSE score for oofs is 2426.620275074255
Wall time: 1min 24s


### 2. XGBOOST

In [42]:
%%time
xgb = XGBRegressor(random_state=1999)
xgb.fit(X_trn, y_trn)
preds = xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'RMSE score is: {rmse}')


RMSE score is: 2500.1502293901635
Wall time: 22.5 s


#### Hyperparameter Tuning

In [43]:
import optuna
from optuna.samplers import TPESampler

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 30)
    n_estimators = trial.suggest_int("n_estimators", 1, 500)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 1)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 0.9)
    #num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_int("reg_alpha", 1, 10)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 10)
    model = XGBRegressor(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth,
        colsample_bytree = colsample_bytree,
        #num_leaves=num_leaves, 
        #min_child_samples=min_child_samples,
        random_state=1999,
        n_jobs = -1
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=30)

xgb_params = study.best_params
xgb_params['random_state'] = 0
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_trn, y_trn)
preds = xgb.predict(X_val)
print('Optimized XGBR RMSE', np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-08 22:30:00,772] A new study created in memory with name: no-name-500674f5-aa45-430c-a308-125b581f03e9
[I 2020-10-08 22:30:22,928] Trial 0 finished with value: 2813.0052598074003 and parameters: {'max_depth': 14, 'n_estimators': 48, 'learning_rate': 0.7436704297351775, 'colsample_bytree': 0.5822107008573151, 'reg_alpha': 4, 'reg_lambda': 8}. Best is trial 0 with value: 2813.0052598074003.
[I 2020-10-08 22:31:28,487] Trial 1 finished with value: 2752.299025235469 and parameters: {'max_depth': 11, 'n_estimators': 212, 'learning_rate': 0.6813047017599905, 'colsample_bytree': 0.45006976901015405, 'reg_alpha': 7, 'reg_lambda': 9}. Best is trial 1 with value: 2752.299025235469.
[I 2020-10-08 22:38:28,542] Trial 2 finished with value: 2763.4347519476623 and parameters: {'max_depth': 26, 'n_estimators': 397, 'learning_rate': 0.4450973669431999, 'colsample_bytree': 0.7333800304661316, 'reg_alpha': 8, 'reg_lambda': 9}. Best is trial 1 with value: 2752.299025235469.
[I 2020-10-08 22:39

Optimized XGBR RMSE 2442.506719650969


In [44]:
%%time

xgb = xgb
xgb.fit(X_trn, y_trn)
preds = xgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'RMSE score is: {rmse}')

RMSE score is: 2442.506719650969
Wall time: 40.8 s


In [45]:
# Cross val

xgb_fe_oofs, xgb_fe_preds = boosting_cross_val(xgb, train_proc, test_proc, features, 'xgb')
sample_sub['Purchase'] = xgb_fe_preds
sample_sub.to_csv(r"D:\Data Science\Projects\Analytics vidya\Black Friday Sales Prediction\final\XGB_FE_Boosting.csv", index = False)



 RMSE score for Validation set is : 2432.770566062838


 RMSE score for Validation set is : 2451.748402985729


 RMSE score for Validation set is : 2455.147101330811


 RMSE score for Validation set is : 2450.335227558207


 RMSE score for Validation set is : 2449.4270833351206


RMSE score for oofs is 2447.898107978903


### 3. Catboost

In [46]:
%%time
params = {'max_depth': 16, 'n_estimators': 822, 'learning_rate': 0.34579951469275394, 'rsm': 0.255416100953266, 'reg_lambda': 860}
cat = CatBoostRegressor(**params)
cat.fit(X_trn, y_trn)
preds = cat.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'RMSE score is: {rmse}')

0:	learn: 4094.8099743	total: 165ms	remaining: 2m 15s
1:	learn: 3538.6939085	total: 332ms	remaining: 2m 16s
2:	learn: 3186.5311065	total: 649ms	remaining: 2m 57s
3:	learn: 2996.2617617	total: 852ms	remaining: 2m 54s
4:	learn: 2897.4472686	total: 937ms	remaining: 2m 33s
5:	learn: 2839.1599806	total: 2.1s	remaining: 4m 45s
6:	learn: 2793.1522435	total: 2.18s	remaining: 4m 13s
7:	learn: 2769.9820793	total: 3.35s	remaining: 5m 41s
8:	learn: 2748.3652508	total: 4.16s	remaining: 6m 15s
9:	learn: 2731.8352068	total: 4.64s	remaining: 6m 16s
10:	learn: 2716.0784362	total: 5.73s	remaining: 7m 2s
11:	learn: 2708.8969950	total: 6.89s	remaining: 7m 45s
12:	learn: 2701.3592483	total: 7.76s	remaining: 8m 2s
13:	learn: 2692.5575286	total: 9.04s	remaining: 8m 41s
14:	learn: 2682.7152670	total: 10.2s	remaining: 9m 10s
15:	learn: 2676.8902614	total: 11.5s	remaining: 9m 38s
16:	learn: 2672.9233313	total: 12.6s	remaining: 9m 55s
17:	learn: 2663.6799835	total: 13.7s	remaining: 10m 13s
18:	learn: 2660.421351

146:	learn: 2459.4812591	total: 2m 42s	remaining: 12m 24s
147:	learn: 2457.3814826	total: 2m 43s	remaining: 12m 23s
148:	learn: 2456.1697237	total: 2m 44s	remaining: 12m 23s
149:	learn: 2455.1662308	total: 2m 45s	remaining: 12m 21s
150:	learn: 2453.9262396	total: 2m 46s	remaining: 12m 21s
151:	learn: 2453.8122699	total: 2m 47s	remaining: 12m 19s
152:	learn: 2452.3663679	total: 2m 48s	remaining: 12m 18s
153:	learn: 2451.4374482	total: 2m 49s	remaining: 12m 16s
154:	learn: 2450.4200161	total: 2m 51s	remaining: 12m 16s
155:	learn: 2450.1955948	total: 2m 52s	remaining: 12m 15s
156:	learn: 2448.9537586	total: 2m 53s	remaining: 12m 15s
157:	learn: 2448.3199587	total: 2m 54s	remaining: 12m 14s
158:	learn: 2447.2556464	total: 2m 55s	remaining: 12m 13s
159:	learn: 2446.6990896	total: 2m 57s	remaining: 12m 12s
160:	learn: 2445.8045191	total: 2m 58s	remaining: 12m 11s
161:	learn: 2445.0789708	total: 2m 59s	remaining: 12m 10s
162:	learn: 2444.3470619	total: 3m	remaining: 12m 10s
163:	learn: 2443.2

289:	learn: 2362.6384938	total: 5m 32s	remaining: 10m 9s
290:	learn: 2362.5706611	total: 5m 33s	remaining: 10m 8s
291:	learn: 2362.0175621	total: 5m 34s	remaining: 10m 7s
292:	learn: 2361.4900960	total: 5m 35s	remaining: 10m 6s
293:	learn: 2360.7399913	total: 5m 37s	remaining: 10m 5s
294:	learn: 2360.2881104	total: 5m 38s	remaining: 10m 4s
295:	learn: 2359.6027217	total: 5m 39s	remaining: 10m 3s
296:	learn: 2358.8197020	total: 5m 40s	remaining: 10m 2s
297:	learn: 2358.2494491	total: 5m 42s	remaining: 10m 1s
298:	learn: 2357.8534298	total: 5m 43s	remaining: 10m
299:	learn: 2357.4472170	total: 5m 44s	remaining: 9m 59s
300:	learn: 2357.0391808	total: 5m 45s	remaining: 9m 58s
301:	learn: 2356.5192976	total: 5m 47s	remaining: 9m 57s
302:	learn: 2356.1984044	total: 5m 48s	remaining: 9m 56s
303:	learn: 2355.4630692	total: 5m 49s	remaining: 9m 55s
304:	learn: 2355.0162387	total: 5m 50s	remaining: 9m 54s
305:	learn: 2354.4020731	total: 5m 52s	remaining: 9m 53s
306:	learn: 2353.8845649	total: 5m

434:	learn: 2294.2085753	total: 8m 24s	remaining: 7m 28s
435:	learn: 2293.5515980	total: 8m 25s	remaining: 7m 27s
436:	learn: 2292.9660494	total: 8m 26s	remaining: 7m 26s
437:	learn: 2292.3916880	total: 8m 28s	remaining: 7m 25s
438:	learn: 2291.8143389	total: 8m 29s	remaining: 7m 24s
439:	learn: 2291.7426237	total: 8m 30s	remaining: 7m 23s
440:	learn: 2291.4532283	total: 8m 31s	remaining: 7m 22s
441:	learn: 2290.8411969	total: 8m 32s	remaining: 7m 20s
442:	learn: 2290.1722783	total: 8m 34s	remaining: 7m 19s
443:	learn: 2289.7510817	total: 8m 35s	remaining: 7m 18s
444:	learn: 2289.4187045	total: 8m 36s	remaining: 7m 17s
445:	learn: 2288.9473627	total: 8m 37s	remaining: 7m 16s
446:	learn: 2288.5895551	total: 8m 38s	remaining: 7m 15s
447:	learn: 2288.1609791	total: 8m 39s	remaining: 7m 13s
448:	learn: 2287.7648246	total: 8m 41s	remaining: 7m 12s
449:	learn: 2287.1996434	total: 8m 42s	remaining: 7m 11s
450:	learn: 2286.6900022	total: 8m 43s	remaining: 7m 10s
451:	learn: 2286.2847934	total:

578:	learn: 2229.8939961	total: 11m 13s	remaining: 4m 42s
579:	learn: 2229.5720827	total: 11m 14s	remaining: 4m 41s
580:	learn: 2229.0496808	total: 11m 15s	remaining: 4m 40s
581:	learn: 2228.6510092	total: 11m 16s	remaining: 4m 39s
582:	learn: 2228.5374875	total: 11m 17s	remaining: 4m 37s
583:	learn: 2228.2289798	total: 11m 19s	remaining: 4m 36s
584:	learn: 2227.7755091	total: 11m 20s	remaining: 4m 35s
585:	learn: 2227.2690354	total: 11m 21s	remaining: 4m 34s
586:	learn: 2226.7428341	total: 11m 23s	remaining: 4m 33s
587:	learn: 2226.2966845	total: 11m 24s	remaining: 4m 32s
588:	learn: 2225.8256335	total: 11m 25s	remaining: 4m 31s
589:	learn: 2225.2933895	total: 11m 26s	remaining: 4m 29s
590:	learn: 2224.7504574	total: 11m 27s	remaining: 4m 28s
591:	learn: 2224.1859834	total: 11m 29s	remaining: 4m 27s
592:	learn: 2223.6248809	total: 11m 30s	remaining: 4m 26s
593:	learn: 2223.1428642	total: 11m 31s	remaining: 4m 25s
594:	learn: 2222.7574627	total: 11m 32s	remaining: 4m 24s
595:	learn: 22

721:	learn: 2172.9566788	total: 13m 58s	remaining: 1m 56s
722:	learn: 2172.9285620	total: 13m 59s	remaining: 1m 54s
723:	learn: 2172.5666753	total: 14m	remaining: 1m 53s
724:	learn: 2172.3451173	total: 14m 1s	remaining: 1m 52s
725:	learn: 2171.9062360	total: 14m 3s	remaining: 1m 51s
726:	learn: 2171.4290182	total: 14m 4s	remaining: 1m 50s
727:	learn: 2171.0999134	total: 14m 5s	remaining: 1m 49s
728:	learn: 2170.7217093	total: 14m 6s	remaining: 1m 48s
729:	learn: 2170.3427057	total: 14m 8s	remaining: 1m 46s
730:	learn: 2170.0527616	total: 14m 9s	remaining: 1m 45s
731:	learn: 2169.5896925	total: 14m 10s	remaining: 1m 44s
732:	learn: 2169.4469558	total: 14m 12s	remaining: 1m 43s
733:	learn: 2168.9042861	total: 14m 13s	remaining: 1m 42s
734:	learn: 2168.7494548	total: 14m 14s	remaining: 1m 41s
735:	learn: 2168.2443640	total: 14m 15s	remaining: 1m 40s
736:	learn: 2167.9006309	total: 14m 17s	remaining: 1m 38s
737:	learn: 2167.4646659	total: 14m 18s	remaining: 1m 37s
738:	learn: 2167.0313123	

#### Hyperparameter Tuning

In [47]:
import optuna
from optuna.samplers import TPESampler

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 1, 16)
    n_estimators = trial.suggest_int("n_estimators", 500, 1500)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 1)
    rsm = trial.suggest_uniform('rsm', 0.1, 0.99)
    #num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 1000)
    model = CatBoostRegressor(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth,
        rsm = rsm,
        reg_lambda = reg_lambda,
        #num_leaves=num_leaves, 
        #min_child_samples=min_child_samples,
        random_state=0,
        verbose = False
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=20)

cat_params = study.best_params
cat_params['random_state'] = 0
cat = CatBoostRegressor(**cat_params, verbose = False)
cat.fit(X_trn, y_trn)
preds = cat.predict(X_val)
print('Optimized Catboost RMSE', np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-08 23:30:57,621] A new study created in memory with name: no-name-b9a14af7-9a2f-4ef7-be44-c49e593fda9a
[I 2020-10-08 23:35:39,780] Trial 0 finished with value: 2464.109824128356 and parameters: {'max_depth': 13, 'n_estimators': 1059, 'learning_rate': 0.7436704297351775, 'rsm': 0.636459404703763, 'reg_lambda': 708}. Best is trial 0 with value: 2464.109824128356.
[I 2020-10-08 23:36:09,793] Trial 1 finished with value: 2460.4662309121377 and parameters: {'max_depth': 8, 'n_estimators': 509, 'learning_rate': 0.6612073271073751, 'rsm': 0.44209971949050286, 'reg_lambda': 600}. Best is trial 1 with value: 2460.4662309121377.
[I 2020-10-08 23:36:59,288] Trial 2 finished with value: 2466.9949993602977 and parameters: {'max_depth': 7, 'n_estimators': 972, 'learning_rate': 0.9672964844509263, 'rsm': 0.4412629517549421, 'reg_lambda': 487}. Best is trial 1 with value: 2460.4662309121377.
[I 2020-10-08 23:37:33,918] Trial 3 finished with value: 2456.811854562535 and parameters: {'max_dep

KeyboardInterrupt: 

In [48]:
%%time
cat_fe_oofs, cat_fe_preds = boosting_cross_val(cat, train_proc, test_proc, features, 'cat')
sample_sub['Purchase'] = cat_fe_preds
sample_sub.to_csv(r"D:\Data Science\Projects\Analytics vidya\Black Friday Sales Prediction\final\CAT_FE_Boosting.csv", index = False)




KeyboardInterrupt: 

In [52]:
def cross_val(regressor, train, test, features):
    N_splits = 5
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True)
    stratified_target = pd.qcut( train[target], 10, labels = False, duplicates = 'drop')
    
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n=========================Fold{index+1}============================')
        
        ####### Getting Train, Validation and Test sets.
        
        ## Training Set
        X_trn, y_trn = train[features].iloc[trn_idx], target_col.iloc[trn_idx]
        
        ## Validation Set
        X_val, y_val = train[features].iloc[val_idx], target_col.iloc[val_idx]
        
        ## Test Set
        X_test = test[features]
        ###### Scaling Data ######
        scaler = StandardScaler()
        _ = scaler.fit(X_trn)

        X_trn = scaler.transform(X_trn)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
        
        
        ############ Fitting And Predicting #############
        _ = regressor.fit(X_trn, y_trn)
        
        ## Predicting
        val_preds = regressor.predict(X_val)
        test_preds = regressor.predict(X_test)
        
        fold_score = np.sqrt( mean_squared_error(y_val, val_preds))
        print(f'\n RMSE score for Validation set is : {fold_score}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    oofs_score = np.sqrt( mean_squared_error(target_col, oofs))
    print(f'\n\nRMSE score for oofs is {oofs_score}')
    
    return oofs, preds

In [53]:
train_new = train_proc[[target, 'User_ID']].copy()
test_new = test_proc[[target, 'User_ID']].copy()

train_new['lgb'] = lgb_fe_oofs
test_new['lgb'] = lgb_fe_preds

#train_new['cb'] = cat_fe_oofs
#test_new['cb'] = cat_fe_preds

train_new['xgb'] = xgb_fe_oofs
test_new['xgb'] = xgb_fe_preds

ens_features = [c for c in train_new.columns if c not in [target, 'User_ID']]

In [54]:
ens_features = ['lgb', 'xgb']

In [55]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()

ens_oofs, ens_preds = cross_val(clf, train_new, test_new, ens_features)



 RMSE score for Validation set is : 2420.8209636112333


 RMSE score for Validation set is : 2412.40119409859


 RMSE score for Validation set is : 2408.725162723656


 RMSE score for Validation set is : 2425.385914767702


 RMSE score for Validation set is : 2414.8601005705746


RMSE score for oofs is 2416.446001290232


In [56]:
sample_sub['Purchase'] = ens_preds
sample_sub.to_csv(r"D:\Data Science\Projects\Analytics vidya\Black Friday Sales Prediction\Ensemble.csv", index = False)

In [57]:
from sklearn.linear_model import LinearRegression
clf = LGBMRegressor()

ens_oofs, ens_preds = boosting_cross_val(clf, train_new, test_new, ens_features, 'lgb')



 RMSE score for Validation set is : 2416.2809061797616


 RMSE score for Validation set is : 2418.996482301684


 RMSE score for Validation set is : 2421.88779049467


 RMSE score for Validation set is : 2414.9731614071334


 RMSE score for Validation set is : 2410.956560538801


RMSE score for oofs is 2416.6218196826585
