# XGBOOST

In [6]:
import sys

import numpy as np
import pandas as pd
import xgboost as xgb

In [7]:
# Load the prepared training and testing tables from the working directory.
train, test = (pd.read_csv(csv_name)
               for csv_name in ('train_final.csv', 'test_final.csv'))

In [8]:
# Separate the target column from the remaining columns.
y = train['Purchase']                        # Target column
X = train.drop(labels=['Purchase'], axis=1)  # Every column except the target

---

In [None]:
############################################################################
# Parameters Set 1  (note: bound to the name `param2`)
param2 = {'objective': 'reg:linear', 'booster': 'gbtree', 'silent': 1,
          'max_depth': 10, 'eta': 0.1, 'nthread': 4, 'subsample': 0.8,
          'colsample_bytree': 0.8, 'min_child_weight': 20,
          'max_delta_step': 0, 'gamma': 0}

num_boost_rounds_set1 = 690  # Number of boosting iterations listed with Set 1
############################################################################


############################################################################
# Parameters Set 2  (note: bound to the name `param1`)
param1 = {'objective': 'reg:linear', 'silent': 1, 'max_depth': 10,
          'eta': 0.03, 'subsample': 0.8, 'min_child_weight': 10,
          'seed': 0}

num_boost_rounds_set2 = 1100  # Number of boosting iterations listed with Set 2
############################################################################

# Both sets previously assigned the same name `num_boost_rounds`, so the
# value 690 was always overwritten by 1100 before anything could read it.
# The two counts now have their own names; `num_boost_rounds` keeps the value
# it effectively had (Set 2's), so code that reads it behaves as before.
num_boost_rounds = num_boost_rounds_set2

---

In [None]:
# ---------------------------------------------------------------------------
# DMatrix
#
# DMatrix is an internal data structure used by XGBoost. It is optimized for
# both memory efficiency and training speed, and is how the train and test
# data are handed to XGBoost below.
# ---------------------------------------------------------------------------

dtest = xgb.DMatrix(test.values)           # Testing set (it has no labels, so none are passed)
dtrain = xgb.DMatrix(X.values, label=y)    # Training set together with its labels

---

In [None]:
############################
# Function for XGBoost Model
############################

def XGBoost_Model(dtrain,  dtest,  num_boost_rounds,  param_dict, seed_no=0):
    """Train an XGBoost booster on ``dtrain`` and return its predictions for ``dtest``.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Training data handed to ``xgb.train``.
    dtest : xgb.DMatrix
        Data handed to the trained booster's ``predict``.
    num_boost_rounds : int
        Number of boosting iterations to run.
    param_dict : dict
        Booster parameters. The dictionary is copied, so the caller's object
        is left unmodified.
    seed_no : int, optional
        Value stored under the ``"seed"`` key for this run; it replaces any
        ``"seed"`` entry already present in ``param_dict``.

    Returns
    -------
    The value returned by ``Booster.predict(dtest)``.
    """
    # Work on a copy so that repeated calls with different seeds do not leave
    # the last seed behind in the caller's (shared) parameter dictionary.
    params = dict(param_dict)
    params["seed"] = seed_no

    # The keyword accepted by xgb.train is `num_boost_round` (singular);
    # the plural spelling is not a parameter of that function.
    regressor = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_rounds)

    return regressor.predict(dtest)

---

In [None]:
# Single XGBoost run: one model, one seed

test_preds = XGBoost_Model(
    dtrain=dtrain,
    dtest=dtest,
    num_boost_rounds=num_boost_rounds,
    param_dict=param1,
    seed_no=0,
)

---

In [None]:
# For running XGBoost n times with n seeds and taking mean predictions of each row

seeds = [1122, 2244, 3366, 4488, 5500]  # Random seed numbers (one model is trained per seed)

# One row per test row and one column per seed; every entry starts at zero
# and each run below fills in its own column.
# Eg.[ [0,0,0,0,0], [0,0,0,0,0], [0,0,0,0,0], ........ ]
seed_preds = np.zeros((len(test), len(seeds)))

# NOTE(review): this run pairs `param2` with whatever value `num_boost_rounds`
# was assigned last in the parameter cell -- confirm that is the intended
# round count for this parameter set.
for column, seed in enumerate(seeds):

    sys.stdout.write("\rXGB RUN:{}/{}".format(column + 1, len(seeds)))   # Progress on one line, eg. RUN:1/5
    sys.stdout.flush()                                                   # Show the progress text immediately

    seed_preds[:, column] = XGBoost_Model(dtrain, dtest, num_boost_rounds, param2, seed_no=seed)


test_preds = np.mean(seed_preds, axis=1)  # Mean prediction of each row across the seeds

---

In [None]:
# Submission file

# Build the frame with its three columns in a fixed order.
submit = pd.DataFrame(
    {'User_ID': ids_test, 'Product_ID': product_ids_test, 'Purchase': test_preds},
    columns=['User_ID', 'Product_ID', 'Purchase'],
)

In [None]:
# Replace every negative prediction with 12. The original note describes 12 as
# the minimum Purchase value in train (not verified in this cell).
# `.loc` is used because `.ix` was deprecated in pandas 0.20 and removed in 1.0;
# for a boolean row mask plus a column label the two select the same cells.
submit.loc[submit['Purchase'] < 0, 'Purchase'] = 12
submit.to_csv("final_solution-2.csv", index=False)  # Write the submission without the index column