In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used in later cells.
# Consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the training table, using the "Datetime" column as the row index.
df = pd.read_csv("train_v3.csv", index_col="Datetime")

In [5]:
# List the columns of the loaded frame.
df.columns

Index(['Category_3', 'Category_1', 'Price', 'Number_Of_Sales', 'DayOfMonth',
       'DayCount', 'cat_2_1.0', 'cat_2_2.0', 'cat_2_3.0', 'cat_2_4.0',
       'cat_2_5.0', 'cat_2_other', 'week_0', 'week_1', 'week_2', 'week_3',
       'week_4', 'week_5', 'week_6', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'is_weekend', 'quarter_1',
       'quarter_2', 'quarter_3', 'quarter_4'],
      dtype='object')

In [6]:
# Keep only the rows that sit below both cut-offs:
# Number_Of_Sales under 1255 and Price under 5.0.
df = df[(df["Number_Of_Sales"] < 1255) & (df["Price"] < 5.0)]

In [7]:
# Feature matrix: every column of df except 'Price'.
X = df.drop(['Price'], axis=1)

In [8]:
# List the feature columns that remain after dropping 'Price'.
X.columns

Index(['Category_3', 'Category_1', 'Number_Of_Sales', 'DayOfMonth', 'DayCount',
       'cat_2_1.0', 'cat_2_2.0', 'cat_2_3.0', 'cat_2_4.0', 'cat_2_5.0',
       'cat_2_other', 'week_0', 'week_1', 'week_2', 'week_3', 'week_4',
       'week_5', 'week_6', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12', 'is_weekend', 'quarter_1', 'quarter_2',
       'quarter_3', 'quarter_4'],
      dtype='object')

In [9]:
# Standardise "Category_1".
# StandardScaler works on 2-D input, so the column is selected with a list
# (a one-column DataFrame) instead of as a 1-D Series, then flattened back.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down — move the fit after the split if that matters here.
scaler = StandardScaler()
category_1_scaled = scaler.fit_transform(df[["Category_1"]]).ravel()
df = df.assign(Category_1=category_1_scaled)
# X was derived from df in an earlier cell and holds its own, still unscaled,
# copy of the column; update it too so the scaling actually reaches the model.
X = X.assign(Category_1=category_1_scaled)



In [10]:
# The two candidate regression targets, taken from the filtered frame.
y_sales, y_price = df["Number_Of_Sales"], df["Price"]

In [11]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
# The fallback keeps the notebook importable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error as mse



In [12]:
def rmse(act_y, pred_y):
    """Return the root of the mean squared error between actual and predicted values."""
    mean_sq_err = mse(act_y, pred_y)
    return np.sqrt(mean_sq_err)

In [13]:
import xgboost as xgb

# Hyperopt

In [14]:
# Random seed for reproducible runs.
SEED = 314159265
# Presumably the fraction of rows to hold out for validation — confirm against
# the train_test_split call.
VALID_SIZE = 0.2
# NOTE(review): 'outcome' is not among the columns printed by df.columns above;
# this looks like a leftover constant — confirm before relying on it.
TARGET = 'outcome'

In [15]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [16]:
def score(params):
    """Hyperopt objective: train one XGBoost model and report its hold-out RMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` splits.

    Parameters
    ----------
    params : dict
        XGBoost parameters plus ``'n_estimators'``, which is used as the
        number of boosting rounds. The dict is not modified.

    Returns
    -------
    dict
        ``{'loss': <rmse on X_test>, 'status': STATUS_OK}``.
    """
    print("Training with params: ")
    print(params)
    # Work on a copy so the dict handed in by the caller stays intact.
    booster_params = dict(params)
    num_round = int(booster_params.pop('n_estimators'))
    if 'max_depth' in booster_params:
        # Make sure the tree depth reaches XGBoost as a plain Python int.
        booster_params['max_depth'] = int(booster_params['max_depth'])
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test)
    model = xgb.train(booster_params, dtrain, num_round)

    predictions = model.predict(dvalid)
    # Named `loss` so the local does not shadow this function's own name.
    loss = rmse(y_test, predictions)
    # TODO: Add the importance for the selected features
    print("Score {0}\n".format(loss))
    return {'loss': loss, 'status': STATUS_OK}

def optimize(trials, max_evals=20):
    """Search the XGBoost hyper-parameter space with hyperopt's TPE algorithm.

    Parameters
    ----------
    trials : hyperopt.Trials
        Collects the result of every evaluation.
    max_evals : int, optional
        Number of parameter sets to evaluate (default 20).

    Returns
    -------
    dict
        The best parameter set, with every entry resolved to its actual value.
    """
    # Local import keeps this cell self-contained; hyperopt is already a
    # dependency of the notebook.
    from hyperopt import space_eval

    space = {
             'n_estimators' : hp.quniform('n_estimators', 400, 1000, 50),
             'eta' : hp.quniform('eta', 0.05, 0.3, 0.05),
             'max_depth' : hp.choice('max_depth', np.arange(3, 14, dtype=int)),
             'min_child_weight' : hp.quniform('min_child_weight', 2, 6, 1),
             'subsample' : hp.quniform('subsample', 0.6, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 0.8, 0.1),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.1),
             'eval_metric': 'rmse',
             'objective': 'reg:linear',
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    # fmin reports hp.choice parameters as the *index* of the selected option
    # (so 'max_depth' would come back as 0..10 instead of 3..13). space_eval
    # maps the assignment back onto the space to recover the real values.
    best_params = space_eval(space, best)
    print("\n\n")
    print("Best Model is: ")
    print(best_params)
    return best_params

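## Sales_Prediction

NOTE: the split in the next cell uses `y_price` as the target, and `X` still contains `Number_Of_Sales` as a feature — confirm whether this section is meant to model price rather than sales.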

In [None]:
# Seeded split so that Restart & Run All reproduces the same train/test rows;
# the hold-out fraction and seed come from the VALID_SIZE / SEED constants.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_price, test_size=VALID_SIZE, random_state=SEED
)

In [None]:
# Fresh Trials object to record each evaluation of the search.
trials = Trials()

# Run the hyper-parameter search; best_model holds the parameter set that
# optimize() returns (not a fitted model).
best_model = optimize(trials)

Training with params: 
{'n_estimators': 600.0, 'min_child_weight': 3.0, 'subsample': 0.65, 'silent': 1, 'colsample_bytree': 0.8, 'objective': 'reg:linear', 'max_depth': 10, 'gamma': 0.7000000000000001, 'eta': 0.30000000000000004, 'eval_metric': 'rmse'}
Score 0.41865968849570767

Training with params: 
{'n_estimators': 800.0, 'min_child_weight': 5.0, 'subsample': 0.9, 'silent': 1, 'colsample_bytree': 0.7000000000000001, 'objective': 'reg:linear', 'max_depth': 4, 'gamma': 0.7000000000000001, 'eta': 0.25, 'eval_metric': 'rmse'}
Score 0.4533754536026649

Training with params: 
{'n_estimators': 450.0, 'min_child_weight': 4.0, 'subsample': 0.8500000000000001, 'silent': 1, 'colsample_bytree': 0.9, 'objective': 'reg:linear', 'max_depth': 5, 'gamma': 0.7000000000000001, 'eta': 0.30000000000000004, 'eval_metric': 'rmse'}
Score 0.4265006239143493

Training with params: 
{'n_estimators': 450.0, 'min_child_weight': 5.0, 'subsample': 0.7000000000000001, 'silent': 1, 'colsample_bytree': 0.9, 'objecti

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, params=None):
    """Train an XGBoost regression booster and return it.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, passed to ``xgb.DMatrix``.
    test_X, test_y
        Accepted for call compatibility; not used during training.
    params : dict, optional
        Overrides for the default settings below. ``'n_estimators'`` is taken
        out of the settings and used as the number of boosting rounds.
        The dict passed in is not modified.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    settings = {
        'subsample': 0.75,
        'colsample_bytree': 0.9,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'gamma': 0.7000000000000001,
        'n_estimators': 700.0,
        'max_depth': 9,
        'eta': 0.25,
        'min_child_weight': 4.0
    }
    if params:
        settings.update(params)
    # n_estimators is a round count for xgb.train, not a booster parameter.
    num_rounds = int(settings.pop('n_estimators'))

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    return xgb.train(settings, xgtrain, num_rounds)

In [None]:
# Fit the final booster on the training split.
# NOTE(review): y_train comes from the split of y_price, although the result is
# named sales_model — confirm the intended target.
sales_model = runXGB(X_train, y_train, X_test)

In [None]:
# Evaluate the trained booster on the hold-out split.
y_sales_pred = sales_model.predict(xgb.DMatrix(X_test))
# Stored under its own name: assigning the result to `score` would overwrite
# the `score` objective function and break any later call to optimize().
holdout_rmse = rmse(y_test, y_sales_pred)
print(holdout_rmse)

In [None]:
# Persist the trained booster, and also write a dump of it to a .txt file.
sales_model.save_model("sales_model_train_v3.model")
sales_model.dump_model('raw_sales_model_train_v3.txt')

In [None]:
from xgboost import plot_importance
# Plot the feature importances of the trained booster.
plot_importance(sales_model)

In [None]:
# trials = Trials()

# best_model = optimize(trials)

In [None]:
# np.save('best_parameters_sales.npy', best_model)

In [None]:
#best params
# {'gamma': 0.55, 'colsample_bytree': 0.65, 'silent': 1, 'max_depth': 7, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'n_estimators': 730.0, 'subsample': 0.8, 'eta': 0.225, 'min_child_weight': 3.0}