In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [8]:
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used in the cells below.
warnings.filterwarnings("ignore")

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the training table from disk; the "Datetime" column becomes the row index.
df = pd.read_csv(
    "train_v1.csv",
    index_col="Datetime",
)

In [5]:
# Display the column labels of the loaded frame.
df.columns

Index(['Category_1', 'Price', 'Number_Of_Sales', 'DayOfMonth', 'DayCount',
       'cat_3_0', 'cat_3_1', 'cat_2_1.0', 'cat_2_2.0', 'cat_2_3.0',
       'cat_2_4.0', 'cat_2_5.0', 'cat_2_other', 'week_0', 'week_1', 'week_2',
       'week_3', 'week_4', 'week_5', 'week_6', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'is_weekend'],
      dtype='object')

In [6]:
# Feature matrix: every column except the two that are later used as targets.
X = df.drop(labels=['Price', 'Number_Of_Sales'], axis=1)

In [18]:
# Display the feature column labels (the frame without 'Price' and 'Number_Of_Sales').
X.columns

Index(['Category_1', 'DayOfMonth', 'DayCount', 'cat_3_0', 'cat_3_1',
       'cat_2_1.0', 'cat_2_2.0', 'cat_2_3.0', 'cat_2_4.0', 'cat_2_5.0',
       'cat_2_other', 'week_0', 'week_1', 'week_2', 'week_3', 'week_4',
       'week_5', 'week_6', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12', 'is_weekend'],
      dtype='object')

In [7]:
# Standardize Category_1 (zero mean, unit variance).
# StandardScaler expects 2-D input, so the column is selected as a one-column
# frame (double brackets) instead of a 1-D Series, then flattened for assignment.
scaler = StandardScaler()
category_1_scaled = scaler.fit_transform(df[["Category_1"]]).ravel()
df["Category_1"] = category_1_scaled
# X was derived from df before this cell ran, so it holds its own copy of the
# column; write the scaled values there too, otherwise the model never sees them.
X["Category_1"] = category_1_scaled
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# further down — confirm that is acceptable for this experiment.



In [9]:
# Regression targets, one Series per quantity.
y_sales, y_price = df["Number_Of_Sales"], df["Price"]

In [10]:
try:
    # Current location of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module path.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [11]:
def rmse(act_y, pred_y):
    """Return the root of the mean squared error between two value sequences.

    Args:
        act_y: observed target values.
        pred_y: predicted values, aligned with ``act_y``.

    Returns:
        ``np.sqrt`` applied to ``mse(act_y, pred_y)``.
    """
    mean_sq_err = mse(act_y, pred_y)
    return np.sqrt(mean_sq_err)

In [12]:
import xgboost as xgb

# Hyperopt

In [13]:
# Shared settings for the tuning experiment.
# NOTE(review): confirm where each constant is consumed; 'outcome' does not
# appear among the column names displayed for the loaded frame.
SEED = 314159265      # random seed value
VALID_SIZE = 0.20     # fraction of rows intended for validation
TARGET = "outcome"    # target label name

In [14]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [15]:
def score(params):
    """Hyperopt objective: train an XGBoost booster and report its hold-out RMSE.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        params: dict of booster parameters plus ``'n_estimators'``, which is
            used as the number of boosting rounds. The dict is not modified.

    Returns:
        dict with ``'loss'`` (RMSE on ``X_test`` / ``y_test``) and
        ``'status'`` (``STATUS_OK``).
    """
    print("Training with params: ")
    print(params)
    # Work on a copy so the caller's dict keeps 'n_estimators'; deleting the key
    # in place would make a second call with the same dict fail with KeyError.
    booster_params = dict(params)
    # The search space yields floats (e.g. 700.0); xgb.train needs an int.
    num_round = int(booster_params.pop('n_estimators'))
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test)
    model = xgb.train(booster_params, dtrain, num_round)

    predictions = model.predict(dvalid)
    # Named 'loss' rather than 'score' to avoid shadowing this function's name.
    loss = rmse(y_test, predictions)
    # TODO: Add the importance for the selected features
    print("Score {0}\n".format(loss))
    return {'loss': loss, 'status': STATUS_OK}

def optimize(trials, max_evals=50):
    """Search the XGBoost hyperparameter space with TPE and return the best setting.

    Args:
        trials: hyperopt ``Trials`` object that records every evaluation.
        max_evals: maximum number of evaluations of ``score`` (default 50,
            the value previously hard-coded).

    Returns:
        dict mapping every key of the search space to its best value. Constant
        entries ('eval_metric', 'objective', 'silent') are included, and
        'max_depth' is the actual depth rather than hyperopt's internal index.
    """
    # Local import: only this function needs space_eval.
    from hyperopt import space_eval

    space = {
             'n_estimators' : hp.quniform('n_estimators', 400, 1000, 50),
             'eta' : hp.quniform('eta', 0.05, 0.3, 0.05),
             'max_depth' : hp.choice('max_depth', np.arange(3, 14, dtype=int)),
             'min_child_weight' : hp.quniform('min_child_weight', 2, 6, 1),
             'subsample' : hp.quniform('subsample', 0.6, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 0.8, 0.1),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.1),
             'eval_metric': 'rmse',
             'objective': 'reg:linear',
             'silent' : 1
             }

    best_assignment = fmin(score, space, algo=tpe.suggest, trials=trials,
                           max_evals=max_evals)
    # fmin reports hp.choice parameters by *index* into their option list, so
    # 'max_depth' would come back as 0..10 instead of 3..13. space_eval maps the
    # raw assignment back onto the space to recover the real parameter values.
    best = space_eval(space, best_assignment)
    print("\n\n")
    print("Best Model is: ")
    print(best)
    return best

## Sales_Prediction

In [16]:
# Hold out VALID_SIZE of the rows for scoring. Fixing random_state makes the
# split, and therefore the scores compared across trials and runs, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_sales, test_size=VALID_SIZE, random_state=SEED
)

In [17]:
# Fresh Trials container that the search fills with one record per evaluation.
trials = Trials()

# Run the hyperparameter search and keep whatever optimize() returns.
# NOTE(review): the recorded output below ends in KeyboardInterrupt, so in that
# session this assignment may never have completed.
best_model = optimize(trials)

Training with params: 
{'subsample': 0.65, 'eta': 0.1, 'objective': 'reg:linear', 'n_estimators': 700.0, 'min_child_weight': 5.0, 'silent': 1, 'max_depth': 4, 'colsample_bytree': 1.0, 'gamma': 0.7000000000000001, 'eval_metric': 'rmse'}
Score 204.6227075774103

Training with params: 
{'subsample': 0.65, 'eta': 0.15000000000000002, 'objective': 'reg:linear', 'n_estimators': 1000.0, 'min_child_weight': 5.0, 'silent': 1, 'max_depth': 7, 'colsample_bytree': 0.7000000000000001, 'gamma': 0.6000000000000001, 'eval_metric': 'rmse'}
Score 149.39241134515143

Training with params: 
{'subsample': 0.75, 'eta': 0.25, 'objective': 'reg:linear', 'n_estimators': 900.0, 'min_child_weight': 5.0, 'silent': 1, 'max_depth': 10, 'colsample_bytree': 1.0, 'gamma': 0.6000000000000001, 'eval_metric': 'rmse'}
Score 166.1709145046246

Training with params: 
{'subsample': 0.9500000000000001, 'eta': 0.15000000000000002, 'objective': 'reg:linear', 'n_estimators': 1000.0, 'min_child_weight': 6.0, 'silent': 1, 'max_dep

KeyboardInterrupt: 

In [None]:
# Persist the search result to disk.
# NOTE(review): best_model is presumably a dict rather than an array — confirm that
# np.save / np.load round-trips it as intended (loading may need allow_pickle=True).
np.save('best_parameters_sales.npy', best_model)

In [None]:
#best params
# {'gamma': 0.55, 'colsample_bytree': 0.65, 'silent': 1, 'max_depth': 7, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'n_estimators': 730.0, 'subsample': 0.8, 'eta': 0.225, 'min_child_weight': 3.0}