In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from sklearn import linear_model

In [None]:
from predict_sales import logger

In [None]:
from predict_sales.data import Data, log_lm_features, linear_features, xgb_features, check_nulls

In [None]:
Data.process_input("../input/store.csv", "../input/train.csv", "../input/test.csv")

In [None]:
Data.save('train.pkl', 'test.pkl')

In [None]:
from predict_sales import functions

In [None]:
import pickle
# Reload the train/test frames from the pickle files named in the earlier
# Data.save call and log the shape of each.
with open('train.pkl', 'rb') as train_file:
    train = pickle.load(train_file)
logger.info('Loaded train data, shape {0}'.format(train.shape))

with open('test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)
logger.info('Loaded test data, shape {0}'.format(test.shape))

In [None]:
train.shape

In [None]:
check_nulls(train)

In [None]:
from predict_sales.functions import (remove_before_changepoint,
                                     log_transform_train,
                                     remove_outliers_lm,
                                     select_features,
                                     predict_elasticnet,
                                     exp_rmspe, rmspe,
                                     log_revert_predicted)

In [None]:
logger.info("Dropping store data before changepoint. Initial shape {0}".format(train.shape))
train_tr = remove_before_changepoint(train)
logger.info("Reduced to {0}".format(train_tr.shape))

In [None]:
# Keep only the training rows whose Store also occurs in the test set.
# A boolean mask via isin() replaces the original query string, which was built
# by str()-formatting a list of NumPy scalars: under NumPy >= 2 those render as
# "np.int64(1)", which breaks the query expression. The mask selects the same
# rows without building and parsing a long expression string.
logger.info("Dropping stores not in test set. Initial shape {0}".format(train_tr.shape))
train_tr = train_tr[train_tr['Store'].isin(test['Store'].unique())]
logger.info("Reduced to {0}".format(train_tr.shape))

In [None]:
logger.debug("Log transform on sales data")
train_tr = log_transform_train(train_tr)

In [None]:
train_tr = remove_outliers_lm(train_tr)
logger.info("Removed outliers, reduced shape {0}".format(train_tr.shape))

In [None]:
train_tr = select_features(train_tr, linear_features)
logger.info("Selected linear features, shape {0}".format(train_tr.shape))

In [None]:
train_tr.columns[pd.isnull(train_tr).any()]

In [None]:
logger.info("Test shape {0}".format(test.shape))
test_tr = select_features(test, linear_features)
logger.info("Test, selected linear features, shape {0}".format(test_tr.shape))

In [None]:
test_tr.columns[pd.isnull(test_tr).any()]

In [None]:
from predict_sales.functions import predict_elasticnet, predict_xgboost, select_features

In [None]:
predictions = predict_elasticnet(train_tr, test_tr, exp_rmspe, steps=15, step_by=3)

In [None]:
# Apply select_features with xgb_features to `train` (not the filtered `train_tr`).
# NOTE(review): this rebinds `train_tr`, which up to here held the linear-feature
# frame passed to predict_elasticnet; the later `train_tr.groupby('Store')` cell
# therefore groups this frame instead — confirm that is intended.
train_tr = select_features(train, xgb_features)

In [None]:
from predict_sales.playground import glm_predictions, xgb_predictions

In [None]:
glm_predictions(train, test)

In [None]:
len(train_tr.columns), len(set(train_tr.columns))

In [None]:
xgb_predictions(train, test)

In [None]:
train_gb = train_tr.groupby('Store')

In [None]:
# Score each per-store fit against that store's own rows from `train_gb`.
for fit, store in zip(predictions.fit, predictions.store):
    # get_group() hands back a slice of the grouped frame; copy it so the column
    # assignment below writes to an independent frame instead of triggering
    # pandas' SettingWithCopyWarning.
    store_train = train_gb.get_group(store).copy()
    store_features = store_train.drop(['Date', 'Sales'], axis=1)
    store_train['PredictedSales'] = fit.predict(store_features)
    # The return value is ignored, so this presumably rewrites PredictedSales
    # in place — TODO confirm against log_revert_predicted.
    log_revert_predicted(store_train)
    # np.exp() assumes 'Sales' is on the log scale here — TODO confirm.
    store_error = rmspe(store_train['PredictedSales'], np.exp(store_train['Sales']))
    logger.info('Store {0} error {1}'.format(store, store_error))

In [None]:
type(predictions.fit[0])

In [None]:
# Scratch check of DataFrame -> ndarray conversion.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same 2-D array and works on old and new pandas alike.
pd.DataFrame({0:[1,2,3],1:[3,4,5]}).values

In [None]:
test.shape

In [None]:
# Concatenate the pieces held in `pred`.
# NOTE(review): no cell above this one assigns `pred`; its first visible
# assignment is the later `pred = lm.predict(...)` cell, so this line appears to
# depend on out-of-order execution. It also rebinds `predictions`, replacing the
# predict_elasticnet result — confirm both are intended.
predictions = pd.concat(pred)

In [None]:
log_revert_predicted(predictions)

In [None]:
from predict_sales.functions import rmspe

In [None]:
predictions.shape

In [None]:
# NOTE(review): the arguments here are (actual, predicted), the reverse of the
# per-store loop's rmspe(PredictedSales, exp(Sales)) call. If rmspe normalises
# by one specific argument the two calls are not equivalent — confirm the
# intended order against rmspe's signature.
rmspe(test['Sales'], predictions['PredictedSales'])

In [None]:
train1 = functions.remove_outliers_lm(train)

In [None]:
# Scratch check: is a single-label column selection a DataFrame?
# NOTE(review): `A` is first assigned in a later cell (`A = train.iloc[:10,:2]`),
# so on a fresh top-to-bottom run this cell would raise NameError unless `A`
# is defined somewhere not visible here.
isinstance(A['Id'], pd.DataFrame)

In [None]:
train1.shape

In [None]:
len(list(linear_features)), len(list(log_lm_features))

In [None]:
set(log_lm_features).difference(linear_features)

In [None]:
test = Data.test.copy()

In [None]:
# NOTE(review): `check_dup_columns` is not imported or defined in any cell
# visible here (only `check_nulls` is imported from predict_sales.data) —
# confirm where it comes from.
check_dup_columns(test)

In [None]:
train['Id'] = np.arange(1, train.shape[0] + 1)

In [None]:
# NOTE(review): `check_dup_columns` is not imported or defined in any cell
# visible here — confirm where it comes from before relying on this check.
check_dup_columns(train)

In [None]:
train = functions.select_features(train, log_lm_features)

In [None]:
train.shape

In [None]:
l = [(1,2),'ab']

In [None]:
from collections import namedtuple

# `Predictions` is otherwise only defined in a much later cell, so on a fresh
# top-to-bottom run this generator (and the cells that use it) would raise
# NameError. Defining it here, with the same fields, makes the cell
# self-contained; the later identical definition is harmless.
Predictions = namedtuple('Predictions', 'predicted, fit, store')


def mygen():
    """Yield ten sample ``Predictions`` records for experimenting with the type.

    Record ``i`` (for ``i`` in ``0..9``) carries ``predicted=i``, ``fit=3*i``
    and ``store=5*i``.
    """
    for i in range(10):
        yield Predictions(i, 3*i, 5*i)

In [None]:
Predictions._fields

In [None]:
list(mygen())

In [None]:
Predictions(*zip(*mygen()))

In [None]:
functions.predict_lm(train, train, save_fit=True)

In [None]:
lm = linear_model.LinearRegression()

In [None]:
test.shape

In [None]:
pd.isnull(train).any()

In [None]:
fit = lm.fit(train.drop(['Date', 'Sales'], axis=1), train['Sales'])

In [None]:
pred = lm.predict(train.drop(['Date', 'Sales'], axis=1))

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
pd.Series(pred).describe()

In [None]:
train['Sales'].describe()

In [None]:
A = train.iloc[:10,:2]

In [None]:
abs(A['Sales']-5000), A['Sales']-5000

In [None]:
pd.DataFrame(mygen())

In [None]:
# Build one single-row frame per record from mygen() and concatenate them.
pd.concat(pd.DataFrame([record]) for record in mygen())

In [None]:
# Scatter of `pred` (y) against train['Sales'] (x), x-axis limited to [0, 25000].
fig, ax = plt.subplots()
ax.scatter(x=train['Sales'], y=pred)
ax.set_xlim([0, 25000])

In [None]:
functions.predict_lm_per_store()

In [None]:
del plt

In [None]:
train_gb = train.groupby('Store')

In [None]:
train_gb.get_group(1).head()

In [None]:
from collections import namedtuple

In [None]:
Predictions = namedtuple('Predictions', 'predicted, fit, store')

In [None]:
p = Predictions(predicted=0, fit=1, store=None)
p

In [None]:
p._replace(store=7)

In [None]:
with open('glmnet.pkl', 'rb') as pkl_file:
    glm_preds = pickle.load(pkl_file)

In [None]:
with open('xgboost.pkl', 'rb') as pkl_file:
    xgb_preds = pickle.load(pkl_file)

In [None]:
glm_preds_df = pd.concat(glm_preds)

In [None]:
# Place the glm and xgb 'PredictedSales' columns side by side, indexed by Id.
glm_part = glm_preds_df[['Id', 'PredictedSales']]
xgb_part = xgb_preds['PredictedSales']
results = pd.concat([glm_part, xgb_part], axis=1).set_index('Id')

In [None]:
comp = pd.read_csv('../rf1.csv', index_col=0)

In [None]:
allres = results.join(comp)

In [None]:
allres.columns = ['glm', 'xgb', 'comp']

In [None]:
allres['avg'] = 0.5 * (allres['glm'] + allres['xgb'])

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
(allres['avg'] - allres['comp']).mean()

In [None]:
plt.scatter(y=allres['avg'] - allres['comp'], x=allres.index)