In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import math
import pdb
import sklearn as sk
from cleandata import cleanData
from sklearn.model_selection import train_test_split
import pickle

## Import Data from CSV

In [2]:
# Load the store table and the training table from the local data/ folder.
store_CSV = pd.read_csv('data/store.csv')
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
train_CSV = pd.read_csv('data/train.csv', low_memory=False)

In [3]:
# Attach the store attributes to every row of train_CSV, matching on the 'Store' column.
# DataFrame.join defaults to a left join, so every train_CSV row is kept.
data = train_CSV.join(store_CSV.set_index('Store'), on='Store')

In [4]:
# Hold out 20% of the joined rows as a test set; random_state pins the shuffle so the split is reproducible.
# NOTE(review): this is a random row-wise split. The cleaning step parses a Date column, so if the
# rows are dated observations a time-based split may give a fairer estimate — confirm intent.
train_raw, test_raw = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
# Sanity check: (rows, columns) of the two raw splits.
train_raw.shape, test_raw.shape

((510219, 18), (127555, 18))

## Clean Train Data by module cleandata

In [6]:
print('Clean Train Data')
# Run the project's cleaning routine in 'train' mode. Its log below reports the mean customers,
# mean sales and mean competition distance it computed from this split.
# NOTE(review): cleanData lives in cleandata.py and may modify train_raw in place (the
# SettingWithCopy warnings in the output suggest so) — confirm before reusing train_raw.
train = cleanData(train_raw, 'train')



Clean Train Data
Dropped rows without store-ids


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Extracted year, month and day from Date
Extracted and reset day of week


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sales'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Set Sales to 0 if customers are 0
Dropped 0-sales rows in df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Open'][i] = 1


Set Open = 1 if Sales > 0
Filled school holidays based on state holidays
Public Holidays updated


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['StateHoliday'][i] = row2['StateHoliday'].values[0]


Finished regional stateholidays
Finished cleaning remaining stateholidays
Adjusted open status of shops according to state holidays
Mean customers of test cleaning:758.7492748450405
Finished filling in empty customers cells
Mean Sales of training set = 6836.722219708965
Finished cleaning sales
Mean Competition Distance of training set = 5446.105182647453
PromoIntervals encoded
Store Type Encoded
State Holidays Encoded
Assortment Type Encoded
---Cleaning completed---
Dropped last leftovers
All done!


## Clean Test Data by module cleandata

#### When cleaning the test data, please enter the mean values computed from the train data (printed in the log above) into the corresponding input prompts:
- Mean customers of test cleaning
- Mean Sales of training set
- Mean Competition Distance of training set

In [7]:
print('Clean Test Data')
# Run the held-out split through the same cleaning routine in 'test' mode.
# NOTE(review): in 'test' mode cleanData appears to ask for the training-set means (customers,
# sales, competition distance) rather than computing them — confirm in cleandata.py.
test = cleanData(test_raw, 'test')

Clean Test Data
Dropped rows without store-ids
Extracted year, month and day from Date
Extracted and reset day of week


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Set Sales to 0 if customers are 0
Dropped 0-sales rows in df
Set Open = 1 if Sales > 0
Filled school holidays based on state holidays
Public Holidays updated
Finished regional stateholidays
Finished cleaning remaining stateholidays
Adjusted open status of shops according to state holidays
Finished filling in empty customers cells
Finished cleaning sales
PromoIntervals encoded
Store Type Encoded
State Holidays Encoded
Assortment Type Encoded
---Cleaning completed---
Dropped last leftovers
All done!


## Split Data into test and training, label and feature sets

In [8]:
# Features are every column except the target 'Sales'.
X_train = train.drop(columns=['Sales']).to_numpy()
X_test = test.drop(columns=['Sales']).to_numpy()

# Targets are kept as column vectors of shape (n, 1).
y_train = train['Sales'].to_numpy()[:, np.newaxis]
y_test = test['Sales'].to_numpy()[:, np.newaxis]

In [9]:
# Sanity check: feature matrices and label vectors must agree on the number of rows.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((128789, 28), (128789, 1), (32223, 28), (32223, 1))

## Metric as defined by the Kaggle competition (given)

In [10]:
def metric(preds, actuals):
    """Root mean squared percentage error (RMSPE), expressed in percent.

    Rows whose actual value is 0 are left out of the score: the relative
    error is undefined there, and a single such row would otherwise turn
    the whole result into inf/nan.

    Parameters
    ----------
    preds : array-like
        Predicted values; any shape, flattened internally.
    actuals : array-like
        True values with the same number of elements as ``preds``.

    Returns
    -------
    float
        ``100 * sqrt(mean(((actual - pred) / actual) ** 2))`` over the rows
        with a non-zero actual value, or ``nan`` when no such row exists.

    Raises
    ------
    AssertionError
        If the flattened inputs differ in length.
    """
    preds = np.asarray(preds, dtype=float).reshape(-1)
    actuals = np.asarray(actuals, dtype=float).reshape(-1)
    assert preds.shape == actuals.shape

    # Only rows with a non-zero actual value can be scored.
    scored = actuals != 0
    if not scored.any():
        return float('nan')

    relative_error = (actuals[scored] - preds[scored]) / actuals[scored]
    # ||e|| / sqrt(n) is the root of the mean squared relative error.
    return 100 * np.linalg.norm(relative_error) / np.sqrt(relative_error.shape[0])

## Define the xgboost model

In [11]:
# This is a basic benchmark
def model_base(params, X_train, y_train, X_test, y_test, epochs):
    """Train an XGBoost booster and score it on the training and test data.

    Parameters
    ----------
    params : dict
        Booster parameters handed unchanged to ``xgb.train``.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Hold-out features and labels. They also serve as the evaluation set
        (named 'Test') whose metric is logged after every boosting round.
    epochs : int
        Number of boosting rounds (``num_boost_round``).

    Returns
    -------
    tuple
        ``(test_score, train_score, booster)``; both scores are computed
        with ``metric`` and ``booster`` is the trained model.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    bst = xgb.train(params, dtrain, num_boost_round=epochs, evals=[(dtest, 'Test')])

    # predict() only reads the features of a DMatrix, so the matrices built for
    # training are reused instead of constructing label-free duplicates.
    y_h_test = bst.predict(dtest)
    y_h_train = bst.predict(dtrain)

    # Shape check: predictions are flat while the labels arrive as a column vector.
    print(y_h_test.shape)
    print(y_test.shape)

    return (metric(y_h_test, y_test), metric(y_h_train, y_train), bst)

## Train the model with given parameters

In [13]:
# Basic benchmark with tuned hyper-parameters:
# max_depth = 9, min_child_weight = 1, gamma = 0, learning_rate = 0.4, subsample = 0.9

epochs = 100  # number of boosting rounds passed to xgb.train

# Learning rates to try. Only the tuned value is kept; add more values here to
# sweep. The loop variable is what actually goes into `params`, so the printed
# value always matches the value used for training.
for learning_rate in (0.4,):

    print('learning_rate :', learning_rate)

    # The number of boosting rounds is controlled by `epochs`; 'n_estimators'
    # belongs to the scikit-learn wrapper and is not an xgb.train parameter,
    # so it is not listed here.
    params = {
        'booster': 'gbtree',
        'colsample_bynode': 0.8,
        'learning_rate': learning_rate,
        'max_depth': 9,
        # 30 trees are grown per boosting round (boosted random forest).
        'num_parallel_tree': 30,
        'objective': 'reg:squarederror',
        'subsample': 0.9,
        'tree_method': 'exact',
        'eval_metric': 'mae',
        'min_child_weight': 1,
        'gamma': 0,
    }

    eval_test, eval_train, model = model_base(params, X_train, y_train, X_test, y_test, epochs)

    print('Test result: ', eval_test)
    print('Train result: ', eval_train)

# Persist the last trained booster; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("pima.pickle.dat", "wb") as model_file:
    pickle.dump(model, model_file)


learning_rate : 0.1
[0]	Test-mae:3886.03
[1]	Test-mae:2347.24
[2]	Test-mae:1443.37
[3]	Test-mae:943.864
[4]	Test-mae:693.453
[5]	Test-mae:580.336
[6]	Test-mae:523.428
[7]	Test-mae:494.865
[8]	Test-mae:475.488
[9]	Test-mae:459.036
[10]	Test-mae:450.871
[11]	Test-mae:441.144
[12]	Test-mae:430.001
[13]	Test-mae:423.037
[14]	Test-mae:415.235
[15]	Test-mae:408.552
[16]	Test-mae:404.332
[17]	Test-mae:399.476
[18]	Test-mae:395.93
[19]	Test-mae:393.045
[20]	Test-mae:389.296
[21]	Test-mae:387.181
[22]	Test-mae:384.779
[23]	Test-mae:382.774
[24]	Test-mae:381.142
[25]	Test-mae:379.374
[26]	Test-mae:377.923
[27]	Test-mae:376.634
[28]	Test-mae:375.434
[29]	Test-mae:374.389
[30]	Test-mae:373.475
[31]	Test-mae:372.609
[32]	Test-mae:371.828
[33]	Test-mae:371.207
[34]	Test-mae:370.417
[35]	Test-mae:369.596
[36]	Test-mae:368.962
[37]	Test-mae:368.403
[38]	Test-mae:368.003
[39]	Test-mae:367.501
[40]	Test-mae:366.968
[41]	Test-mae:366.397
[42]	Test-mae:365.846
[43]	Test-mae:365.483
[44]	Test-mae:365.011
[

# Predictions

In [14]:
# Save the trained booster to disk; the context manager closes the file handle
# so the data is fully flushed before the file is read back.
with open("pima.pickle.dat", "wb") as model_file:
    pickle.dump(model, model_file)

In [48]:
# Reload the trained booster from disk.
# NOTE: unpickling can execute arbitrary code — only load files produced by this notebook.
with open("pima.pickle.dat", "rb") as model_file:
    model = pickle.load(model_file)

In [43]:
def prediction(score, model):
    """Score ``model`` on a raw sales table.

    The table is joined with the notebook-level ``store_CSV`` on 'Store',
    cleaned with ``cleanData`` in 'test' mode, and the model's predictions
    are compared with the 'Sales' column using ``metric``.

    Parameters
    ----------
    score : pandas.DataFrame
        Raw rows containing at least the 'Store' and 'Sales' columns.
    model : xgboost.Booster
        Trained booster used to predict sales.

    Returns
    -------
    float
        Value of ``metric`` for the predictions against the cleaned 'Sales'.
    """
    joined = score.join(store_CSV.set_index('Store'), on='Store')
    cleaned = cleanData(joined, 'test')

    targets = cleaned['Sales'].to_numpy()
    features = cleaned.drop(['Sales'], axis=1).to_numpy()

    predicted = model.predict(xgb.DMatrix(features))
    return metric(predicted, targets)

In [44]:
# Score the reloaded model on the complete train_CSV table.
# NOTE(review): train_CSV is the table the training split was drawn from, so this score
# includes rows the model was fitted on and is not an out-of-sample estimate.
prediction(train_CSV, model)

Dropped rows without store-ids
Extracted year, month and day from Date
Extracted and reset day of week


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Set Sales to 0 if customers are 0
Dropped 0-sales rows in df
Set Open = 1 if Sales > 0
Filled school holidays based on state holidays
Public Holidays updated
Finished regional stateholidays
Finished cleaning remaining stateholidays
Adjusted open status of shops according to state holidays


Please enter mean customers of test cleaning: 758.7492748450405


Finished filling in empty customers cells


Please enter mean sales of test cleaning: 6836.722219708965


Finish sales


Please enter mean competition distance of test cleaning: 5446.105182647453


PromoIntervals encoded
Store Type Encoded
State Holidays Encoded
Assortment Type Encoded
---Cleaning completed---
Dropped last leftovers
All done!


8.666919973729465