# Setting up data

In [1]:
import HelperFunctions as hf
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [2]:
import gc
gc.collect()

41

In [3]:
#Load in the raw data
calendar_df = hf.load_calendar_data()
prices_df = hf.load_sell_price_data()
sales_df = hf.load_sales_train_validation_data()

In [4]:
#Clean the raw data
df = hf.rawToClean(sales_df, calendar_df, prices_df, days=750, items=1000, dropNAPrices=True)

In [5]:
df = hf.rollingMeanDemandFeature(df, windowSize=7, shift=1) #Mean of previous week
df = hf.rollingMeanDemandFeature(df, windowSize=28, shift=1) #Mean of previous 28 days

In [6]:
df = hf.rollingMeanWeekday(df, weeks = 3, shift = 1) #mean of previous 3 weeks (for that particular weekday)

In [7]:
df = hf.lagFeature(df, var='sold', lag=1) #Amount sold day before
df = hf.lagFeature(df, var='sold', lag=7) #Amount sold a week before
df = hf.lagFeature(df, var='sold', lag=28) #Amount sold 28 days before

In [8]:
df.shape

(429487, 48)

In [9]:
df.head()

Unnamed: 0,id,d,sold,wday,month,year,snap_CA,snap_TX,snap_WI,sell_price,...,WI_3,CA,TX,WI,rolling_mean_7_1,rolling_mean_28_1,rolling_mean_3_1,sold_lag_1,sold_lag_7,sold_lag_28
0,HOBBIES_1_144_TX_3_validation,1,0,1,1,2011,0,0,0,10.48,...,0,0,1,0,,,,,,
4,HOUSEHOLD_2_375_WI_3_validation,1,0,1,1,2011,0,0,0,5.87,...,1,0,0,1,,,,,,
5,HOUSEHOLD_1_319_CA_3_validation,1,1,1,1,2011,0,0,0,1.98,...,0,1,0,0,,,,,,
8,HOUSEHOLD_1_149_CA_1_validation,1,9,1,1,2011,0,0,0,0.97,...,0,1,0,0,,,,,,
11,FOODS_3_572_CA_4_validation,1,4,1,1,2011,0,0,0,2.68,...,0,1,0,0,,,,,,


In [10]:
df = df.dropna()

In [11]:
df.head()

Unnamed: 0,id,d,sold,wday,month,year,snap_CA,snap_TX,snap_WI,sell_price,...,WI_3,CA,TX,WI,rolling_mean_7_1,rolling_mean_28_1,rolling_mean_3_1,sold_lag_1,sold_lag_7,sold_lag_28
28000,HOBBIES_1_144_TX_3_validation,29,0,1,2,2011,0,0,0,10.48,...,0,0,1,0,0.285714,0.321429,0.666667,0,1,0
28004,HOUSEHOLD_2_375_WI_3_validation,29,0,1,2,2011,0,0,0,5.87,...,1,0,0,1,0.142857,0.25,1.0,0,0,0
28005,HOUSEHOLD_1_319_CA_3_validation,29,0,1,2,2011,0,0,0,1.98,...,0,1,0,0,6.285714,4.214286,5.666667,6,7,1
28008,HOUSEHOLD_1_149_CA_1_validation,29,0,1,2,2011,0,0,0,0.97,...,0,1,0,0,2.285714,2.035714,2.0,0,5,9
28011,FOODS_3_572_CA_4_validation,29,1,1,2,2011,0,0,0,2.68,...,0,1,0,0,3.142857,3.25,4.333333,1,4,4


In [12]:
df.columns

Index(['id', 'd', 'sold', 'wday', 'month', 'year', 'snap_CA', 'snap_TX',
       'snap_WI', 'sell_price', 'Cultural', 'National', 'Religious',
       'Sporting', 'NoEvent', 'Christmas', 'weekend', 'midweek', 'monfri',
       'FOODS_1', 'FOODS_2', 'FOODS_3', 'HOBBIES_1', 'HOBBIES_2',
       'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS', 'HOBBIES', 'HOUSEHOLD', 'CA_1',
       'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3',
       'CA', 'TX', 'WI', 'rolling_mean_7_1', 'rolling_mean_28_1',
       'rolling_mean_3_1', 'sold_lag_1', 'sold_lag_7', 'sold_lag_28'],
      dtype='object')

In [13]:
df = df.drop("id", axis = 1)

In [14]:
df = df.sample(10_000)

In [15]:
target = df.pop("sold")

# random forest

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [17]:
target = target.astype(int)

In [18]:
X_train,X_test ,y_train,y_test = train_test_split(df.values, target.values,test_size=0.3)

In [19]:
print(df.shape, target.shape)

(10000, 46) (10000,)


In [20]:
X_train.shape, y_train.shape

((7000, 46), (7000,))

In [21]:
y_train.astype

<function ndarray.astype>

In [22]:
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier())

In [23]:
sel.get_support()

array([ True,  True,  True, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True,  True,
        True])

In [28]:
selected_feat= df.columns[(sel.get_support())]
len(selected_feat)

10

In [29]:
selected_feat

Index(['d', 'wday', 'month', 'sell_price', 'rolling_mean_7_1',
       'rolling_mean_28_1', 'rolling_mean_3_1', 'sold_lag_1', 'sold_lag_7',
       'sold_lag_28'],
      dtype='object')