### data_clean_features
clean meta data

In [72]:
import pandas as pd
import numpy as np
from datetime import date
import matplotlib.pylab as plt
%matplotlib inline
import datetime

In [73]:
import pickle

#### Load data and normalize dates: since we are dealing with weekly sales, on which day an entry got recorded is not important. Rather, normalize them by converting them to the date of Monday of the same week.


In [74]:
features = pd.read_csv('data/features.csv')


# since we are dealing with weekly data, on which day an entry got recorded is not important
# rather, we will normalize them by converting them to the date of Monday of the same week.
# later we will aggregate weekly sales by summing all the entry on the same normalized date.
def year_week(date):
    temp = date.isocalendar()
    date_string = str(temp[0])+' '+str(temp[1])+' 1'
    return datetime.datetime.strptime(date_string, '%G %V %u')

# normalize dates
features['Date'] = pd.to_datetime(features['Date'])
features['Date'] = features['Date'].apply(year_week)


#### There are multiple rows on some dates, some with confilicting information. The following code consolidates rows into one for each date by taking the mean.  

In [75]:
clean_features=features.groupby(['Store','Date']).apply(np.mean)

In [76]:
clean_features=clean_features.reset_index(level='Date')

#### Next, impute missing values.

In [77]:
clean_features = clean_features.reset_index(drop = True)

In [78]:
clean_features.head()

Unnamed: 0,Date,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,2010-01-04,1.0,71.89,2.603,,,,,,211.671989,7.838,0.0
1,2010-02-01,1.0,71.59,2.694,,,,,,211.021992,7.7975,0.0
2,2010-02-15,1.0,39.93,2.514,,,,,,211.289143,8.106,0.0
3,2010-02-22,1.0,46.63,2.561,,,,,,211.319643,8.106,0.0
4,2010-03-08,1.0,65.24,2.6425,,,,,,211.56922,7.8125,0.0


In [79]:
# columns contain missing values include CPI, Unemployment and markdowns
clean_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7200 entries, 0 to 7199
Data columns (total 12 columns):
Date            7200 non-null datetime64[ns]
Store           7200 non-null float64
Temperature     7200 non-null float64
Fuel_Price      7200 non-null float64
MarkDown1       3672 non-null float64
MarkDown2       2760 non-null float64
MarkDown3       3285 non-null float64
MarkDown4       3164 non-null float64
MarkDown5       3690 non-null float64
CPI             6660 non-null float64
Unemployment    6660 non-null float64
IsHoliday       7200 non-null float64
dtypes: datetime64[ns](1), float64(11)
memory usage: 675.1 KB


#### Replace missing markdown values by 0

In [80]:
clean_features['MarkDown1']=clean_features['MarkDown1'].fillna(0)
clean_features['MarkDown2']=clean_features['MarkDown2'].fillna(0)
clean_features['MarkDown3']=clean_features['MarkDown3'].fillna(0)
clean_features['MarkDown4']=clean_features['MarkDown4'].fillna(0)
clean_features['MarkDown5']=clean_features['MarkDown5'].fillna(0)

#### Investigate the patterns of missing values for Unemployment and CPI

In [81]:
# there are missing values for unemployment in every store
mask = pd.isnull(clean_features['Unemployment'])
clean_features[cpi_mask]['Store'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40., 41., 42., 43., 44., 45.])

In [82]:
# CPI and Unemployment always miss together in the same rows

cpi_na_index = pd.isnull(clean_features[['CPI']]).any(1).nonzero()[0]
unemployment_na_index = pd.isnull(clean_features[['Unemployment']]).any(1).nonzero()[0]
cpi_na_index == unemployment_na_index


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

#### Impute missing CPI and Unemployment values from the same store on the closest date when these values are not missing

In [84]:
storeIDs = sorted(clean_features['Store'].unique())
dfs = []

for storeID in storeIDs:
    store_mask = (clean_features['Store'] == storeID)
    df = clean_features[store_mask]
    # CPI missing values are in the same rows too as CPI and Unemployment 
    # always missing together 
    na_index = pd.isnull(df[['Unemployment']]).any(1).nonzero()[0]    
    all_index = [i for i in range(df.shape[0])]
    non_na_index = [i for i in all_index if i not in na_index]
    
    for k in na_index:
        # find the closest row that has none missing cpi and unemployment; 
        # use these values for imputation
        impute_index = min(non_na_index, key=lambda x:abs(x-k))

        df['Unemployment'].iloc[k] = df['Unemployment'].iloc[impute_index]
        df['CPI'].iloc[k] = df['CPI'].iloc[impute_index]
        
    dfs.append(df)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [85]:
clean_features = pd.concat(dfs)
clean_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7200 entries, 0 to 7199
Data columns (total 12 columns):
Date            7200 non-null datetime64[ns]
Store           7200 non-null float64
Temperature     7200 non-null float64
Fuel_Price      7200 non-null float64
MarkDown1       7200 non-null float64
MarkDown2       7200 non-null float64
MarkDown3       7200 non-null float64
MarkDown4       7200 non-null float64
MarkDown5       7200 non-null float64
CPI             7200 non-null float64
Unemployment    7200 non-null float64
IsHoliday       7200 non-null float64
dtypes: datetime64[ns](1), float64(11)
memory usage: 731.2 KB


In [86]:
clean_features.head()

Unnamed: 0,Date,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,2010-01-04,1.0,71.89,2.603,0.0,0.0,0.0,0.0,0.0,211.671989,7.838,0.0
1,2010-02-01,1.0,71.59,2.694,0.0,0.0,0.0,0.0,0.0,211.021992,7.7975,0.0
2,2010-02-15,1.0,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,0.0
3,2010-02-22,1.0,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,0.0
4,2010-03-08,1.0,65.24,2.6425,0.0,0.0,0.0,0.0,0.0,211.56922,7.8125,0.0


#### Save cleaned features to pickle file

In [89]:
clean_features.to_pickle('data/clean_features.pkl')

In [88]:
# clean_features = pd.read_pickle('clean_features.pkl')