In [8]:
import pandas as pd
import numpy as np

In [15]:
def data_preprocess(data1,data2):
    data=pd.merge(data1,data2,on="Store")
    data.fillna(0,inplace=True)
    data['Promo2SinceWeek'] = data.Promo2SinceWeek.astype(int)
    data['Promo2SinceYear'] = data.Promo2SinceYear.astype(int)
    data['CompetitionOpenSinceMonth'] = data.CompetitionOpenSinceMonth.astype(int)
    data['CompetitionOpenSinceYear'] = data.CompetitionOpenSinceYear.astype(int)
    assortment_mapping={'a':0,'b':1,'c':2}
    store_mapping={'a':0,'b':1,'c':2,'d':4}
    data.replace(assortment_mapping,inplace=True)
    data.replace(store_mapping,inplace=True)
    
    data['Date'] = pd.to_datetime(data['Date'])
    
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear
    #calculation of time when the competition has opened in months
    data['CompetitionOpenMonths'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + (data.Month - data.CompetitionOpenSinceMonth)
    data['CompetitionOpenMonths'] = data.CompetitionOpenMonths.apply(lambda x: x if x > 0 else 0)
    
    
    return data

In [10]:
types = {'Promo': np.dtype(int)}
print("Load the training, test and store data using pandas")
train = pd.read_csv("C:/Users/Abhay Nanda/Desktop/train.csv",parse_dates=[3],low_memory=False)
test = pd.read_csv("C:/Users/Abhay Nanda/Desktop/test.csv",parse_dates=[3],low_memory=False)
store = pd.read_csv("C:/Users/Abhay Nanda/Desktop/store.csv")

Load the training, test and store data using pandas


In [11]:
#check NaN
display(train.isnull().sum(),test.isnull().sum(),store.isnull().sum())

Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
Id               0
dtype: int64

Store               0
DayOfWeek           0
Date                0
Customers           0
Open                0
Promo               0
StateHoliday        0
SchoolHoliday       0
Id                  0
Sales            1115
dtype: int64

Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

In [12]:
print("Assume store open, if not provided")
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

Assume store open, if not provided


In [13]:
print("Only taking open stores")
train = train[train["Open"] == 1]
test = test[test["Open"] == 1]

Only taking open stores


In [16]:
print("data preprocessing")

train = data_preprocess(train,store)
test = data_preprocess(test,store)
train.head()
test.head()
display(train.isnull().sum(),test.isnull().sum())

data preprocessing


Store                        0
DayOfWeek                    0
Date                         0
Sales                        0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Id                           0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Year                         0
Month                        0
Day                          0
WeekOfYear                   0
CompetitionOpenMonths        0
dtype: int64

Store                        0
DayOfWeek                    0
Date                         0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Id                           0
Sales                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Year                         0
Month                        0
Day                          0
WeekOfYear                   0
CompetitionOpenMonths        0
dtype: int64

In [17]:
train.head()
test.head()

Unnamed: 0,Store,DayOfWeek,Date,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,Sales,...,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpenMonths
0,1,4,2015-07-31,555,1,1,0,1,879066,1.0,...,2008,0,0,0,0,2015,7,31,31,82
1,2,4,2015-07-31,625,1,1,0,1,37135,1.0,...,2007,1,13,2010,"Jan,Apr,Jul,Oct",2015,7,31,31,92
2,3,4,2015-07-31,821,1,1,0,1,285919,1.0,...,2006,1,14,2011,"Jan,Apr,Jul,Oct",2015,7,31,31,103
3,4,4,2015-07-31,1498,1,1,0,1,744348,1.0,...,2009,0,0,0,0,2015,7,31,31,70
4,5,4,2015-07-31,559,1,1,0,1,556250,1.0,...,2015,0,0,0,0,2015,7,31,31,3
