In [126]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
#from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse

import warnings



In [127]:
warnings.filterwarnings('ignore')

In [128]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

### Data Description

Loading master data 

In [129]:
df = pd.read_csv('master.csv')
df.head()

Unnamed: 0,Store_Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1_1,2/5/10,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
1,1_1,2/12/10,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,A,151315
2,1_1,2/19/10,41595.55,False,39.93,2.514,,,,,,211.289143,8.106,A,151315
3,1_1,2/26/10,19403.54,False,46.63,2.561,,,,,,211.319643,8.106,A,151315
4,1_1,3/5/10,21827.9,False,46.5,2.625,,,,,,211.350143,8.106,A,151315


We want to see the dimension of our master dataset. It has 344667 rows and 15 columns.

In [130]:
df.shape


(344667, 15)

Now we want to check the missing values in the dataset. We have to use isna() function for checking missing values. We see that there are many missing values in markdown columns. As it is numeric columns, we want to replace missing values as zero.

In [131]:
df.isna().sum()

Store_Dept           0
Date                 0
Weekly_Sales         0
IsHoliday            0
Temperature          0
Fuel_Price           0
MarkDown1       270480
MarkDown2       278599
MarkDown3       276008
MarkDown4       278273
MarkDown5       270138
CPI                  0
Unemployment         0
Type                 0
Size                 0
dtype: int64

In [132]:
#Treating missing value as 0
df.fillna(0, inplace= True)

In [133]:
df.head()

Unnamed: 0,Store_Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1_1,2/5/10,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315
1,1_1,2/12/10,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315
2,1_1,2/19/10,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315
3,1_1,2/26/10,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315
4,1_1,3/5/10,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315


We would like to see the statistics of data

In [134]:
df.describe()

Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Size
count,344667.0,344667.0,344667.0,344667.0,344667.0,344667.0,344667.0,344667.0,344667.0,344667.0,344667.0
mean,16011.033071,57.39107,3.278793,1588.057498,1058.111573,560.259852,744.976707,1046.938687,170.260421,8.119909,136807.198011
std,22859.611439,18.644742,0.454502,5424.631426,5607.297589,6110.58775,3899.1671,3078.805701,38.767201,1.865077,60944.571069
min,-4988.94,-2.06,2.472,0.0,-265.76,-1.0,0.0,0.0,126.064,4.125,34875.0
25%,2100.335,43.82,2.843,0.0,0.0,0.0,0.0,0.0,131.940807,7.082,93638.0
50%,7638.86,58.06,3.263,0.0,0.0,0.0,0.0,0.0,182.238988,7.951,140167.0
75%,20179.49,71.24,3.669,0.0,0.0,0.0,0.0,0.0,211.406287,8.622,202505.0
max,693099.36,100.14,4.294,88646.76,104519.54,141630.61,67474.85,37581.27,225.367254,14.313,219622.0


In [135]:
df.shape

(344667, 15)

In [136]:
df.isnull().sum()

Store_Dept      0
Date            0
Weekly_Sales    0
IsHoliday       0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
Type            0
Size            0
dtype: int64

Now, let us check the data types. We use dtypes attributes to see features are of which datatype.

In [137]:
df.dtypes

Store_Dept       object
Date             object
Weekly_Sales    float64
IsHoliday          bool
Temperature     float64
Fuel_Price      float64
MarkDown1       float64
MarkDown2       float64
MarkDown3       float64
MarkDown4       float64
MarkDown5       float64
CPI             float64
Unemployment    float64
Type             object
Size              int64
dtype: object

Categorical variables are: Store_Dept, Date, Type (multiclass) and IsHoliday(binary)
Numerical variables: Weekly_Sales. Temperature, Fuel_price, Markdown1, Markdown2, Markdown3, Markdown4, Markdown5, CPI, Unemploymeny, Size

### Data Preparation

In our dataset, date column is string and cannot be directly used in model, so we have seperated Year, month and week columns from Date column.

In [138]:
#prepare data- converting date
df['Date'] = pd.to_datetime(df['Date'], errors ='coerce')
df['Year']=df['Date'].dt.year
df['Month']=df['Date'].dt.month
df['Week']=df['Date'].dt.week


The store and Department columns are joined together in our dataset, we wanted to create two seperate columns for Store and Department

In [139]:
store_dept = df["Store_Dept"].str.split("_", n = 1, expand = True) 
 
# making separate Store column from new data frame 
df["Store"]= store_dept[0] 
  
# making separate Dept column from new data frame 
df['Department']= store_dept[1] 
  
# Dropping old Store_Dept columns 
df.drop(columns =['Store_Dept'], inplace = True) 

#df['Store'] = 'Store_' + df['Store']

#df['Department'] = 'Dept_' + df['Department']

In [140]:
#df_Type = pd.get_dummies(df['Store'])
#df = pd.concat([df, df_Type], axis=1)

#df_Type = pd.get_dummies(df['Department'])
#df = pd.concat([df, df_Type], axis=1)
  

In [141]:
df.head()

Unnamed: 0,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Week,Store,Department
0,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2010,2,5,1,1
1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315,2010,2,6,1,1
2,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315,2010,2,7,1,1
3,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315,2010,2,8,1,1
4,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315,2010,3,9,1,1


In [142]:
df.dtypes

Date            datetime64[ns]
Weekly_Sales           float64
IsHoliday                 bool
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
Type                    object
Size                     int64
Year                     int64
Month                    int64
Week                     int64
Store                   object
Department              object
dtype: object

### Supervised learning (Classification)

**Sperating predictors and Type columns**

We want to create Supervised learning models in our data. We want to predict Type of Store of Walmart. We dropped Date column as well as we have week, month and year column seperated.

In [165]:
#separating columns for train test split
X = df.drop(columns=['Type','Date', 'IsHoliday','Temperature', 'Fuel_Price','CPI','Unemployment', 
                'CPI', 'Year','Month', 'Week'])
y = df[['Type']]

In [166]:
X.head(5)

Unnamed: 0,Weekly_Sales,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,Size,Store,Department
0,24924.5,0.0,0.0,0.0,0.0,0.0,151315,1,1
1,46039.49,0.0,0.0,0.0,0.0,0.0,151315,1,1
2,41595.55,0.0,0.0,0.0,0.0,0.0,151315,1,1
3,19403.54,0.0,0.0,0.0,0.0,0.0,151315,1,1
4,21827.9,0.0,0.0,0.0,0.0,0.0,151315,1,1


**Splitting the data into train and test set**

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

**Run Naive Bayes**

In [153]:
#initialize your classifier
print("Naive-Bayes Classifier")
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()

Naive-Bayes Classifier


In [154]:
#fit your classifier to training set
nb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [155]:
#use your classifier to predict on the test Set : X_test, save it to a temporary variable y_pred
y_pred =  nb.predict(X_test)

In [157]:
#compare y_pred with actual targets for your test set(y_test) and calculate precision, recall, f1-score
print("Naive-Bayes Classifier")
print("Accuracy: %0.2f" %accuracy_score(y_test, y_pred))
print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall: %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score: %0.2f" %f1_score(y_test, y_pred , average="macro"))

#compute recall and f1-score

Naive-Bayes Classifier
Accuracy: 0.51
Precision: 0.35
Recall: 0.33
F1-score: 0.23


In [158]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[35211    76     0]
 [26672   112     0]
 [ 6843    20     0]]
              precision    recall  f1-score   support

           A       0.51      1.00      0.68     35287
           B       0.54      0.00      0.01     26784
           C       0.00      0.00      0.00      6863

   micro avg       0.51      0.51      0.51     68934
   macro avg       0.35      0.33      0.23     68934
weighted avg       0.47      0.51      0.35     68934



**Run Decision Tree**

In [160]:
print("Decision Tree Classifier")
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

Decision Tree Classifier


In [161]:
#y-pred - same as above
y_pred = clf.predict(X_test)
print("Decision Tree Classifier")
#compute precision,f1-score, recall - same as above
print("Accuracy: %0.2f" %accuracy_score(y_test, y_pred))
print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall: %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score: %0.2f" %f1_score(y_test, y_pred , average="macro"))

Decision Tree Classifier
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00


In [162]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[35287     0     0]
 [    0 26784     0]
 [    0     0  6863]]
              precision    recall  f1-score   support

           A       1.00      1.00      1.00     35287
           B       1.00      1.00      1.00     26784
           C       1.00      1.00      1.00      6863

   micro avg       1.00      1.00      1.00     68934
   macro avg       1.00      1.00      1.00     68934
weighted avg       1.00      1.00      1.00     68934



**Cross validation**

**Run Naive Bayes- Cross validation**

In [167]:
print("Naive Bayes Classifier")
nb = BernoulliNB()

print("Accuracy: %0.2f" %(cross_val_score(nb, X, y, cv=10, scoring='accuracy').mean()))
print("Precision: %0.2f (+/- %0.2f)" % (cross_val_score(nb, X, y, cv=10, scoring='precision_macro').mean(), cross_val_score(nb, X, y, cv=10, scoring='precision_macro').std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (cross_val_score(nb, X, y, cv=10, scoring='recall_macro').mean(), cross_val_score(nb, X, y, cv=10, scoring='recall_macro').std() * 2))
print("F1-score: %0.2f (+/- %0.2f)" % (cross_val_score(nb, X, y, cv=10, scoring='f1_macro').mean(), cross_val_score(nb, X, y, cv=10, scoring='f1_macro').std() * 2))


Naive Bayes Classifier
Accuracy: 0.51
Precision: 0.34 (+/- 0.07)
Recall: 0.33 (+/- 0.00)
F1-score: 0.23 (+/- 0.00)


**Run Decision Tree- Cross validation**

In [168]:
#Repeat the same steps for DecisionTree Classifier
print("Decision Tree Classifier")
clf = tree.DecisionTreeClassifier()

print("Accuracy: %0.2f" % (cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean()))
print("Precision: %0.2f (+/- %0.2f)" % (cross_val_score(clf, X, y, cv=10, scoring='precision_macro').mean(), cross_val_score(clf, X, y, cv=10, scoring='precision_macro').std() * 2))
print("Recall: %0.2f (+/- %0.2f)" % (cross_val_score(clf, X, y, cv=10, scoring='recall_macro').mean(), cross_val_score(clf, X, y, cv=10, scoring='recall_macro').std() * 2))
print("F1-score: %0.2f (+/- %0.2f)" % (cross_val_score(clf, X, y, cv=10, scoring='f1_macro').mean(), cross_val_score(clf, X, y, cv=10, scoring='f1_macro').std() * 2))


Decision Tree Classifier
Accuracy: 0.96
Precision: 0.94 (+/- 0.19)
Recall: 0.97 (+/- 0.13)
F1-score: 0.93 (+/- 0.21)


### Predict Type for Test data

Load test dataset

In [169]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,Store_Dept,Weekly_Sales,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1_1,17147.44,5/4/12,False,75.55,3.749,21290.13,,69.89,4977.35,3261.04,221.6718,7.143,A,151315
1,1_1,18164.2,5/11/12,False,73.77,3.688,8351.4,,10.52,2443.14,3127.88,221.725663,7.143,A,151315
2,1_1,18517.79,5/18/12,False,70.33,3.63,6154.14,,45.11,1675.49,5508.18,221.742674,7.143,A,151315
3,1_1,16963.55,5/25/12,False,77.22,3.561,4039.39,,745.19,1429.96,3631.13,221.744944,7.143,A,151315
4,1_1,16065.49,6/1/12,False,77.95,3.501,6086.21,12.0,370.51,148.75,3690.85,221.747214,7.143,A,151315


We want to do similar modifications in test dataset

In [170]:
test.fillna(0, inplace= True)
#prepare data- converting date
test['Date'] = pd.to_datetime(test['Date'], errors ='coerce')
test['Year']=test['Date'].dt.year
test['Month']=test['Date'].dt.month
test['Week']=test['Date'].dt.week

store_dept = test["Store_Dept"].str.split("_", n = 1, expand = True) 
 
# making separate Store column from new data frame 
test["Store"]= store_dept[0]  
# making separate Dept column from new data frame 
test['Department']= store_dept[1]   
# Dropping old Store_Dept columns 
test.drop(columns =['Store_Dept'], inplace = True) 

#test['Store'] = 'Store_' + df['Store']
#test['Department'] = 'Dept_' + df['Department']

In [171]:
test.head()

Unnamed: 0,Weekly_Sales,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Week,Store,Department
0,17147.44,2012-05-04,False,75.55,3.749,21290.13,0.0,69.89,4977.35,3261.04,221.6718,7.143,A,151315,2012,5,18,1,1
1,18164.2,2012-05-11,False,73.77,3.688,8351.4,0.0,10.52,2443.14,3127.88,221.725663,7.143,A,151315,2012,5,19,1,1
2,18517.79,2012-05-18,False,70.33,3.63,6154.14,0.0,45.11,1675.49,5508.18,221.742674,7.143,A,151315,2012,5,20,1,1
3,16963.55,2012-05-25,False,77.22,3.561,4039.39,0.0,745.19,1429.96,3631.13,221.744944,7.143,A,151315,2012,5,21,1,1
4,16065.49,2012-06-01,False,77.95,3.501,6086.21,12.0,370.51,148.75,3690.85,221.747214,7.143,A,151315,2012,6,22,1,1


In [172]:
X_train =  df.drop(columns=['Type','Date', 'IsHoliday','Temperature', 'Fuel_Price','CPI','Unemployment', 
                'CPI', 'Year','Month', 'Week'])
y_train =  df[['Type']]
X_test =  test.drop(columns=['Type','Date', 'IsHoliday','Temperature', 'Fuel_Price','CPI','Unemployment', 
                'CPI', 'Year','Month', 'Week'])
y_test =  test[['Type']]

We want to check the dimension of train and test data

In [173]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(344667, 9) (344667, 1) (76903, 9) (76903, 1)


fitting the Decision Tree Classifier in test data

In [174]:
print("Decision Tree Classifier")
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

Decision Tree Classifier


In [175]:
#y-pred - same as above
y_pred = clf.predict(X_test)
test['Weekly_Sales_Predicted'] = y_pred



Checking metrices

In [176]:
#compute precision,f1-score, recall - same as above

print("Precision: %0.2f" %precision_score(y_test, y_pred , average="macro"))
print("Recall: %0.2f" %recall_score(y_test, y_pred , average="macro"))
print("F1-score: %0.2f" %f1_score(y_test, y_pred , average="macro"))

Precision: 1.00
Recall: 1.00
F1-score: 1.00


In [177]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[39199     0     0]
 [    0 29743     0]
 [    0     0  7961]]
              precision    recall  f1-score   support

           A       1.00      1.00      1.00     39199
           B       1.00      1.00      1.00     29743
           C       1.00      1.00      1.00      7961

   micro avg       1.00      1.00      1.00     76903
   macro avg       1.00      1.00      1.00     76903
weighted avg       1.00      1.00      1.00     76903



In [59]:
test['Type_predicted']= y_pred

In [None]:
kmeans_df = pd.DataFrame({'Clusters':y_kmeans})

In [60]:
test.head()

Unnamed: 0,Weekly_Sales,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,...,Unemployment,Type,Size,Year,Month,Week,Store,Department,Weekly_Sales_Predicted,Type_predicted
0,17147.44,2012-05-04,False,75.55,3.749,21290.13,0.0,69.89,4977.35,3261.04,...,7.143,A,151315,2012,5,18,Store_Store_1,Dept_Dept_1,A,A
1,18164.2,2012-05-11,False,73.77,3.688,8351.4,0.0,10.52,2443.14,3127.88,...,7.143,A,151315,2012,5,19,Store_Store_1,Dept_Dept_1,A,A
2,18517.79,2012-05-18,False,70.33,3.63,6154.14,0.0,45.11,1675.49,5508.18,...,7.143,A,151315,2012,5,20,Store_Store_1,Dept_Dept_1,A,A
3,16963.55,2012-05-25,False,77.22,3.561,4039.39,0.0,745.19,1429.96,3631.13,...,7.143,A,151315,2012,5,21,Store_Store_1,Dept_Dept_1,A,A
4,16065.49,2012-06-01,False,77.95,3.501,6086.21,12.0,370.51,148.75,3690.85,...,7.143,A,151315,2012,6,22,Store_Store_1,Dept_Dept_1,A,A
