In [1]:
### Importing the required packages and libraries
# we will need numpy and pandas later
import numpy as np
import pandas as pd

#import LogisticRegression Class
from sklearn.linear_model import LogisticRegression
#import DecisionTreeClassifier class
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Randomly splitting the original dataset into training set and testing set
from sklearn.model_selection import train_test_split

In [2]:
# reading a CSV file directly from Web, and store it in a pandas DataFrame:
# "read_csv" is a pandas function to read csv files from web or local device:
walmart_features_df =  pd.read_csv('../../data/features.csv')

walmart_sampleSubmission_df =  pd.read_csv('../../data/sampleSubmission.csv')

walmart_stores_df =  pd.read_csv('../../data/stores.csv')

walmart_test_df =  pd.read_csv('../../data/test.csv')

walmart_train_df =  pd.read_csv('../../data/train.csv')

In [3]:
print("Feature Data Frame Info", walmart_features_df.info())

print("\n")

print("Store Data Frame Info", walmart_stores_df.info())

print("\n")

print("Test Data Frame Info",walmart_test_df.info())

print("\n")

print("Train Data Frame Info", walmart_train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
Store           8190 non-null int64
Date            8190 non-null object
Temperature     8190 non-null float64
Fuel_Price      8190 non-null float64
MarkDown1       4032 non-null float64
MarkDown2       2921 non-null float64
MarkDown3       3613 non-null float64
MarkDown4       3464 non-null float64
MarkDown5       4050 non-null float64
CPI             7605 non-null float64
Unemployment    7605 non-null float64
IsHoliday       8190 non-null bool
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 711.9+ KB
Feature Data Frame Info None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
Store    45 non-null int64
Type     45 non-null object
Size     45 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.1+ KB
Store Data Frame Info None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115064 entries, 0 to 11506

In [4]:
#replace all the nan values with 0, also inplace = true make is permanent
walmart_features_df.fillna(value=0, inplace=True)

In [5]:
#adding all the markdown sales and putting it in one column
walmart_features_df['Markdowns'] = walmart_features_df['MarkDown1'] + walmart_features_df['MarkDown2'] + walmart_features_df['MarkDown3'] + walmart_features_df['MarkDown4'] + walmart_features_df['MarkDown5'] 

In [6]:
#dropping the unncessary columns,
labelsToDrop = ['MarkDown1', 'MarkDown2', 'MarkDown3','MarkDown4','MarkDown5']
walmart_features_df.drop(labels=labelsToDrop,axis=1, inplace=True)

In [7]:
walmart_features_df.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0
1,1,2010-02-12,38.51,2.548,211.24217,8.106,True,0.0
2,1,2010-02-19,39.93,2.514,211.289143,8.106,False,0.0
3,1,2010-02-26,46.63,2.561,211.319643,8.106,False,0.0
4,1,2010-03-05,46.5,2.625,211.350143,8.106,False,0.0


In [55]:
#merging datasets, joining walmart_Store_Df to features by the common column of store.
MergeFeatureAndStore_Df = pd.merge(walmart_features_df,
                 walmart_stores_df[['Store','Size']],
                 on='Store')

In [56]:
MergeFeatureAndStore_Df

Unnamed: 0,Store,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns,Size
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315
1,1,2010-02-12,38.51,2.548,211.242170,8.106,True,0.00,151315
2,1,2010-02-19,39.93,2.514,211.289143,8.106,False,0.00,151315
3,1,2010-02-26,46.63,2.561,211.319643,8.106,False,0.00,151315
4,1,2010-03-05,46.50,2.625,211.350143,8.106,False,0.00,151315
5,1,2010-03-12,57.79,2.667,211.380643,8.106,False,0.00,151315
6,1,2010-03-19,54.58,2.720,211.215635,8.106,False,0.00,151315
7,1,2010-03-26,51.45,2.732,211.018042,8.106,False,0.00,151315
8,1,2010-04-02,62.27,2.719,210.820450,7.808,False,0.00,151315
9,1,2010-04-09,65.86,2.770,210.622857,7.808,False,0.00,151315


In [62]:
#merging train dataset
Merged_Train_Features_Store_Df = pd.merge(MergeFeatureAndStore_Df, walmart_train_df[['Date','Dept','Weekly_Sales']], on='Date')

In [63]:
Merged_Train_Features_Store_Df.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns,Size,Dept,Weekly_Sales
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,1,24924.5
1,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,2,50605.27
2,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,3,13740.12
3,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,4,39954.04
4,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,5,32229.38


In [64]:
Merged_Train_Features_Store_Df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18970650 entries, 0 to 18970649
Data columns (total 11 columns):
Store           int64
Date            object
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
IsHoliday       bool
Markdowns       float64
Size            int64
Dept            int64
Weekly_Sales    float64
dtypes: bool(1), float64(6), int64(3), object(1)
memory usage: 1.6+ GB


In [60]:
Merged_Train_Features_Store_Df

Unnamed: 0,Store,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns,Size,Weekly_Sales
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,24924.50
1,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,50605.27
2,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,13740.12
3,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,39954.04
4,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,32229.38
5,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,5749.03
6,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,21084.08
7,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,40129.01
8,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,16930.99
9,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.00,151315,30721.50


In [61]:
walmart_train_df

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.50,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.90,False
5,1,1,2010-03-12,21043.39,False
6,1,1,2010-03-19,22136.64,False
7,1,1,2010-03-26,26229.21,False
8,1,1,2010-04-02,57258.43,False
9,1,1,2010-04-09,42960.91,False
