In [1]:
### Importing the required packages and libraries
# we will need numpy and pandas later
import numpy as np
import pandas as pd

#import LogisticRegression Class
from sklearn.linear_model import LogisticRegression
#import DecisionTreeClassifier class
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Randomly splitting the original dataset into training set and testing set
from sklearn.model_selection import train_test_split

In [2]:
# reading a CSV file directly from Web, and store it in a pandas DataFrame:
# "read_csv" is a pandas function to read csv files from web or local device:
walmart_features_df =  pd.read_csv('features.csv')

walmart_sampleSubmission_df =  pd.read_csv('sampleSubmission.csv')

walmart_stores_df =  pd.read_csv('stores.csv')

walmart_test_df =  pd.read_csv('test.csv')

walmart_train_df =  pd.read_csv('train.csv')

In [3]:
walmart_features_df.fillna(value=0, inplace=True)

#adding all the markdown sales and putting it in one column
walmart_features_df['Markdowns'] = walmart_features_df['MarkDown1'] + walmart_features_df['MarkDown2'] + walmart_features_df['MarkDown3'] + walmart_features_df['MarkDown4'] + walmart_features_df['MarkDown5'] 

labelsToDrop = ['MarkDown1', 'MarkDown2', 'MarkDown3','MarkDown4','MarkDown5']
walmart_features_df.drop(labels=labelsToDrop,axis=1, inplace=True)

In [4]:
#merging datasets, joining walmart_Store_Df to features by the common column of store.
MergeFeatureAndStore_Df = pd.merge(walmart_features_df,
                 walmart_stores_df[['Store','Size']],
                 on='Store')

MergeFeatureAndStore_Df.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns,Size
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315
1,1,2010-02-12,38.51,2.548,211.24217,8.106,True,0.0,151315
2,1,2010-02-19,39.93,2.514,211.289143,8.106,False,0.0,151315
3,1,2010-02-26,46.63,2.561,211.319643,8.106,False,0.0,151315
4,1,2010-03-05,46.5,2.625,211.350143,8.106,False,0.0,151315


In [5]:
#merging train dataset
Merged_Train_Features_Store_Df = pd.merge(MergeFeatureAndStore_Df, walmart_train_df[['Date','Dept','Weekly_Sales']], on='Date')
Merged_Train_Features_Store_Df.head()


Unnamed: 0,Store,Date,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns,Size,Dept,Weekly_Sales
0,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,1,24924.5
1,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,2,50605.27
2,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,3,13740.12
3,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,4,39954.04
4,1,2010-02-05,42.31,2.572,211.096358,8.106,False,0.0,151315,5,32229.38


In [6]:
#fix date into two columns one for month and other for year
from pandas import DatetimeIndex
df = Merged_Train_Features_Store_Df
df.Date = pd.to_datetime(df.Date)

In [7]:
df['Year'] = DatetimeIndex(df['Date']).year
df['Month']= DatetimeIndex(df['Date']).month
df['Day'] = DatetimeIndex(df['Date']).day
df = df.drop(columns=['Date'])
df[0::1000]

Unnamed: 0,Store,Temperature,Fuel_Price,CPI,Unemployment,IsHoliday,Markdowns,Size,Dept,Weekly_Sales,Year,Month,Day
0,1,42.31,2.572,211.096358,8.106,False,0.00,151315,1,24924.50,2010,2,5
1000,1,42.31,2.572,211.096358,8.106,False,0.00,151315,17,11908.60,2010,2,5
2000,1,42.31,2.572,211.096358,8.106,False,0.00,151315,56,359.50,2010,2,5
3000,2,40.19,2.572,210.752605,8.324,False,0.00,202307,49,11501.46,2010,2,5
4000,2,40.19,2.572,210.752605,8.324,False,0.00,202307,90,6171.21,2010,2,5
5000,2,40.19,2.572,210.752605,8.324,False,0.00,202307,40,24022.09,2010,2,5
6000,3,45.71,2.572,214.424881,7.368,False,0.00,37392,19,4891.96,2010,2,5
7000,3,45.71,2.572,214.424881,7.368,False,0.00,37392,40,34870.18,2010,2,5
8000,3,45.71,2.572,214.424881,7.368,False,0.00,37392,18,3218.60,2010,2,5
9000,4,43.76,2.598,126.442065,8.623,False,0.00,205863,85,4311.13,2010,2,5


In [8]:
#creating the feature matrix 
feature_cols = ['Store', 'Temperature','Fuel_Price','CPI','Unemployment', 'Markdowns', 'Size', 'Dept', 'Weekly_Sales']
X = Merged_Train_Features_Store_Df[feature_cols]

In [9]:
#Series of labels
y = Merged_Train_Features_Store_Df['IsHoliday']

y[0::10]

0           False
10          False
20          False
30          False
40          False
50          False
60          False
70          False
80          False
90          False
100         False
110         False
120         False
130         False
140         False
150         False
160         False
170         False
180         False
190         False
200         False
210         False
220         False
230         False
240         False
250         False
260         False
270         False
280         False
290         False
            ...  
18970350    False
18970360    False
18970370    False
18970380    False
18970390    False
18970400    False
18970410    False
18970420    False
18970430    False
18970440    False
18970450    False
18970460    False
18970470    False
18970480    False
18970490    False
18970500    False
18970510    False
18970520    False
18970530    False
18970540    False
18970550    False
18970560    False
18970570    False
18970580    False
18970590  

In [10]:
#logreg instantiated as an object of LogisticRegression
logreg = LogisticRegression()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=2)

In [12]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
y_predict_logreg = logreg.predict(X_test)

In [14]:
score_logreg = accuracy_score(y_test, y_predict_logreg)

In [15]:
print(score_logreg)

0.928443674834547


# GIT TESSST