In [46]:
# data analysis
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# import all the ml model libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix

# load the training and test sets from their CSV files (train first, then test)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_walmart.csv", "test_walmart.csv")
)

## Exploratory Data Analysis
Let's begin by exploring the data that I have here. 

In [3]:
# summary statistics for every column, numeric and non-numeric alike
column_summary = train.describe(include="all")
column_summary

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
count,647054.0,647054.0,647054,642925.0,647054.0,645693,642925.0
unique,,,7,,,68,
top,,,Sunday,,,GROCERY DRY GOODS,
freq,,,133975,,,70402,
mean,58.584511,96167.640078,,30606980000.0,1.108878,,3726.884567
std,157.635533,55545.485154,,91201340000.0,0.700776,,2780.966158
min,3.0,5.0,,834.0,-12.0,,0.0
25%,27.0,49268.0,,3400001000.0,1.0,,1404.0
50%,39.0,97074.0,,7050103000.0,1.0,,3352.0
75%,40.0,144316.0,,30065310000.0,1.0,,5501.0


We have 647,054 rows of data. The various features we have are:

<b>TripType</b> - This is what we are trying to classify. There are roughly 40 types of trips.<br>
<b>VisitNumber</b> - The id corresponding to a single trip by a single customer <br>
<b>Weekday</b> - The day the trip was made <br>
<b>Upc</b> - The product code<br>
<b>ScanCount</b> - Number of items purchased (the data contains negative values, which presumably represent returned items)<br>
<b>DepartmentDescription</b> - a high level description of the type of product<br>
<b>FinelineNumber</b> - a more refined category for the type of product<br>

Our categorical features are Weekday, DepartmentDescription, and FinelineNumber. <br>
Our only ordinal feature is ScanCount. However, I predict that the vast majority of the items will have a ScanCount of 1.

In [7]:
# Peek at ten random rows. A fixed random_state makes the sample reproducible,
# so the rows shown stay the same after Restart Kernel -> Run All.
train.sample(10, random_state=42)

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
493082,35,146338,Sunday,1254601000.0,1,IMPULSE MERCHANDISE,135.0
647044,39,191346,Sunday,5100020000.0,1,GROCERY DRY GOODS,3107.0
172224,43,51964,Saturday,83972400000.0,1,HARDWARE,9802.0
104089,25,30248,Tuesday,4529902000.0,1,BOYS WEAR,616.0
514082,6,152182,Monday,4150000000.0,1,GROCERY DRY GOODS,2302.0
47622,44,13996,Sunday,1113200000.0,1,PETS AND SUPPLIES,806.0
9057,25,3130,Friday,60156040000.0,1,LADIESWEAR,1745.0
248382,40,73536,Tuesday,7418229000.0,1,PERSONAL CARE,3159.0
17987,39,5739,Friday,7320290000.0,1,FROZEN FOODS,4065.0
163233,38,49651,Saturday,4178900000.0,1,GROCERY DRY GOODS,3105.0


Here, we have a sample of some of the data that we will be working with. We will need to change Weekday and DepartmentDescription from strings to floats. But let's do that later, since for now it's easier to see the data I am working with.

In [19]:
# count the missing values in each column
missing_per_column = train.isnull().sum()
print(missing_per_column)

TripType                    0
VisitNumber                 0
Weekday                     0
Upc                      4129
ScanCount                   0
DepartmentDescription    1361
FinelineNumber           4129
dtype: int64


A few thousand of our values are null. Given that we have over 647,000 rows of data, dropping those rows would still leave us with around 99.4% of all the data, so it is safe to ignore these data entries. Therefore, I will drop them from our data. **TODO:** I can't figure out how to drop the rows right now, so I will figure that out later (`train.dropna()` looks like the right tool).

In [48]:
# Show every row that is missing a DepartmentDescription or a Upc.
# Only the last expression of a cell is displayed, so two separate
# train.loc[...] lines would silently discard the first result; combining
# both conditions into one mask keeps both in the output.
missing_dept_or_upc = train.DepartmentDescription.isnull() | train.Upc.isnull()
train.loc[missing_dept_or_upc]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
25,26,8,Friday,,1,,
548,27,259,Friday,,3,,
549,27,259,Friday,,1,,
959,999,409,Friday,,-1,,
1116,39,479,Friday,,1,,
1134,999,484,Friday,,-2,,
1135,999,484,Friday,,-2,,
1155,44,496,Friday,,1,PHARMACY RX,
1216,5,521,Friday,,1,PHARMACY RX,
1373,5,585,Friday,,1,PHARMACY RX,


In [47]:
# re-count the missing values per column (isna is an alias of isnull)
print(train.isna().sum())

TripType                    0
VisitNumber                 0
Weekday                     0
Upc                      4129
ScanCount                   0
DepartmentDescription    1361
FinelineNumber           4129
dtype: int64
