In [None]:
# Reading the dataset (these paths are only valid inside the kernel!)
library("readr")
# stringsAsFactors is passed explicitly: the read.csv() default changed from
# TRUE to FALSE in R 4.0.0, and the rest of this script describes and treats
# the text columns (Item_Type, Outlet_Size, ...) as factors.
train <- read.csv("../input/Train_UWu5bXk.csv", stringsAsFactors = TRUE)
test <- read.csv("../input/Test_u94Q5KV.csv", stringsAsFactors = TRUE)

In [None]:
# Preview the first six rows of the training dataset
head(train, n = 6L)

In [None]:
# Inspect the internal structure (column types and sample values) of the training dataset.
utils::str(train)
## Variables like Item_Type show up as factors only when the CSV was read with
## stringsAsFactors = TRUE (the default before R 4.0.0); in that case there is
## no need to convert them.
## We have 7 categorical variables, 4 numerical variables and 1 integer variable.

In [None]:
# Inspect the internal structure (column types and sample values) of the test dataset.
utils::str(test)
## Variables like Item_Type show up as factors only when the CSV was read with
## stringsAsFactors = TRUE (the default before R 4.0.0); in that case there is
## no need to convert them.
## There is no Item_Outlet_Sales column in the test data: that is the variable
## we have to predict.

In [None]:
# Summarizing every column of the training data (numeric and non-numeric alike)
summary(object = train)
## Note that the number of NA (missing) values is also reported per column

In [None]:
# Summarizing a single variable (here Outlet_Establishment_Year).
# The full column name is spelled out: the abbreviated `Outlet_Establishment`
# only worked through `$` partial matching, which returns NULL as soon as a
# second column shares the same prefix.
summary(train$Outlet_Establishment_Year)

In [None]:
# Frequency of each category of a categorical variable (here Outlet_Type)
table(train[["Outlet_Type"]])

In [None]:
# Same frequency table, with an extra <NA> cell counting the missing values
table(train[["Outlet_Type"]], useNA = "always")
## Here the number of missing values is 0

In [None]:
# Distribution of a categorical variable as percentages (NA cell included)
100 * prop.table(table(train[["Outlet_Type"]], useNA = "always"))

In [None]:
## Explore the data: check out every variable to get more comfortable with it
## (replace '---' with the desired variable). You can also check out the test data.

In [None]:
#Numeric Variables
#summary(train$---)

In [None]:
#Categorical Variables
#table(train$---, useNA = "always")
#prop.table(table(train$---, useNA = "always"))*100

In [None]:
## We have to perform same operations on train and test data. Rather than doing the same
## operations twice, it is better to append(attach) the train and test data together.

In [None]:
# Appending the train and test datasets

## rbind() needs both data frames to have the same columns, so the dependent
## variable (Item_Outlet_Sales) is first added to the test dataset, filled with NA.
test[["Item_Outlet_Sales"]] <- NA

## Stack train on top of test to create the combined dataset named 'data'
data <- rbind(train, test)

In [None]:
# Rows and columns of the combined data
c(nrow(data), ncol(data))
## Compare with the train and test datasets
c(nrow(train), ncol(train))
c(nrow(test), ncol(test))

In [None]:
# Checking the missing (NA) values of the data
na_flags <- is.na(data)
summary(na_flags)
## is.na() on a data frame returns a logical matrix with one cell per data point;
## TRUE marks a missing value, and summary() counts TRUE/FALSE per column.

In [None]:
## We can see that only Item_Weight has missing values.
## Item_Outlet_Sales has missing values because it is the dependent variable, and the data points 
## corresponding to the test data have missing values. So we don't have to worry about that.

In [None]:
# Imputing missing values.
# Method-1 (Mean method)
## The missing values are replaced by the mean of the observed values.

# Finding the mean of the variable
item_weight <- data$Item_Weight
mean(item_weight)
## returns NA when any value is missing, so we must ask for the mean of the
## non-NA values only
mean(item_weight, na.rm = TRUE)
## na.rm = TRUE drops the NA values before the mean is calculated.
## OR
mean(item_weight[!is.na(item_weight)])
## '!' negates is.na(), so only the non-missing values are kept

In [None]:
## Now we insert this value in place of the missing values.
## The result goes into a separate copy of the data so it can be inspected.
data_mean_impu <- data
weight_is_missing <- is.na(data_mean_impu$Item_Weight)
observed_mean <- mean(data$Item_Weight[!weight_is_missing])
data_mean_impu$Item_Weight[weight_is_missing] <- observed_mean
summary(data_mean_impu$Item_Weight)

In [None]:
# Method-2 (Median imputation)
## The missing values are replaced by the median of the observed values.
median(data$Item_Weight, na.rm = TRUE)
median(data$Item_Weight[which(!is.na(data$Item_Weight))])

In [None]:
## Now we insert this value in place of the missing values.
## The result goes into a separate copy of the data so it can be inspected.
data_median_impu <- data
weight_is_na <- is.na(data_median_impu$Item_Weight)
observed_median <- median(data$Item_Weight[!weight_is_na])
data_median_impu$Item_Weight[weight_is_na] <- observed_median
summary(data_median_impu$Item_Weight)

In [None]:
##Since the only variable with missing values is numerical, we won't be using the k-NN method,
##which is for categorical variables. However, you can have a glance at k-NN implementation
##  https://www.youtube.com/watch?v=u8XvfhBdbMw 

In [None]:
#Method-3 (Advanced Packages) (Optional)
## There are many powerful packages in R which help us impute missing values,
## e.g. mice, missForest, mi, Amelia etc. Each package has its specific uses
## and pitfalls. For more information, you can refer to:
## https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/


In [None]:
## First, we need to understand the distribution of Item_Weight. It is easier to
## understand when we can see it, so we plot histograms of the original column
## and of the mean-imputed and median-imputed copies for comparison.
hist(data$Item_Weight)
hist(data_mean_impu$Item_Weight)
hist(data_median_impu$Item_Weight)

In [None]:
## As we can see, imputing the mean or the median would be a bad idea in this
## case (it biases the data too much). Instead, we will use the package Hmisc.
## But first, we will do some data cleaning.
## In Item_Fat_Content, "LF", "low fat" and "Low Fat" mean the same thing, and so
## do "reg" and "Regular". Therefore "LF" and "low fat" are recoded to "Low Fat",
## and "reg" to "Regular".
## Whole values are matched exactly (not as substrings, as gsub() would), so a
## label that merely contains "reg" or "LF" cannot be corrupted.
## The column is left as a character vector, as the gsub() version did.
fat_content <- as.character(data$Item_Fat_Content)
fat_content[fat_content %in% c("LF", "low fat")] <- "Low Fat"
fat_content[fat_content %in% "reg"] <- "Regular"
data$Item_Fat_Content <- fat_content
table(data$Item_Fat_Content) # Viewing the variable

In [None]:
## Item_Type has categories which are not food items, for which Item_Fat_Content
## makes no sense. Those rows get a new category, "None".

## For the Item_Type values "Health and Hygiene", "Household" and "Others",
## Item_Fat_Content is set to "None".
## The column is handled as character and converted to a factor at the end, so
## no level has to be registered beforehand, and a single %in% mask replaces
## the three separate assignments (it is also safe when nothing matches).
non_food_types <- c("Health and Hygiene", "Household", "Others")
fat_content <- as.character(data$Item_Fat_Content)
fat_content[data$Item_Type %in% non_food_types] <- "None"
data$Item_Fat_Content <- as.factor(fat_content)
table(data$Item_Fat_Content) # Viewing the variable

In [None]:
## Notice that the column Outlet_Size has blank values. We will now proceed to replace them.
## We will compare it with other variables to better understand the missing values.

In [None]:
# Count the rows for every observed combination of outlet identifier, outlet
# type, location type and outlet size.
Outlet_Size_Length <- aggregate(
  data$Outlet_Size,
  by = list(
    Outlet_Identifier = data$Outlet_Identifier,
    Outlet_Type = data$Outlet_Type,
    Outlet_Location_Type = data$Outlet_Location_Type,
    Outlet_Size = data$Outlet_Size
  ),
  FUN = length
)
names(Outlet_Size_Length) <- c("Outlet_Identifier", "Outlet_Type", "Outlet_Location_Type", "Outlet_Size", "number")
Outlet_Size_Length # Viewing the data frame

In [None]:
# Outlet_Size distribution (NA cell included) within three outlet types ...
table(data$Outlet_Size[which(data$Outlet_Type == "Grocery Store")], useNA = "always")
table(data$Outlet_Size[which(data$Outlet_Type == "Supermarket Type1")], useNA = "always")
table(data$Outlet_Size[which(data$Outlet_Type == "Supermarket Type2")], useNA = "always")
# ... and within each location tier
table(data$Outlet_Size[which(data$Outlet_Location_Type == "Tier 1")], useNA = "always")
table(data$Outlet_Size[which(data$Outlet_Location_Type == "Tier 2")], useNA = "always")
table(data$Outlet_Size[which(data$Outlet_Location_Type == "Tier 3")], useNA = "always")

In [None]:
## As per the analysis, we are better off fixing category 'Small' on every
## 'Grocery Store', and the remaining blank values are assigned 'Small' as well.
## NOTE(review): the three outlets below are presumably the ones with a blank
## Outlet_Size - confirm against the Outlet_Size_Length table.
## A single %in% mask replaces the three per-outlet assignments and is safe
## when an identifier matches no rows.
outlets_to_small <- c("OUT010", "OUT017", "OUT045")
data$Outlet_Size[data$Outlet_Identifier %in% outlets_to_small] <- "Small"
data$Outlet_Size <- factor(data$Outlet_Size) # Running factor on an existing factor variable will reset the levels to only those that are present.
table(data$Outlet_Size)

In [None]:
## We only care about how old an outlet is, not the year it was established,
## so Outlet_Year (the age in years) is derived from Outlet_Establishment_Year.
reference_year <- 2013 # Since this is 2013 data
outlet_age <- reference_year - data$Outlet_Establishment_Year
data$Outlet_Year <- as.factor(outlet_age)
table(data$Outlet_Year)

In [None]:
## Density plot of Item_MRP with ggplot (narrow bandwidth: adjust = 1/5)
library(ggplot2)
ggplot(data = data, mapping = aes(x = Item_MRP)) +
  geom_density(adjust = 1 / 5)
## It is obvious that we would be better off converting Item_MRP to a categorical variable

In [None]:
# Bin Item_MRP into price bands: < 70 "Low", [70, 130) "Medium",
# [130, 201) "High", >= 201 "Very High". findInterval() gives 0..3 for the
# four bands, which indexes the label vector after adding 1.
mrp_labels <- c("Low", "Medium", "High", "Very High")
mrp_band <- findInterval(data$Item_MRP, c(70, 130, 201)) + 1L
data$MRP_Factor <- as.factor(mrp_labels[mrp_band])
table(data$MRP_Factor) # Viewing the new variable

In [None]:
## Item_Visibility has a minimum value of 0, and it seems absurd for an item to
## have 0 visibility, so that column will be modified.
## Step 1: mean visibility per item, computed from the non-zero rows only.
nonzero_rows <- which(data$Item_Visibility != 0)
data_Visibility <- data[nonzero_rows, c("Item_Identifier", "Item_Visibility")]
IV_m <- aggregate(Item_Visibility ~ Item_Identifier, data = data_Visibility, FUN = mean)

In [None]:
## Now, we will substitute the values according to IV_m in Item_Visibility

In [None]:
# Replace every zero Item_Visibility with the mean non-zero visibility of the
# same item, looked up in IV_m. This is one vectorised lookup rather than a
# loop over a hard-coded number of items, so it covers every row of IV_m.
zero_rows <- which(data$Item_Visibility == 0)
iv_row <- match(as.character(data$Item_Identifier[zero_rows]),
                as.character(IV_m$Item_Identifier))
# Items with no non-zero observation have no entry in IV_m; they keep their 0.
has_mean <- !is.na(iv_row)
data$Item_Visibility[zero_rows[has_mean]] <- IV_m$Item_Visibility[iv_row[has_mean]]

In [None]:
summary(data[["Item_Visibility"]]) # Checking the result of the replacement

In [None]:
## Item_Identifier, i.e. the unique ID of each item, starts with either
## FD, DR or NC. Judging by the categories, these look like Food, Drinks
## and Non-Consumables, so we create a new broad variable with these three levels.
## The patterns are anchored with '^' so only the start of the ID is tested (the
## unanchored version matched the code anywhere in the ID), and grepl() is used
## directly as the logical condition.
data$Item_Identifier <- as.character(data$Item_Identifier)
is_food <- grepl("^FD", data$Item_Identifier)
is_drink <- grepl("^DR", data$Item_Identifier)
data$Item_Type_Broad <- as.factor(ifelse(is_food, "Foods",
                                  ifelse(is_drink, "Drinks",
                                         "Non-Consumables")))
data$Item_Identifier <- as.factor(data$Item_Identifier)

In [None]:
table(data[["Item_Type_Broad"]], useNA = "always") # Checking the new variable

In [None]:
# We will also create a more concise Item_Type_Broad2 from Item_Type.
# Each listed Item_Type maps to a broader group; any other (non-missing) type
# falls into "NOTA", and a missing Item_Type stays NA.
item_type_groups <- c(
  "Baking Goods" = "Bakery",
  "Breads" = "Bakery",
  "Canned" = "Refrigerated",
  "Frozen Foods" = "Refrigerated",
  "Dairy" = "Refrigerated",
  "Meat" = "Non-Veg",
  "Seafood" = "Non-Veg",
  "Household" = "Other",
  "Others" = "Other",
  "Health and Hygiene" = "Other",
  "Hard Drinks" = "Alcoholic",
  "Snack Foods" = "Snacks",
  "Soft Drinks" = "Snacks",
  "Breakfast" = "Breakfast"
)
item_type_chr <- as.character(data$Item_Type)
broad_group <- unname(item_type_groups[item_type_chr])
broad_group[is.na(broad_group) & !is.na(item_type_chr)] <- "NOTA"
data$Item_Type_Broad2 <- as.factor(broad_group)
table(data$Item_Type_Broad2, useNA = "always")

In [None]:
## Now we will impute the missing values in Item_Weight
set.seed(825) # Setting seed so the imputation is reproducible
library(Hmisc)
# The predictor columns are selected by name rather than by position, so the
# call does not depend on the order in which earlier cells added columns.
impute_cols <- c("Item_Weight", "Item_Fat_Content", "Item_Type",
                 "Item_Type_Broad", "Item_Type_Broad2")
imputed_arg <- aregImpute(~ Item_Weight + Item_Fat_Content + Item_Type + Item_Type_Broad + Item_Type_Broad2,
                          data = data[, impute_cols], boot.method = "approximate bayesian", n.impute = 1)

In [None]:
# Write the imputed weights into the rows where Item_Weight is missing.
# One vectorised assignment does the whole job: the former loop repeated the
# same assignment a hard-coded number of times and only its first pass had
# any effect.
missing_weight_rows <- which(is.na(data$Item_Weight))
if (length(missing_weight_rows) > 0) {
  # imputed$Item_Weight is a matrix with one column per imputation
  # (n.impute = 1 above); the first column is used.
  imputed_weight <- as.vector(imputed_arg$imputed$Item_Weight[, 1])
  # Guard against silent recycling if the imputation was run on different data.
  stopifnot(length(imputed_weight) == length(missing_weight_rows))
  data$Item_Weight[missing_weight_rows] <- imputed_weight
}

In [None]:
summary(data[["Item_Weight"]]) # Checking that no NA values remain

In [None]:
## We are done with the data cleaning and feature engineering. If you can come up with more ideas,
## you are welcome to try.
## Now, we will divide the data into train and test.

In [None]:
# Dividing data back into train and test.
# `data` was built as rbind(train, test), so its first nrow(train) rows are the
# training rows and the rest are the test rows. The boundary is taken from
# `train` itself instead of hard-coded row numbers.
n_train <- nrow(train)
is_train_row <- seq_len(nrow(data)) <= n_train
train <- data[is_train_row, ]
test <- data[!is_train_row, ]