In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (Run button or Shift+Enter) prints the names of the entries found in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test CSV files from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/Train.csv", "../input/Test.csv")
)

In [None]:
# Preview the first five training records.
train.head(n=5)

In [None]:
# Preview the first five test-set observations as well.
test.head(n=5)

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes);
# used here to spot columns with missing values.
train.info()

In [None]:
# Same summary for the test frame.
test.info()

In [None]:
# Keep untouched copies of both frames before any preprocessing mutates them.
train_orig, test_orig = train.copy(), test.copy()

In [None]:
# Add a placeholder Item_Outlet_Sales column (all zeros) to the test set so it has the
# same columns as train and the two frames can be combined where needed.
# Note: this mutates `test` in place; the zeros are placeholders, not real sales values.
test["Item_Outlet_Sales"]=0

In [None]:
# Stack train on top of test into a single frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and works on old and new versions alike.
combi=pd.concat([train,test],ignore_index=True)

In [None]:
# Check the last five records (these come from the test set).
combi.tail(n=5)

In [None]:
# Print column dtypes and non-null counts for the combined frame.
combi.info()

In [None]:
# Item_Weight appears to have missing values, so look at its cumulative histogram first.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(x=combi["Item_Weight"], cumulative=True)
ax.set_xlabel("Item weight")
ax.set_ylabel("Frequency count")
plt.show()

In [None]:
# Skewness and kurtosis of the Item_Weight feature.
# A notebook cell only displays its last expression, so both values are printed
# explicitly; previously the skewness was computed and silently discarded.
print("Item_Weight skewness:", combi["Item_Weight"].skew())
print("Item_Weight kurtosis:", combi["Item_Weight"].kurt())

# it does not sound like we have many outliers in item weight

In [None]:
# Summary statistics of Item_Weight before imputation.
combi["Item_Weight"].describe()

In [None]:
# Item weight and outlet size seem to have missing values; fill item weight with its mean.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on an attribute-accessed Series: that form is chained assignment, which pandas
# warns about and which does not modify `combi` once copy-on-write is enabled.
combi["Item_Weight"]=combi["Item_Weight"].fillna(combi["Item_Weight"].mean())

In [None]:
# Summary statistics of Item_Weight again, to compare with the earlier output.
combi["Item_Weight"].describe()

In [None]:
# Summary statistics of Item_Visibility.
combi["Item_Visibility"].describe()

In [None]:
# Histogram of Item_Visibility.
fig, ax = plt.subplots()
ax.hist(x=combi["Item_Visibility"])
ax.set_xlabel("Item Visibility")
ax.set_ylabel("Frequency count")
plt.show()

In [None]:
# Some visibility values are exactly 0, which could be a data-entry or feed problem.
# Show the (rows, columns) shape of the subset with zero visibility.
combi.loc[combi["Item_Visibility"] == 0].shape

In [None]:
# The author counted 879 rows with zero visibility.
# Print the kurtosis explicitly; as a bare non-final expression it was never displayed.
print("Item_Visibility kurtosis:", combi["Item_Visibility"].kurt())

# Treat zeros as missing and replace them with the mean of the remaining values.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer is its
# replacement. It always works column-wise, so the old axis=0 argument is dropped.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=0, strategy="mean")
combi["Item_Visibility"] = imp.fit_transform(combi[["Item_Visibility"]]).ravel()

In [None]:
# Count the remaining zero values in Item_Visibility (zeros were treated as missing).
combi["Item_Visibility"].eq(0).sum()

In [None]:
# Re-check non-null counts and dtypes of the combined frame.
combi.info()

In [None]:
# Repeat the zero count for the visibility column; the author saw no zeros left here.
combi["Item_Visibility"].eq(0).sum()

In [None]:
# Frequency of each Outlet_Type value (Outlet_Size is examined in the following cells).
combi["Outlet_Type"].value_counts()

So, outlet type column has the following unique values:

Supermarket Type1     
Grocery Store         
Supermarket Type3     
Supermarket Type2     

In [None]:
# Frequency of each Outlet_Size value, with missing values counted as their own row.
combi["Outlet_Size"].value_counts(dropna=False)

# The author counted 4016 missing values here; some of those rows are inspected
# below before deciding how to treat them.

In [None]:
# Cross-tabulate outlet type (rows) against outlet size (columns), with totals.
pd.crosstab(
    index=combi["Outlet_Type"],
    columns=combi["Outlet_Size"],
    margins=True,
    dropna=False,
)

In [None]:
# Show the rows whose Outlet_Size is missing.
combi.loc[combi["Outlet_Size"].isna()]

In [None]:
# Fill the missing outlet sizes with the explicit category "Other".
# Assign the result back to the column: fillna(inplace=True) on an attribute-accessed
# Series is chained assignment and does not update `combi` under copy-on-write.
combi["Outlet_Size"]=combi["Outlet_Size"].fillna(value="Other")

In [None]:
combi.info()   # The author expects every column to show a full non-null count at this point.

In [None]:
# Frequency of each raw Item_Fat_Content label.
combi["Item_Fat_Content"].value_counts()

In [None]:
# "LF", "Low Fat" and "low fat" mean the same thing, as do "Regular" and "reg".
# Collapse them to the two canonical labels "low fat" and "reg".
fat_conversion_dict = {label: "low fat" for label in ("Low Fat", "LF", "low fat")}
fat_conversion_dict.update({label: "reg" for label in ("Regular", "reg")})

# Series.map turns any label missing from the dictionary into NaN.
combi["Item_Fat_Content"] = combi["Item_Fat_Content"].map(fat_conversion_dict)

In [None]:
# Check the result: the mapping dictionary only produces the two labels "low fat" and "reg".
combi["Item_Fat_Content"].value_counts()

In [None]:
# Collect the object-dtype columns; at this point the list still includes Item_Identifier.
categorical_cols = combi.select_dtypes(include="object").columns

# One frequency table per categorical column, skipping the item identifier.
value_counts = [
    combi[col].value_counts()
    for col in categorical_cols
    if col != "Item_Identifier"
]

In [None]:
# List every categorical (object-dtype) column except the item identifier and report
# how many distinct categories each one has.
# Names and counts are read from `combi` directly rather than from value_counts() results:
#   * .nunique() on a value_counts() Series counts distinct *frequencies*, so it
#     under-reports whenever two categories are equally common;
#   * since pandas 2.0 the .name of a value_counts() Series is "count", not the column
#     name, which would fill categorical_cols with "count" and break get_dummies later.
categorical_cols=[col for col in combi.select_dtypes(include="object").columns
                  if col != "Item_Identifier"]
for col in categorical_cols:
    print(col)                   # name of the categorical variable
    print(combi[col].nunique())  # number of distinct categories in that variable
    print()

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level of each
# variable so the dummy columns are not perfectly collinear.
combi = pd.get_dummies(
    data=combi,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Preview the frame after one-hot encoding.
combi.head(n=5)

In [None]:
# (rows, columns) of the encoded frame.
combi.shape

In [None]:
# Continuous features to standardise before model building.
scaled_features = [
    "Item_Weight",
    "Item_Visibility",
    "Item_MRP",
]


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the selected columns of the combined frame, then overwrite them
# with their standardised values.
# NOTE(review): the scaler is fitted on train and test rows together — confirm this is intended.
sc = StandardScaler().fit(combi[scaled_features])
combi[scaled_features] = sc.transform(combi[scaled_features])

In [None]:
# Preview the scaled columns.
combi[scaled_features].head(n=5)

In [None]:
# The numeric features are scaled; set two columns aside before modelling:
#   * Item_Identifier is not needed as a model input
#   * Item_Outlet_Sales is the value to be predicted
item_id, y = combi["Item_Identifier"], combi["Item_Outlet_Sales"]

In [None]:
# Non-null identifier count on the first line, shape of the target on the second.
print(item_id.count(), y.shape, sep="\n")

combi dataframe has both train and test observations.
Time to split those.

In [None]:
# Record how many rows came from each original frame so combi can be split back into X and y.
orig_train_len, orig_test_len = len(train), len(test)
print(orig_train_len, orig_test_len)

# Identifier and target columns for the combined frame (train and test rows together).
combi_item_id = combi["Item_Identifier"]
combi_y = combi["Item_Outlet_Sales"]

# Both lengths cover train and test rows.
print(len(combi_item_id), len(combi_y), sep="\n")

In [None]:
# Build the modelling data from the training rows only (index labels 0 .. orig_train_len-1;
# .loc slicing includes the end label). The item id and the target are not features.
X = combi.loc[:orig_train_len - 1].drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])
y = combi.loc[:orig_train_len - 1, ['Item_Outlet_Sales']]


# Let us do k-fold cross validation here with the following regression models:
1. Linear regression
2. Ridge regression model
3. Decision tree regression model
4. Random Forest regression model

In [None]:
# let us start with model selection first here. Since this is a regression problem, import all those relevant models/.
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Candidate regressors as (short name, estimator) pairs, all with default hyper-parameters.
# The tree-based models are randomised, so they get a fixed random_state; without it
# their cross-validation scores change on every run and the comparison is not reproducible.
models = [
    ('LR', LinearRegression()),
    ('RR', Ridge()),
    ('DT', DecisionTreeRegressor(random_state=10)),
    ('RF', RandomForestRegressor(random_state=10)),
]


In [None]:
# Show the configured (name, estimator) pairs.
print(models)

In [None]:
# List the scoring names that can be passed to the cross-validation helpers.
# metrics.SCORERS was deprecated in scikit-learn 1.1 and removed in 1.3;
# get_scorer_names() (available since 1.0) is the supported replacement.
from sklearn import metrics
metrics.get_scorer_names()

In [None]:
# Flatten the single-column target frame into a 1-D array (the form passed to cross_val_score).
np.ravel(y)

In [None]:
# 10-fold cross-validation of every candidate model, compared by RMSE.
from sklearn.model_selection import KFold,cross_val_score

# One seeded splitter shared by all models, so each model is scored on the same folds.
# shuffle=True is required for random_state to have any effect; current scikit-learn
# raises ValueError when random_state is given with shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=10)
y_flat = np.array(y).ravel()   # cross_val_score expects a 1-D target

names = []
rmse_scores = []
for name, model in models:
    # Scores are negated MSE values, one per fold.
    neg_mse = cross_val_score(model, X, y_flat, cv=kfold, scoring='neg_mean_squared_error')
    names.append(name)
    # Square root of the mean fold MSE (not the mean of the per-fold RMSEs).
    rmse_scores.append(np.sqrt(np.abs(neg_mse.mean())))
kf_cross_val = pd.DataFrame({'Model Name': names, 'mean RMSE Score': rmse_scores})
print(kf_cross_val)

Cross-validation suggests that the Linear Regression and Ridge Regression models give the lowest RMSE values, so let us go with Linear Regression.

In [None]:
# Hold out part of the training rows for evaluation (25 % is train_test_split's default).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Fit a linear regression on the training split and predict on the held-out split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lrg = LinearRegression().fit(X_train, y_train)   # fit() returns the fitted estimator
lrg_Xtest_predictions = lrg.predict(X_test)

In [None]:
# Mean squared error of the linear model on the held-out split.
print("Mean sqaured error is:",
      mean_squared_error(y_true=y_test, y_pred=lrg_Xtest_predictions))

In [None]:
# Root mean squared error of the linear model on the held-out split.
print("Root Mean sqaured error is:",
      np.sqrt(mean_squared_error(y_true=y_test, y_pred=lrg_Xtest_predictions)))

In [None]:
# Report the fitted intercept and coefficients of the linear model.
for label, fitted_value in (
    ("Linear regression model (lrg) Intercept is: ", lrg.intercept_),
    ("Linear regression model (lrg) Coefficients are: ", lrg.coef_),
):
    print(label, fitted_value)

In [None]:
# Bar chart of the model coefficients, sorted from most negative to most positive.
plt.figure(figsize=(20, 20))
predictors = X_train.columns
# coef_ is flattened so each coefficient lines up with one predictor name.
coefficients = pd.Series(data=np.ravel(lrg.coef_), index=predictors).sort_values()
coefficients.plot.bar(title='Model Coefficients', fontsize=20)

# Placeholder for the submission step: use the linear regression model above to predict on the original test data and write those predictions to a submission file.

In [None]:
# Fit a PCA that keeps every component and inspect how the explained variance accumulates.
from sklearn.decomposition import PCA
# n_components is left at its default (None) so all components are kept whatever the
# number of feature columns; the previous hard-coded 37 raises an error if the encoded
# frame has fewer columns and silently truncates if it has more.
pca=PCA()
pca.fit(X_train)
print("explained variance ratios: ",pca.explained_variance_ratio_)
# Cumulative explained variance in percent (each ratio rounded to 4 decimals first).
cum_variance=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print("cumulative explained variance ratios: ",cum_variance)
plt.plot(cum_variance)

In [None]:
# Zero-based position of the first component at which the cumulative explained variance
# exceeds 99 %. The number of components needed is therefore this index plus one.
index_for_99percent_variance = np.flatnonzero(cum_variance > 99.0)[0]
index_for_99percent_variance

In [None]:
# The author concluded that 13 components are enough to reach 99 % of the variance,
# so the PCA is rebuilt with 13 components only.
# NOTE(review): 13 is hard-coded; confirm it equals index_for_99percent_variance + 1.
pca = PCA(n_components=13).fit(X_train)

# Project both splits onto the components learned from the training split.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("explained variance ratios: ", pca.explained_variance_ratio_)
cum_variance = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
print("cumulative explained variance ratios: ", cum_variance)
plt.plot(cum_variance)

In [None]:
# Linear regression on the PCA-reduced features.
lrgp = LinearRegression().fit(X_train_pca, y_train)   # fit() returns the fitted estimator
lrgp_Xtest_pca_predictions = lrgp.predict(X_test_pca)

In [None]:
# Root mean squared error of the PCA-based linear model on the held-out split.
print("Root Mean sqaured error using PCA is:",
      np.sqrt(mean_squared_error(y_true=y_test, y_pred=lrgp_Xtest_pca_predictions)))