<a href="https://colab.research.google.com/github/kenedy21/kenedy21/blob/ml_tutorial/ml_Bigmart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics  import r2_score,mean_squared_error

In [None]:
##load data
traind = pd.read_csv('Train.csv')
testd= pd.read_csv('Test.csv')

In [None]:
## check rows and columns
traind.shape,testd.shape

In [None]:
traind.columns

In [5]:
#merge datasets to perform eda
traind['source'] = 'train'
testd['source'] = 'test'
data = pd.concat([traind,testd],ignore_index=True)

In [None]:
data 

In [None]:
## summary of data
data.describe()

In [None]:
## check missing values
data.isnull().sum()

In [None]:
## all columns
data.columns

Print unique values in columns

In [10]:
data['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [11]:
data['Outlet_Establishment_Year'].unique()

array([1999, 2009, 1998, 1987, 1985, 2002, 2007, 1997, 2004])

In [12]:
data['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [13]:
data['outlet_age'] = 2022 - data['Outlet_Establishment_Year']
                                 

print count values

In [14]:
data['Item_Fat_Content'].count()

14204

In [15]:
data['Outlet_Size'].count()

10188

In [16]:
## mode in outlet size
data['Outlet_Size'].mode()[0]

'Medium'

Replace missing values

In [17]:
data['Outlet_Size'] = data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0])

In [18]:
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

plotting

In [None]:
data['Item_Visibility'].hist(bins=10)

Detect outliers

In [20]:
## calculate first and second quartile 
Q1 = data['Item_Visibility'].quantile(0.25)
Q3 = data['Item_Visibility'].quantile(0.75)

In [21]:
## calculate inter_quantile_range iqr
IQR = Q3 - Q1

In [22]:
##Remove outlier and store in fill_data variable
fill_data = data.query('(@Q1 - 1.5 * @IQR) <= Item_Visibility <= (@Q3 + 1.5 * @IQR)')

In [None]:
fill_data

In [24]:
# modify Item_Visibility by converting into categories ; 
## low,visibility and high
data['Item_Visibility_Bins'] = pd.cut(data['Item_Visibility'],[0.000,0.065,0.13,0.2],
                                      labels = ['low_viz','Viz','High_Viz'])

In [None]:
## count visibility bins
data['Item_Visibility_Bins'].value_counts()

In [26]:
## Replace null values with low visibility
data['Item_Visibility_Bins'] = data['Item_Visibility_Bins'].replace(np.nan,'low viz',regex = True)

In [27]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(['low fat', 'LF'],'Low Fat')

In [28]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace('reg','Regular')


In [29]:
data['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular'], dtype=object)

Encode categorical values using label encoder

In [30]:
le = LabelEncoder()


In [31]:
#transform item fat content
data['Item_Fat_Content'] = le.fit_transform(data['Item_Fat_Content'])

In [32]:
data['Item_Visibility_Bins'] = le.fit_transform(data['Item_Visibility_Bins'])

In [33]:
data['Outlet_Size'] = le.fit_transform(data['Outlet_Size'])

In [34]:
data['Outlet_Location_Type'] = le.fit_transform(data['Outlet_Location_Type'])


In [None]:
dummies = pd.get_dummies(data['Outlet_Type'])
dummies.head()

In [None]:
data['Item_Identifier'].value_counts()

In [37]:
data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x : x[0:2])

In [38]:
data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food',
'NC': 'Non-consumable',
'DR': 'Drinks'})

In [None]:
data['Item_Type_Combined'].value_counts()

In [40]:
data = pd.get_dummies(data, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined'])

In [None]:
data.columns

In [41]:
import warnings
warnings.filterwarnings("ignore")

In [59]:
## drop coloumns that have been converted to other types
data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)

## divide data into train and test
train = data.loc[data.source == 'train']
test = data.loc[data['source'] == 'test']

In [60]:
## drop unnecessary columns
test.drop(['Item_Outlet_Sales','source'], axis=1,inplace = True)
train.drop(['source'],axis=1,inplace=True)

In [61]:
train.to_csv('train_modified.csv')
test.to_csv('test_modified.csv')

In [62]:
trainm = pd.read_csv('train_modified.csv')
testm = pd.read_csv('test_modified.csv')

In [None]:
trainm.dtypes

In [65]:
## inindependent variable xtrain and depedent variable ytrain
x_train = trainm.drop(['Item_Identifier', 'Outlet_Identifier','Item_Outlet_Sales'],axis=1)
y_train = trainm.Item_Outlet_Sales

In [None]:
testm.dtypes

In [67]:
x_test = testm.drop(['Item_Identifier','Outlet_Identifier'],axis=1)


In [70]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

In [71]:
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(x_train,y_train,test_size=0.2,random_state=42)

In [69]:
model1 = LinearRegression()

In [74]:
model1.fit(xtrain,ytrain)

LinearRegression()

In [None]:
model1.coef_, model1.intercept_

In [78]:
pred =model1.predict(xtest)

In [81]:
## find RMSE for the model
import math
print(math.sqrt(mean_squared_error(ytest,pred)))

1067.7474977510958


In [83]:
#predict column item outlet sales
y_sales_pred = model1.predict(x_test)
y_sales_pred

array([1812.22238525, 1623.09689923, 1904.70509532, ..., 1909.90656537,
       3639.94086652, 1341.16223205])

In [86]:
test_predictions = pd.DataFrame({
    'Item_Identifier': testm['Item_Identifier'],
    'Outlet_Identfier': testm['Outlet_Identifier'],
    'Item_Outlet_Sales': y_sales_pred,

}, columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [None]:
test_predictions