# <center> Retail Price Prediction Using Linear Regression

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression,Ridge
from sklearn.model_selection import train_test_split,RepeatedKFold,GridSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

## Reading the Normalized Dataset

In [2]:
df = pd.read_csv("FinalDf.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,State_Andhra Pradesh,State_Arunachal Pradesh,State_Assam,State_Bihar,State_Goa,State_Gujarat,State_Haryana,State_Himachal Pradesh,State_Jammu Kashmir,...,Unit_Dozen,Unit_80gm. Pkt,Unit_Litre,Unit_Kg,Unit_400/800 Gm,Unit_Peice,Unit_500gm. Pkt,Retail Price,Month,Year
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,24.0,1,2001
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,26.0,1,2001
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,25.5,1,2001
3,10,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,28.0,1,2001
4,12,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,27.0,1,2001


In [4]:
df.shape

(1147446, 208)

## Model Training

### Separating the X and Y variables

In [5]:
X = df.drop("Retail Price",axis = 1)
Y = df["Retail Price"]

### Splitting the Dataset into training and testing datasets

In [6]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=27,test_size = 0.25)

## Using a Linear Regression Model

### Initializing the model using a variable

In [7]:
lr = LinearRegression()

### Training the Model on Training Dataset

In [8]:
lr.fit(X_train,Y_train)

LinearRegression()

### Predicting outcome of the test dataset

In [10]:
Y_pred_lr = lr.predict(X_test)

## Evaluating the Model

In [16]:
print("Intercept: ", lr.intercept_)
print("Coefficients:")
list(zip(X, lr.coef_))

Intercept:  -11297.76055368961
Coefficients:


[('Unnamed: 0', -2.1289376311285324e-05),
 ('State_Andhra Pradesh', -6.279621992155683),
 ('State_Arunachal Pradesh', -6.209187483954537),
 ('State_Assam', 0.7727889163447408),
 ('State_Bihar', 4.167005639248515),
 ('State_Goa', -0.6689771196211254),
 ('State_Gujarat', -5.711880114180479),
 ('State_Haryana', 0.6182270579644111),
 ('State_Himachal Pradesh', 0.004256485546108024),
 ('State_Jammu Kashmir', -3.0185739799177913),
 ('State_Jharkhand', 0.8517182560730715),
 ('State_Karnataka', -4.133925467540995),
 ('State_Kerala', -5.596997610079419),
 ('State_Madhya Pradesh', -2.5937569918534416),
 ('State_Maharashtra', 1.2944322485417623),
 ('State_Manipur', 10.37594553086673),
 ('State_Meghalaya', -2.4502839751118013),
 ('State_Mizoram', 4.785247297091312),
 ('State_Nagaland', 0.49165454707155715),
 ('State_National Capital', -2.3716551973686437),
 ('State_Orissa', -8.81864265155295),
 ('State_Punjab', 0.07003189654740294),
 ('State_Rajasthan', -0.8420163610438578),
 ('State_Sikkim', 11.8

In [17]:
print("Prediction for test set: {}".format(Y_pred_lr))

Prediction for test set: [139.18444902 -29.58567698  66.76898307 ... 102.00260784 -23.01343454
 121.50302442]


In [19]:
lr_diff = pd.DataFrame({'Actual value': Y_test, 'Predicted value': Y_pred_lr})
lr_diff.head()

Unnamed: 0,Actual value,Predicted value
872977,138.785278,139.184449
585215,-30.873657,-29.585677
886343,66.782227,66.768983
1092997,-29.844849,-30.680949
861661,39.706909,40.053635


### Model Accuracy

In [21]:
meanAbErr = mean_absolute_error(Y_test, Y_pred_lr)
meanSqErr = mean_squared_error(Y_test, Y_pred_lr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_lr))
print('R squared: {:.2f}'.format(lr.score(X,Y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 73.82
Mean Absolute Error: 19.52236741431941
Mean Square Error: 5232.797091307233
Root Mean Square Error: 72.33807497651036


In [11]:
r2_score(Y_pred_lr,Y_test)

0.6337939351296515

## Ridge Linear Regression 

### Initializing and Training the Model on Training Dataset

In [9]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, Y_train)

  return linalg.solve(A, Xy, sym_pos=True,


Ridge()

### Predicting outcome of the test dataset

In [12]:
Y_pred_ridge = ridge.predict(X_test)

## Evaluating the Model

In [22]:
print("Intercept: ", ridge.intercept_)
print("Coefficients:")
list(zip(X, ridge.coef_))

Intercept:  -11284.021905921305
Coefficients:


[('Unnamed: 0', -2.0597473149608333e-05),
 ('State_Andhra Pradesh', -6.279329884009104),
 ('State_Arunachal Pradesh', -6.2086673258319),
 ('State_Assam', 0.7728976302734619),
 ('State_Bihar', 4.166796519386239),
 ('State_Goa', -0.6688423255853986),
 ('State_Gujarat', -5.7118015523230445),
 ('State_Haryana', 0.6179706968985198),
 ('State_Himachal Pradesh', 0.004026788599528433),
 ('State_Jammu Kashmir', -3.0184504908407286),
 ('State_Jharkhand', 0.8516514971315746),
 ('State_Karnataka', -4.133665800027552),
 ('State_Kerala', -5.596661747485178),
 ('State_Madhya Pradesh', -2.593582860085435),
 ('State_Maharashtra', 1.2944633080486772),
 ('State_Manipur', 10.375466116292149),
 ('State_Meghalaya', -2.4498541325260508),
 ('State_Mizoram', 4.785243257282977),
 ('State_Nagaland', 0.4917245739896166),
 ('State_National Capital', -2.3715491928564165),
 ('State_Orissa', -8.818676448323151),
 ('State_Punjab', 0.06999208902500959),
 ('State_Rajasthan', -0.8418839301605241),
 ('State_Sikkim', 11.81

In [23]:
print("Prediction for test set: {}".format(Y_pred_ridge))

Prediction for test set: [139.1776888  -29.61634934  66.77280663 ... 101.99581028 -22.99305573
 121.50115856]


In [24]:
ridge_diff = pd.DataFrame({'Actual value': Y_test, 'Predicted value': Y_pred_ridge})
ridge_diff.head()

Unnamed: 0,Actual value,Predicted value
872977,138.785278,139.177689
585215,-30.873657,-29.616349
886343,66.782227,66.772807
1092997,-29.844849,-30.656373
861661,39.706909,40.048943


### Model Accuracy

In [25]:
meanAbErr = mean_absolute_error(Y_test, Y_pred_ridge)
meanSqErr = mean_squared_error(Y_test, Y_pred_ridge)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred_ridge))
print('R squared: {:.2f}'.format(lr.score(X,Y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 73.82
Mean Absolute Error: 19.518488633159006
Mean Square Error: 5232.809460949641
Root Mean Square Error: 72.33816047529575


In [13]:
r2_score(Y_pred_ridge,Y_test)

0.6337603240317083