In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## SET THE DEFAULT SEABORN THEME FOR ALL PLOTS
## (color_codes=True asks seaborn to remap matplotlib's shorthand color codes to its palette)
sns.set_theme(color_codes=True)

# DISPLAY OPTIONS FOR PANDAS DATAFRAMES:
#   - never truncate the column list
#   - render floats with 2 decimal places
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score

In [9]:
# LOAD THE ADVERTISING DATA (CSV IN THE WORKING DIRECTORY) INTO A DATAFRAME
df = pd.read_csv(
    'advertising.csv',
    encoding='unicode_escape',
)

In [10]:
# DISPLAY THE TOP FIVE ROWS OF THE DATAFRAME (head() DEFAULTS TO 5 ROWS)
df.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [11]:
## SHAPE OF THE DATAFRAME AS (ROWS, COLUMNS)
df.shape

(200, 4)

In [12]:
## INDEPENDENT VARIABLES (FEATURES):
## EVERY COLUMN OF THE DATAFRAME EXCEPT THE TARGET COLUMN 'sales'
X = df.drop(columns='sales')
X

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4
...,...,...,...
196,38.2,3.7,13.8
197,94.2,4.9,8.1
198,177.0,9.3,6.4
199,283.6,42.0,66.2


In [16]:
# DEPENDENT VARIABLE (TARGET)
# Selected with a list of labels, so y stays a one-column DataFrame (2-D)
# rather than a Series.
y = df.loc[:, ['sales']]

MODEL

In [17]:
# HOLD OUT 20% OF THE ROWS FOR TESTING.
# random_state IS FIXED SO THE SAME SPLIT IS PRODUCED ON EVERY RUN.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [19]:
# AFTER SPLITTING: TRAINING FEATURES (THE ROWS NOT HELD OUT BY test_size=0.20)
X_train.shape

(160, 3)

In [20]:
# TEST FEATURES (THE 20% OF ROWS HELD OUT BY test_size=0.20)
X_test.shape

(40, 3)

In [21]:
# TRAINING TARGET - ROW COUNT SHOULD MATCH X_train
y_train.shape

(160, 1)

In [22]:
# TEST TARGET - ROW COUNT SHOULD MATCH X_test
y_test.shape

(40, 1)

In [24]:
# FIT A LINEAR REGRESSION ON THE TRAINING SPLIT ONLY.
# Creating and fitting can also be chained in one statement:
#   reg_model = LinearRegression().fit(X_train, y_train)
reg_model = LinearRegression()
reg_model.fit(X=X_train, y=y_train)

In [None]:
# INTERCEPT (b - bias): THE CONSTANT TERM OF THE FITTED EQUATION
# It comes back as a length-1 array (not a scalar) because y was passed as a
# one-column DataFrame.
b = reg_model.intercept_ 
b # array([2.90794702])

array([2.90794702])

In [None]:
# COEFFICIENTS (w - weights): ONE WEIGHT PER FEATURE COLUMN OF X_train
# The array is 2-D (one row per target) because y was passed as a one-column
# DataFrame.
w= reg_model.coef_
w # array([[0.0468431 , 0.17854434, 0.00258619]])

array([[0.0468431 , 0.17854434, 0.00258619]])

PREDICTION FOR MULTIPLE LINEAR REGRESSION

In [None]:
# PREDICT SALES FOR ONE NEW OBSERVATION
# TV = 30
# radio = 10
# newspaper = 40

# MODEL EQUATION
# With rounded weights:
# Sales = 2.90 + TV * 0.04 + radio * 0.17 + newspaper * 0.002 = 5.88
# With the fitted weights:
# Sales = 2.90794702 + TV * 0.0468431 + radio * 0.17854434 + newspaper * 0.00258619 = 6.202131

# Build the single-row input with the SAME column labels the model was fitted
# on. A frame with default integer labels (0, 1, 2) makes scikit-learn warn
# "X does not have valid feature names, but LinearRegression was fitted with
# feature names" and leaves the value-to-feature mapping implicit.
# The values are listed in training-column order (TV, radio, newspaper).
yindep_value = pd.DataFrame([[30, 10, 40]], columns=X_train.columns)

reg_model.predict(yindep_value) # OUTPUT - array([[6.202131]])





array([[6.202131]])

In [None]:
# Train RMSE: square root of the mean squared error on the rows used for fitting
y_pred = reg_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse) # OUTPUT - 1.7369025901470923

1.7369025901470923

In [None]:
# Train R-SQUARE: score() evaluates the fitted model on the training split
reg_model.score(X_train, y_train) # OUTPUT - 0.8959372632325174

0.8959372632325174

In [None]:
# Test RMSE: same metric as the train RMSE, computed on the held-out rows
y_pred = reg_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse) # OUTPUT - 1.4113417558581587

# NOTE: on this split the train error (1.7369025901470923) is HIGHER than the
# test error (1.4113417558581587). That is not a general rule - training error
# is normally the lower of the two - so it is most likely an artifact of this
# particular split and the small 40-row test set.

1.4113417558581587

In [None]:
# Test R-SQUARE: score() evaluates the fitted model on the held-out test split
reg_model.score(X_test,y_test) # OUTPUT - 0.8927605914615384

0.8927605914615384

In [None]:
## CROSS VALIDATION - 10 FOLDS (cv=10): MSE OF EACH FOLD

# The "neg_mean_squared_error" scorer returns the NEGATED MSE, so the sign is
# flipped to get back the usual non-negative MSE per fold.
neg_mse_folds = cross_val_score(reg_model, X, y, cv=10, scoring="neg_mean_squared_error")
-neg_mse_folds
# OUTPUT : array([3.56038438, 3.29767522, 2.08943356, 2.82474283, 1.3027754 ,
#       1.74163618, 8.17338214, 2.11409746, 3.04273109, 2.45281793])

array([3.56038438, 3.29767522, 2.08943356, 2.82474283, 1.3027754 ,
       1.74163618, 8.17338214, 2.11409746, 3.04273109, 2.45281793])

In [None]:
## CROSS VALIDATION - 10 FOLDS (cv=10): RMSE OF EACH FOLD

# Negate the scorer output to recover the per-fold MSE, then take the square
# root to express the error in the same units as sales.
fold_mse = -cross_val_score(reg_model, X, y, cv=10, scoring="neg_mean_squared_error")
np.sqrt(fold_mse)
# OUTPUT : array([1.88689808, 1.81595022, 1.44548731, 1.68069713, 1.14139187,
# 1.31971064, 2.85891276, 1.45399362, 1.7443426 , 1.56614748])

array([1.88689808, 1.81595022, 1.44548731, 1.68069713, 1.14139187,
       1.31971064, 2.85891276, 1.45399362, 1.7443426 , 1.56614748])

In [None]:
## CROSS VALIDATION - 10 FOLDS (cv=10): MEAN OF THE PER-FOLD RMSE

# Scorer output is negated MSE: flip the sign, take the root per fold, then average.
fold_rmse = np.sqrt(-cross_val_score(reg_model, X, y, cv=10, scoring="neg_mean_squared_error"))
fold_rmse.mean()
# OUTPUT : 1.6913531708051797

1.6913531708051797

In [None]:
## CROSS VALIDATION - 5 FOLDS (cv=5): MEAN OF THE PER-FOLD RMSE

# Same computation as the 10-fold mean RMSE, but with 5 folds.
neg_mse_5fold = cross_val_score(reg_model, X, y, cv=5, scoring="neg_mean_squared_error")
rmse_5fold = np.sqrt(-neg_mse_5fold)
np.mean(rmse_5fold)
# OUTPUT : 1.7175247278732086

1.7175247278732086

TEST ERROR (RMSE, 20% HOLD-OUT SET) : 1.4113417558581587
TRAINING ERROR (RMSE, 80% TRAINING SET) : 1.7369025901470923
ERROR FROM THE CROSS VALIDATION (MEAN RMSE, 10 FOLDS): 1.6913531708051797
ERROR FROM THE CROSS VALIDATION (MEAN RMSE, 5 FOLDS) : 1.7175247278732086

SINCE OUR DATASET IS ON THE SMALLER SIDE, TRUSTING THE 10-FOLD CROSS-VALIDATION ESTIMATE SEEMS LIKE THE SMARTER BET.