# Using a Linear Regression machine-learning model to predict total sales from features such as the money spent on individual advertising/marketing channels.

**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

**Load Dataset**

In [2]:
# Read the advertising dataset from the Kaggle input directory.
# NOTE(review): the frame displayed right after this cell shows an `Unnamed: 0`
# column that looks like a 1-based row id stored in the CSV — confirm, and
# keep it out of the model features.
csv_path = '/kaggle/input/advertising/Advertising.csv'
data = pd.read_csv(csv_path)

In [3]:
# Display the loaded DataFrame (the notebook renders a cell's last expression).
data

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


**Domain Analysis**
 - Sales is target.
 - TV, Radio and Newspaper are input variables.
 - The dataset describes the total sales a company made after investing certain amounts in TV, Radio & Newspaper advertisements.

**Basic Checks**

In [None]:
# (rows, columns) of the loaded dataset
data.shape

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Preview the last five rows of the dataset.
data.tail(n=5)

In [None]:
# info: print a summary of the DataFrame (columns, non-null counts, dtypes)
data.info()

In [None]:
# describe: summary statistics for the columns of the DataFrame
data.describe()

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Column labels of the dataset
data.columns

**Exploratory Data Analysis**

***1. Univariate Analysis***

In [None]:
# Distribution of TV spend: density-normalised histogram with a KDE overlay.
# `histplot` replaces `distplot`, which seaborn deprecated in v0.11; the
# `stat` and `cut` settings keep the plot close to what `distplot` drew.
sns.histplot(x=data.TV, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of Radio spend: density-normalised histogram with a KDE overlay.
# `histplot` replaces `distplot`, which seaborn deprecated in v0.11; the
# `stat` and `cut` settings keep the plot close to what `distplot` drew.
sns.histplot(x=data.Radio, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of Newspaper spend: density-normalised histogram with a KDE overlay.
# `histplot` replaces `distplot`, which seaborn deprecated in v0.11; the
# `stat` and `cut` settings keep the plot close to what `distplot` drew.
sns.histplot(x=data.Newspaper, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of Sales: density-normalised histogram with a KDE overlay.
# `histplot` replaces `distplot`, which seaborn deprecated in v0.11; the
# `stat` and `cut` settings keep the plot close to what `distplot` drew.
sns.histplot(x=data.Sales, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Insights
# Sales follows a normal distribution
# TV, Radio & Newspaper do not follow a normal distribution

***2. Bivariate Analysis***

In [None]:
# Scatter of TV spend (x-axis) against Sales (y-axis).
sns.scatterplot(x='TV', y='Sales', data=data)

In [None]:
# Scatter of Radio spend (x-axis) against Sales (y-axis).
sns.scatterplot(x='Radio', y='Sales', data=data)

In [None]:
# Scatter of Newspaper spend (x-axis) against Sales (y-axis).
sns.scatterplot(x='Newspaper', y='Sales', data=data)

In [None]:
# From all the plots above, TV shows the clearest trend with Sales among TV, Radio & Newspaper.

***3. Multivariate Analysis***

In [None]:
# Pairwise plots across the columns of the dataset
sns.pairplot(data)

**Data preprocessing**

In [None]:
# Count the missing values in every column before preprocessing.
data.isna().sum()

In [None]:
# checking duplicates: number of rows that repeat an earlier row
data.duplicated().sum()

In [None]:
# data types of each column
data.dtypes

*Checking for outliers*

In [None]:
# Box plot of TV spend to look for outliers.
sns.boxplot(x=data['TV'])

In [None]:
# Box plot of Radio spend to look for outliers.
sns.boxplot(x=data['Radio'])

In [None]:
# Box plot of Newspaper spend to look for outliers.
sns.boxplot(x=data['Newspaper'])

In [None]:
# From the analysis above, Newspaper clearly has outliers, so we need to handle them.
# Newspaper is not normally distributed, so we will use the IQR method.

In [None]:
# First (25%) and third (75%) quartiles of Newspaper spend.
Q1 = data.Newspaper.quantile(q=0.25)
Q3 = data.Newspaper.quantile(q=0.75)

In [None]:
# Find IQR: the interquartile range, i.e. the distance between Q3 and Q1
IQR = Q3 - Q1
IQR

In [None]:
# Outlier fences: 1.5 * IQR below Q1 and above Q3.
whisker = 1.5*IQR
lower_limit = Q1 - whisker
upper_limit = Q3 + whisker
print('Lower Limit : ', lower_limit)
print('Upper Limit : ', upper_limit)

In [None]:
# Rows whose Newspaper spend lies above the upper limit.
data[data['Newspaper'] > upper_limit]

In [None]:
# Rows whose Newspaper spend lies below the lower limit.
data[data['Newspaper'] < lower_limit]

In [None]:
# The data is not normally distributed, so values above the upper limit are
# overwritten with the column median.
newspaper_median = data['Newspaper'].median()
is_high_outlier = data['Newspaper'] > upper_limit
data.loc[is_high_outlier, 'Newspaper'] = newspaper_median

In [None]:
# Re-draw the Newspaper box plot to check whether any outliers remain.
sns.boxplot(x=data['Newspaper'])

**Feature Engineering**

In [None]:
# drop irrelevant columns
# select the best features
# drop columns that have very low correlation with the target variable
# drop one of the columns if two input variables are highly correlated

In [None]:
# Pairwise correlation between the columns of the dataset
data.corr()

In [None]:
# Heatmap of the correlation matrix with each coefficient annotated.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Multicollinearity check: correlations among the columns other than Sales.
input_corr = data.drop(columns='Sales').corr()
sns.heatmap(input_corr, annot=True)

**Splitting of data**

In [None]:
x = data.drop('Sales', axis = 1)
y = data['Sales']

**Splitting data for training and testing**

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 4)

In [None]:
# Shapes with respect each data
print('x_train : ',x_train.shape)
print('x_test  : ',x_test.shape)
print('y_train : ',y_train.shape)
print('x_test  : ',y_train.shape)

**Model Building**

In [None]:
# import linear regression model
from sklearn.linear_model import LinearRegression

# initialize the model
model = LinearRegression()

# train model with x_train, y_train
model.fit(x_train, y_train)

*Predictions*

In [None]:
# make predictions
y_pred = model.predict(x_test)
y_pred

In [None]:
y_test

In [None]:
# Coeffiecients of x
model.coef_

In [None]:
# intercept c
model.intercept_

In [None]:
# Making predictions when TV = 100, Radio = 50, Newspaper = 10
model.predict([[0,100,50,10]])

**Evaluate the model**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Mean absolute error between the held-out targets and the predictions.
MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred)
MAE

In [None]:
# Mean squared error between the held-out targets and the predictions.
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
MSE

In [None]:
# RMSE: square root of the mean squared error stored in MSE
RMSE = np.sqrt(MSE)
RMSE

In [None]:
# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)