# Multiple Linear Regression

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the 'Chennai' worksheet of the air-pollution workbook into a DataFrame.
df = pd.read_excel(
    r'Air pollution 2016-2022.xlsx',
    sheet_name='Chennai',
)

In [3]:
# Fill missing values in each listed column with that column's mean.
# The column list has a descriptive name (rather than `x`) so it cannot be
# confused with the feature matrix `X` that is defined further down.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split, and the target column AQI is imputed as well - confirm
# that both are intended.
numeric_cols = ['PM2.5', 'NO2', 'NH3', 'CO', 'SO2', 'OZONE', 'AQI']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [4]:
# (rows, columns) of the loaded dataset
df.shape

(121, 10)

In [5]:
# Preview the first five rows to check the columns and the imputed values
df.head()

Unnamed: 0,City,Date,PM2.5,NO2,NH3,CO,SO2,OZONE,AQI,AQI_BUCKET
0,Chennai,2017-01-01,44.56,15.04,43.97622,0.18,5.8,19.33,118.0,Moderate
1,Chennai,2017-01-15,55.97,9.54,43.97622,0.01,4.56,48.59,103.0,Moderate
2,Chennai,2017-02-01,45.13,15.97,43.97622,0.04,4.57,28.68,130.0,Moderate
3,Chennai,2017-02-15,63.51,14.78,43.97622,0.22,3.45,43.83,95.0,Satisfactory
4,Chennai,2017-03-01,82.44,17.92,43.97622,0.27,7.02,54.36,157.0,Moderate


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,PM2.5,NO2,NH3,CO,SO2,OZONE,AQI
count,121.0,121.0,121.0,121.0,121.0,121.0,121.0
mean,45.557934,15.66281,43.97622,10.815702,11.304132,30.48438,88.404959
std,24.213212,9.947058,39.151462,17.782163,11.718683,15.670461,37.728764
min,11.0,4.0,1.34,0.01,2.0,3.0,25.0
25%,30.99,11.0,22.56,0.64,5.51,17.31,63.0
50%,40.91,14.0,43.97622,0.88,7.29,28.82,83.0
75%,53.31,18.0,43.97622,23.0,12.44,40.81,105.0
max,150.0,98.77,248.09,107.0,90.0,81.71,234.0


In [7]:
# Number of missing values remaining in each column after the mean fill
df.isnull().sum()

City          0
Date          0
PM2.5         0
NO2           0
NH3           0
CO            0
SO2           0
OZONE         0
AQI           0
AQI_BUCKET    0
dtype: int64

In [8]:
# Independent variables: the six pollutant readings.
# Dependent variable: the AQI value.
feature_columns = ['PM2.5', 'NO2', 'NH3', 'CO', 'SO2', 'OZONE']
X = df[feature_columns]
y = df['AQI']

In [9]:
# Hold out 20% of the rows as the test set and train on the remaining 80%.
# random_state is fixed so the same split is produced on every run.
# (train_test_split is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [10]:
# Create the linear regression model and fit it on the training split.
# (LinearRegression is imported in the notebook's top import cell.)
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [11]:
# One fitted regression coefficient per feature, indexed by feature name.
coeff_df = pd.DataFrame(
    data=regressor.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

Unnamed: 0,Coefficient
PM2.5,0.797069
NO2,-0.214868
NH3,0.033986
CO,-1.056765
SO2,0.525601
OZONE,0.64595


In [12]:
# Predict AQI for the held-out test rows with the fitted model
y_pred = regressor.predict(X_test)

In [13]:
# Side-by-side table of the true AQI values and the model's predictions for
# the test rows. It is stored under its own name so the dataset held in `df`
# is not overwritten and the earlier cells that read `df` stay re-runnable.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison_df

Unnamed: 0,Actual,Predicted
48,212.0,180.279104
94,39.0,47.293555
95,76.0,54.377655
8,76.0,96.565376
97,130.0,119.85081
22,57.0,95.334563
7,87.0,105.623653
10,91.0,105.50458
45,135.0,97.815223
89,40.0,68.459702


In [14]:
# Model score on the test split, scaled to a percentage. For a scikit-learn
# regressor, score() reports R^2 (coefficient of determination), not accuracy.
regressor.score(X_test, y_test)*100

17.844519012519598

In [15]:
# 10-fold cross-validation on the training split. For a regressor the score
# is R^2 (coefficient of determination), not classification accuracy, so the
# metric is requested explicitly and the printed labels name what is measured.
# Values are scaled to percentages, matching the test-set score cell.
cv_r2_scores = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10, scoring='r2'
)
print("Mean CV R^2: {:.2f} %".format(cv_r2_scores.mean()*100))
print("Standard Deviation of CV R^2: {:.2f} %".format(cv_r2_scores.std()*100))

Accuracy: 43.70 %
Standard Deviation: 26.32 %


In [16]:
# Error metrics of the predictions on the test split.
# The MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 26.223036688328552
Mean Squared Error: 1158.5237306930537
Root Mean Squared Error: 34.03709345248289


# THANK YOU!!!