In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [2]:
from sklearn.linear_model import LinearRegression

## Loading the data

In [3]:
df = pd.read_csv('Multiple-linear-regression.csv')

In [5]:
df.columns

Index(['SAT', 'Rand 1,2,3', 'GPA'], dtype='object')

## Regression

In [6]:
reg = LinearRegression()

In [7]:
reg.fit(df[['SAT','Rand 1,2,3']],df['GPA'])

LinearRegression()

In [10]:
r2 = reg.score(df[['SAT','Rand 1,2,3']],df['GPA'])
print("The R^2 Value of the model is : ", r2)

The R^2 Value of the model is :  0.40668119528142843


## Calculating the Adjusted R^2 Value.

In [16]:
n = df.shape[0]
p = df.shape[1]-1
R2 = 1 - (1-r2) * (n-1)/(n-p-2)
print("The adjusted R^2 Value : ", R2)

The adjusted R^2 Value :  0.384431740104482


In [18]:
# The adjusted R2 value is lesss than that fo R2 value, so there is some feature with low explanatory power.

## Calculating the P-Value for each Feature(F-Regression)

In [19]:
# We do regression for each feature against the SAT score.

In [20]:
from sklearn.feature_selection import f_regression

In [21]:
x = df[['SAT','Rand 1,2,3']]
y = df['GPA']

In [22]:
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [23]:
# The second tuple is the array of the P-values respectively

In [28]:
p_values = f_regression(x,y)[1]
p_values = p_values.round(3)
p_values

array([0.   , 0.676])

In [29]:
# We can discard the features with P-value greater than 0.05 i.e 95% confindence

## Summary

In [33]:
summary = pd.DataFrame(data=x.columns,columns=['Features'])
summary['Coeffs'] = reg.coef_
summary['P-Values'] = p_values

In [34]:
summary

Unnamed: 0,Features,Coeffs,P-Values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


## Standardization

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
scaler = StandardScaler()

In [37]:
scaler.fit(x)

StandardScaler()

In [40]:
scaler.mean_

array([1845.27380952,    2.05952381])

In [41]:
x_transformed = scaler.transform(x)

In [43]:
type(x_transformed)

numpy.ndarray

## Regression with Standardized Data.

In [44]:
stdReg = LinearRegression()

In [45]:
stdReg.fit(x_transformed,y)

LinearRegression()

In [46]:
stdReg.coef_

array([ 0.17181389, -0.00703007])

In [47]:
stdReg.intercept_

3.330238095238095

## Summary after standardization

In [63]:
rows = ['Constant'] + x.columns.values.tolist()
rows

['Constant', 'SAT', 'Rand 1,2,3']

In [65]:
s = pd.DataFrame(data = rows,columns=['Features'])
s

Unnamed: 0,Features
0,Constant
1,SAT
2,"Rand 1,2,3"


In [67]:
s['Weights'] = [stdReg.intercept_] + stdReg.coef_.tolist()

In [69]:
s

Unnamed: 0,Features,Weights
0,Constant,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


## Predicting with standardized Data.

In [80]:
data = pd.DataFrame(data = [[1811,1],[1301,2]] , columns = ['SAT','Rand 1,2,3'])

In [81]:
data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1811,1
1,1301,2


In [82]:
stdReg.predict(data)

array([314.47816779, 226.84605242])

In [83]:
stdData = scaler.transform(data)

In [84]:
stdData

array([[-0.32985212, -1.24637147],
       [-5.23810666, -0.07002087]])

In [85]:
stdReg.predict(stdData)

array([3.28232699, 2.43075085])