In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
data = pd.read_excel('Text Exer 2 Data.xls') #Loading up the Data
data.index = data['Observation'] #Changing the index

In [3]:
data = data.drop('Observation', axis = 1) #Dropping the 'observations' column

In [4]:
X = data.iloc[:, 1:] #Declaring X
y = pd.Series(data.iloc[:, 0]) #Declaring y

In [5]:
import statsmodels.api as sm
X = sm.add_constant(X) #Adding the constant column to the Independent Variables

# *a-i)* Regression of FGPA on constant and SATV

In [6]:
satv = X[['const', 'SATV']] #Declaring the SATV and Constant Data Set

In [7]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() #Declaring the Regressor

In [8]:
regressor.fit(satv, y) #Fitting the X and y variables 

LinearRegression()

In [9]:
df1 = pd.merge(y, satv, on = y.index) #Creating a single dataset for case-specific X and y data

In [10]:
import statsmodels.formula.api as smf
mod1 = smf.ols('FGPA ~ const + SATV', data = df1) #Establishing data for regression summary
res = mod1.fit() #Fitting data for the regression summary
res.summary()

0,1,2,3
Dep. Variable:,FGPA,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,5.201
Date:,"Sun, 24 Oct 2021",Prob (F-statistic):,0.0229
Time:,10:20:56,Log-Likelihood:,-388.44
No. Observations:,609,AIC:,780.9
Df Residuals:,607,BIC:,789.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.2209,0.078,15.747,0.000,1.069,1.373
const,1.2209,0.078,15.747,0.000,1.069,1.373
SATV,0.0631,0.028,2.280,0.023,0.009,0.117

0,1,2,3
Omnibus:,11.335,Durbin-Watson:,1.949
Prob(Omnibus):,0.003,Jarque-Bera (JB):,7.694
Skew:,0.138,Prob(JB):,0.0213
Kurtosis:,2.524,Cond. No.,7.63e+16


# *a-ii)* Effect of 1 point increase in SATV on FGPA

Since the slope of the line tells us the effect of a point's increase in SATV on FGPA, we can say that the FGPA will increase by 0.063 if the score on the SATV increases by 1

# *b)* a-i and a-ii with Constant, SATV, SATM, and Gender

## Regression

In [11]:
regressor.fit(X, y) #Fitting the data in the regressor

LinearRegression()

In [12]:
df2 = pd.merge(X, y, on = X.index) #Merging the X and y data in a single dataset
df2 = df2.drop('key_0', axis = 1) #Dropping the unncessary key

In [13]:
mod = smf.ols('FGPA ~ const + SATM + SATV + FEM', data = df2) #Establishing data for regression summary
res = mod.fit() #Fitting data for the regression summary
res.summary()

0,1,2,3
Dep. Variable:,FGPA,R-squared:,0.083
Model:,OLS,Adj. R-squared:,0.078
Method:,Least Squares,F-statistic:,18.24
Date:,"Sun, 24 Oct 2021",Prob (F-statistic):,2.41e-11
Time:,10:20:56,Log-Likelihood:,-364.67
No. Observations:,609,AIC:,737.3
Df Residuals:,605,BIC:,755.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7785,0.108,7.205,0.000,0.566,0.991
const,0.7785,0.108,7.205,0.000,0.566,0.991
SATM,0.1727,0.032,5.410,0.000,0.110,0.235
SATV,0.0142,0.028,0.507,0.612,-0.041,0.069
FEM,0.2003,0.037,5.358,0.000,0.127,0.274

0,1,2,3
Omnibus:,7.757,Durbin-Watson:,1.912
Prob(Omnibus):,0.021,Jarque-Bera (JB):,5.727
Skew:,0.118,Prob(JB):,0.0571
Kurtosis:,2.588,Cond. No.,9.12e+16


## Effect of 1 point increase

If there is a one point increase in the constant, the FGPA would increase by 0.779. If there's a one point increase in the SATM, the FGPA would increase by 0.173. If there's a one point increase in the SATV, the FGPA would increase by 0.014. If there's a one point increase in the Gender, i.e. the person is a female, then the FGPA would increase by 0.200.

# *c)* Correlation matrix

In [14]:
import pandas as pd

corrMatrix = data.corr() #Creating the correlation matrix
print (corrMatrix)

          FGPA      SATM      SATV       FEM
FGPA  1.000000  0.195040  0.092167  0.176491
SATM  0.195040  1.000000  0.287801 -0.162680
SATV  0.092167  0.287801  1.000000  0.033577
FEM   0.176491 -0.162680  0.033577  1.000000


# *d-i)* F-Test on the significance of the effect of SATM on FGPA

In [15]:
a, b = np.polyfit(X['SATV'].values, y, 1) #Assigning a and b
print(a, b) #The values are same as obtained by the previous method

0.06308584537784592 2.441732463077964


In [16]:
def linefitline(X): #Defining the function for the prediction line
    return a * X + b #Establishing the equation
line1 = linefitline(X['SATV']) #Declaring a variable to contain the regression line with SATV as independent variable

In [17]:
y1 = pd.Series(y)

In [18]:
differences_line1 = [] #Erstwhile empty list to contain squared residuals
for i in range(len(line1)): #Loop to create squared residuals
    differences_line1.append((y.values[i] - line1.values[i])**2) #Formula and values for the squared residuals

In [19]:
RSS = sum(differences_line1) #Summing the Squared residuals

In [20]:
mean = np.mean(y) #Finding mean of y

In [21]:
differences_line2 = [] #Erstwhile empty list to contain squared difference of y_i and y_bar (mean)
for i in range(len(y)): #Loop to create squared difference of y_i and y_bar
    differences_line2.append((y.values[i] - mean)**2) #Formula and values for the squared difference of y_i and y_bar

In [22]:
TSS = sum(differences_line2) #Summing the squared difference of y_i and y_bar to get Total Sum of Squares

In [23]:
R_squared = 1 - round(RSS, 6)/round(TSS, 6) #Formula and values for R-Squared

In [24]:
F = (R_squared / (1 - R_squared)) * 608/1 #Formula and values for the F-stat

In [25]:
F #Getting F

5.209075657510555

In [26]:
F = round(F, 1) #Rounding F to 1 decimal place

# *d-ii)* Testing if F = t^2

In [27]:
t = 2.280 #Value of t from the first regression summary

t_2 = t ** 2 #Squaring t to get t^2

In [28]:
t_2 = round(t_2, 1) #Rounding t^2 to 1 decimal place

In [29]:
t_2 #Getting value of t^2

5.2

In [30]:
F #Getting value of F

5.2

F = t^2