In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from stepwise_selection import stepwise_selection

In [2]:
# Load the mustard measurements; the path is resolved relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer="Mustard.csv")

# Preview the first five rows (columns shown: weight, light, watering, medium).
df.head(n=5)

Unnamed: 0,weight,light,watering,medium
0,2.02,normal,1,soil
1,0.3,normal,1,cottonwool
2,0.27,normal,1,newspaper
3,0.09,normal,1,sawdust
4,2.04,normal,1,soil


In [3]:
# Keep only complete observations: drop every row that is missing the
# response (weight) or any of the three predictors used by the models below.
# One non-mutating call replaces four separate in-place passes; rebinding
# `df` keeps the cell idempotent when it is re-run.
df = df.dropna(subset=["weight", "light", "watering", "medium"])

### For all three models describe complexity and goodness of fit by listing the degrees of freedom and residual sum of squares, respectively.

> The residual sum of squares (RSS) measures the level of variance in the error term, or residuals, of a regression model. The smaller the residual sum of squares, the better your model fits your data; the greater the residual sum of squares, the poorer your model fits your data.

> The degrees of freedom are an accounting of how many parameters are estimated by the model and, by extension, a measure of complexity for linear regression models. For example, the complexity of a linear regression model with two parameters is equal to the degrees of freedom, which in this case is 2. We often prefer lower complexity models over higher complexity models. Simpler models generalize better.

In [4]:
# Model 1 - additive model: main effects of light, medium and watering only.
m_one = smf.ols(
    formula="weight ~ light + medium + watering",
    data=df,
).fit()

# Sequential (type I) ANOVA table; the "Residual" row holds the residual
# degrees of freedom and the residual sum of squares.
sm.stats.anova_lm(m_one, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
light,2.0,7.755972,3.877986,15.173072,1.816851e-06
medium,3.0,85.111348,28.370449,111.002693,2.120333e-31
watering,1.0,37.498517,37.498517,146.717325,3.459094e-21
Residual,98.0,25.047176,0.255583,,


In [5]:
# Model 1: df = 6 (2 + 3 + 1 model terms), RSS ≈ 25.05 (residual df = 98)

In [6]:
# Model 2 - all main effects plus every pairwise (2-way) interaction.
# The formula string is kept exactly as written: the type I table below is
# sequential, so the order of the terms affects the reported sums of squares.
m_two = smf.ols(
    formula="weight ~ (medium*watering) + (medium*light) + (watering*light)",
    data=df,
).fit()

# Sequential (type I) ANOVA table for the 2-way interaction model.
sm.stats.anova_lm(m_two, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
medium,3.0,85.483215,28.494405,114.397567,4.288742e-30
light,2.0,7.384104,3.692052,14.822621,2.886501e-06
medium:light,6.0,1.504919,0.25082,1.006976,0.4261291
watering,1.0,37.623429,37.623429,151.048203,1.025897e-20
medium:watering,3.0,0.891745,0.297248,1.193374,0.3170948
watering:light,2.0,0.855444,0.427722,1.717192,0.1855994
Residual,87.0,21.670157,0.249082,,


In [7]:
# Model 2: df = 17 (3 + 2 + 6 + 1 + 3 + 2 model terms), RSS ≈ 21.67 (residual df = 87)

In [8]:
# Model 3 - full factorial model: main effects, all 2-way interactions and
# the 3-way interaction light:medium:watering.
m_three = smf.ols(
    formula="weight ~ light * medium * watering",
    data=df,
).fit()

# Sequential (type I) ANOVA table for the full factorial model.
sm.stats.anova_lm(m_three, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
light,2.0,7.755972,3.877986,15.190783,2.498641e-06
medium,3.0,85.111348,28.370449,111.13226,1.266163e-28
light:medium,6.0,1.504919,0.25082,0.982507,0.4426215
watering,1.0,37.623429,37.623429,147.37788,6.409024e-20
light:watering,2.0,0.870632,0.435316,1.705212,0.1881923
medium:watering,3.0,0.876557,0.292186,1.144545,0.3361981
light:medium:watering,6.0,0.992036,0.165339,0.647664,0.6917963
Residual,81.0,20.678122,0.255285,,


In [9]:
# Model 3: df = 23 (17 from the 2-way model + 6 for the 3-way interaction), RSS ≈ 20.68 (residual df = 81)

In [10]:
# Nested-model F-tests across all three models: each row is compared with
# the row above it (the next simpler model).
# With no explicit `scale`, the error variance is estimated from the largest
# model passed in, so the F statistic for Model 1 vs Model 2 here differs
# slightly from the one obtained when only those two models are compared.
sm.stats.anova_lm(m_one, m_two, m_three, typ=1, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,98.0,25.047176,0.0,,,
1,87.0,21.670157,11.0,3.377019,1.202582,0.297518
2,81.0,20.678122,6.0,0.992036,0.647664,0.691796


> The model with the 3-way interaction has the best fit (lowest RSS, 20.68), but it is also the most complex model (23 df). \
The model with 2-way interactions has fewer degrees of freedom (17), which makes it less complex, and its RSS (21.67) is almost the same as that of the third model. \
The first model has only 6 degrees of freedom, which makes it the simplest of all three. The difference between its RSS (25.05) and the RSS of Models 2 and 3 is not large, which makes the first model the best trade-off between fit and complexity.

### Carry out an ANOVA to compare Models 1 and 2. Which of these two models is preferable based on the test decision of the ANOVA?

In [11]:
# F-test of Model 1 (main effects only) against Model 2 (adds all 2-way
# interactions). H0: the 11 additional interaction coefficients are all zero.
sm.stats.anova_lm(m_one, m_two, typ=1, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,98.0,25.047176,0.0,,,
1,87.0,21.670157,11.0,3.377019,1.232532,0.278187


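> Model 1 is preferable based on the test decision: with F = 1.23 on (11, 87) degrees of freedom and p = 0.278 > 0.05, the 2-way interaction terms do not significantly improve the fit. \
The critical F-value for (11, 87) degrees of freedom at α = 0.05 is approximately 1.90, which the observed F does not exceed, so we can't reject H0 (all added interaction coefficients are zero) and keep the simpler model.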


### Analogously, carry out an ANOVA to compare Models 2 and 3 and describe the test decision.

In [12]:
# F-test of Model 2 (all 2-way interactions) against Model 3 (adds the 3-way
# interaction). H0: the 6 additional 3-way interaction coefficients are zero.
sm.stats.anova_lm(m_two, m_three, typ=1, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,87.0,21.670157,0.0,,,
1,81.0,20.678122,6.0,0.992036,0.647664,0.691796


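> Model 2 is preferable to Model 3: with F = 0.65 on (6, 81) degrees of freedom and p = 0.692 > 0.05, the 3-way interaction does not significantly improve the fit. \
The critical F-value for (6, 81) degrees of freedom at α = 0.05 is approximately 2.21, which the observed F does not exceed.
We can't reject H0 (all 3-way interaction coefficients are zero), so the simpler Model 2 is kept.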

### Overall, which of the three models would be selected in an ANOVA-based forward selection or backward selection?

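> An ANOVA-based backward selection starts from the most complex model (Model 3). The 3-way interaction is not significant (p = 0.69), so we step down to Model 2; the 2-way interactions are not significant either (p = 0.28), so we step down again and end at the first model.

> An ANOVA-based forward selection starts from the simplest model (Model 1). Adding the 2-way interactions does not significantly improve the fit (p = 0.28), so the search stops immediately and the first model is selected here as well.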

In [15]:
# AIC-driven stepwise search (see the "Step: aic=" trace below), bounded by
# the intercept-only model and the full factorial model.
# `stepwise_selection` is imported in the notebook's import cell.
#
# NOTE(review): despite its name, `m_twoo` is the *unfitted* three-way model
# (same formula as m_three); the name is kept so later cells keep working.
m_twoo = smf.ols("weight ~ light * medium * watering", data=df)

# NOTE(review): the `**2` in the upper scope looks redundant, since
# `light * medium * watering` already expands to every interaction - confirm
# how the helper parses the scope before simplifying it.
m_2 = stepwise_selection(
    m_twoo,
    scope={
        "lower": "weight ~ 1",
        "upper": "weight ~ (light * medium * watering)**2",
    },
)

Step:  aic= 175.36425763675854
(' - light:medium:watering', 168.28454204622983)
(' - light:medium', 169.32966041271263)
(' - medium:watering', 175.3642576367585)
('', 175.36425763675854)
(' - light:watering', 175.36425763675854)
Step:  aic= 168.28454204622983
(' - light:medium', 163.90166328580037)
(' - medium:watering', 166.44813785924717)
('', 168.28454204622983)
(' - light:watering', 168.3497665460794)
(' + light:medium:watering', 175.36425763675854)
Step:  aic= 163.90166328580037
(' - medium:watering', 161.85963392947122)
(' - light:watering', 163.60679726952714)
('', 163.90166328580037)
(' + light:medium', 168.28454204622983)
(' + light:medium:watering', 169.32966041271263)
Step:  aic= 161.85963392947122
(' - light:watering', 161.49117043622041)
('', 161.85963392947122)
(' + medium:watering', 163.90166328580037)
(' + light:medium', 166.44813785924717)
(' + light:medium:watering', 169.3296604127126)
(' - medium', 313.3456188087899)
Step:  aic= 161.49117043622041
('', 161.4911704362

> According to the stepwise selection, the best model is the first one (weight ~ light + medium + watering): removing all interaction terms gives the lowest AIC (≈ 161.49).