In [19]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [20]:
# Read the mustard-growth measurements from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="Mustard.csv")
# Show the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,weight,light,watering,medium
0,2.02,normal,1,soil
1,0.3,normal,1,cottonwool
2,0.27,normal,1,newspaper
3,0.09,normal,1,sawdust
4,2.04,normal,1,soil


In [21]:
# Keep only rows where the response (weight) and all three factors are present.
# One non-mutating call replaces four separate in-place passes: the result is
# the same set of rows, and the cell stays idempotent when it is re-run.
df = df.dropna(subset=["weight", "light", "watering", "medium"])

### For all three models describe complexity and goodness of fit by listing the degrees of freedom and residual sum of squares, respectively.

> The residual sum of squares (RSS) measures the level of variance in the error term, or residuals, of a regression model. The smaller the residual sum of squares, the better your model fits your data; the greater the residual sum of squares, the poorer your model fits your data.

> The degrees of freedom are an accounting of how many parameters are estimated by the model and, by extension, a measure of complexity for linear regression models. For example, the complexity of a linear regression model with two parameters is equal to the degrees of freedom, which in this case is 2. We often prefer lower complexity models over higher complexity models. Simpler models generalize better.

In [22]:
# Model 1: main effects only. C(watering) makes the formula treat the
# watering codes as categorical levels rather than as a numeric predictor.
m_one = smf.ols(
    formula="weight ~ light + medium + C(watering)",
    data=df,
).fit()
# Sequential (Type I) ANOVA table; the "Residual" row holds the residual
# degrees of freedom and the residual sum of squares.
sm.stats.anova_lm(m_one, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
light,2.0,7.755972,3.877986,16.205559,8.468511e-07
medium,3.0,85.111348,28.370449,118.55613,2.51522e-32
C(watering),2.0,39.33362,19.66681,82.184842,1.322974e-21
Residual,97.0,23.212073,0.2393,,


In [23]:
# Model 1: model df = 7 (2 + 3 + 2), residual df = 97, RSS ≈ 23.21

In [24]:
# Model 2: main effects plus every pairwise (2-way) interaction.
m_two = smf.ols(
    formula="weight ~ medium*C(watering) + medium*light + C(watering)*light",
    data=df,
).fit()
# Sequential (Type I) ANOVA table for the 2-way interaction model.
sm.stats.anova_lm(m_two, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
medium,3.0,85.483215,28.494405,125.147772,2.55476e-30
C(watering),2.0,39.486821,19.74341,86.713297,7.385192e-21
light,2.0,7.230904,3.615452,15.879108,1.519292e-06
medium:C(watering),6.0,2.048888,0.341481,1.49979,0.1887237
medium:light,6.0,1.342297,0.223716,0.982564,0.442584
C(watering):light,4.0,1.378317,0.344579,1.513396,0.2060287
Residual,81.0,18.442572,0.227686,,


In [25]:
# Model 2: model df = 23 (3 + 2 + 2 + 6 + 6 + 4), residual df = 81, RSS ≈ 18.44

In [26]:
# Model 3: full factorial model, i.e. main effects, all 2-way interactions
# and the 3-way interaction.
m_three = smf.ols(
    formula="weight ~ light * medium * C(watering)",
    data=df,
).fit()
# Sequential (Type I) ANOVA table for the full model.
sm.stats.anova_lm(m_three, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
light,2.0,7.755972,3.877986,17.664037,6.389288e-07
medium,3.0,85.111348,28.370449,129.226021,2.987437e-28
C(watering),2.0,39.33362,19.66681,89.581367,6.63563e-20
light:medium,6.0,1.46286,0.24381,1.110543,0.3652768
light:C(watering),4.0,1.342,0.3355,1.528186,0.2036241
medium:C(watering),6.0,1.964641,0.32744,1.491474,0.1940282
light:medium:C(watering),12.0,3.294222,0.274519,1.250419,0.2681899
Residual,69.0,15.14835,0.219541,,


In [27]:
# Model 3: model df = 35 (2 + 3 + 2 + 6 + 4 + 6 + 12), residual df = 69, RSS ≈ 15.15

In [28]:
# Nested-model F-tests: each row is compared with the model listed before it.
# The F statistics are scaled by the residual variance of the last (largest)
# model, so they can differ from those of a pairwise comparison.
sm.stats.anova_lm(m_one, m_two, m_three, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,97.0,23.212073,0.0,,,
1,81.0,18.442572,16.0,4.769501,1.357803,0.184431
2,69.0,15.14835,12.0,3.294222,1.250419,0.26819


> The model with the 3-way interaction fits best (lowest RSS), but it is also the most complex model. \
The model with the 2-way interactions uses fewer degrees of freedom, which makes it less complex, and its RSS is almost the same as that of the third model. \
The first model uses only 7 degrees of freedom, which makes it the simplest of the three. The differences between RSS1 and RSS2 or RSS3 are small, which makes the first model the best trade-off between fit and complexity.

### Carry out an ANOVA to compare Models 1 and 2. Which of these two models is preferable based on the test decision of the ANOVA?

In [29]:
# F-test of Model 1 against Model 2: do the 2-way interactions reduce the
# residual sum of squares by more than chance would explain?
sm.stats.anova_lm(m_one, m_two, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,97.0,23.212073,0.0,,,
1,81.0,18.442572,16.0,4.769501,1.309232,0.212108


### Analogously, carry out an ANOVA to compare Models 2 and 3 and describe the test decision.

In [30]:
# F-test of Model 2 against Model 3: does adding the 3-way interaction
# reduce the residual sum of squares significantly?
sm.stats.anova_lm(m_two, m_three, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,81.0,18.442572,0.0,,,
1,69.0,15.14835,12.0,3.294222,1.250419,0.26819


In [31]:
# Intercept-only baseline model (no predictors), used as the starting point
# of the nested comparison in the next cell.
m = smf.ols(formula="weight ~ 1", data=df).fit()

In [32]:
# Chain of nested F-tests from the intercept-only model up to the full
# 3-way interaction model; each row is tested against the row above it.
sm.stats.anova_lm(m, m_one, m_two, m_three, test="F")

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,104.0,155.413013,0.0,,,
1,97.0,23.212073,7.0,132.20094,86.024125,9.305596e-39
2,81.0,18.442572,16.0,4.769501,1.357803,0.1844309
3,69.0,15.14835,12.0,3.294222,1.250419,0.2681899


In [None]:
#check telegram :)

### Overall, which of the three models would be selected in an ANOVA-based forward selection or backward selection?

> The third model (as the most complex) would be selected in an ANOVA-based backward selection

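> The first model (the simplest one) would be selected in an ANOVA-based forward selection: starting from Model 1, adding the 2-way interactions (Model 2) does not significantly improve the fit (p ≈ 0.21), so the selection stops at Model 1.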

In [33]:
# stepwise_selection appears to be a local helper module (not part of
# statsmodels); its implementation is not visible here. The printed log
# suggests it performs an AIC-based stepwise search — TODO confirm.
from stepwise_selection import stepwise_selection
# Starting model: full three-way interaction, passed to the helper unfitted.
# NOTE(review): watering is used without C() here, unlike m_one / m_two /
# m_three above, so it is presumably treated as a numeric predictor — confirm
# this is intended, since the results are then not directly comparable.
m_twoo = smf.ols("weight ~ light * medium * watering", data=df)
# The scope bounds the search between the intercept-only model ("lower")
# and the formula given as "upper".
m_2 = stepwise_selection(m_twoo, scope={"lower": "weight ~ 1",
                                        "upper": "weight ~ (light * medium * watering)**2" })

Step:  aic= 175.36425763675854
(' - light:medium:watering', 168.28454204622983)
(' - light:medium', 169.32966041271263)
(' - medium:watering', 175.3642576367585)
('', 175.36425763675854)
(' - light:watering', 175.36425763675854)
Step:  aic= 168.28454204622983
(' - light:medium', 163.90166328580037)
(' - medium:watering', 166.44813785924717)
('', 168.28454204622983)
(' - light:watering', 168.3497665460794)
(' + light:medium:watering', 175.36425763675854)
Step:  aic= 163.90166328580037
(' - medium:watering', 161.85963392947122)
(' - light:watering', 163.60679726952714)
('', 163.90166328580037)
(' + light:medium', 168.28454204622983)
(' + light:medium:watering', 169.32966041271263)
Step:  aic= 161.85963392947122
(' - light:watering', 161.49117043622041)
('', 161.85963392947122)
(' + medium:watering', 163.90166328580037)
(' + light:medium', 166.44813785924717)
(' + light:medium:watering', 169.3296604127126)
(' - medium', 313.3456188087899)
Step:  aic= 161.49117043622041
('', 161.4911704362

> According to the stepwise selection, the best model is the first one (weight ~ light + medium + watering).