In [1]:
# load some data to practice regressions
import seaborn as sns
import numpy as np
diamonds = sns.load_dataset('diamonds')

# Build the sample used in livecoding. None of these alterations is
# strictly necessary to practice a regression.
diamonds2 = (
    diamonds
    .query('carat < 2.5')                      # censor/remove outliers
    .assign(
        lprice=lambda d: np.log(d['price']),   # log transform price
        lcarat=lambda d: np.log(d['carat']),   # log transform carats
        ideal=lambda d: d['cut'] == 'Ideal',   # True when the cut is 'Ideal'
        # some regression packages want you to explicitly provide
        # a variable for the constant
        const=1,
    )
)

## Interpreting regressions

**Regression 1**

In [4]:
from statsmodels.formula.api import ols as sm_ols

# Regress log(price) on log(carat) using the formula interface.
# The formula adds the intercept for you, so the constant is not listed.
model2 = sm_ols('lprice ~ lcarat', data=diamonds2)   # specify model
results2 = model2.fit()                               # estimate / fit

# view results ... identical to before
print(results2.summary())

# the prediction and residual and plotting are the exact same

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                 7.542e+05
Date:                Tue, 14 Apr 2020   Prob (F-statistic):               0.00
Time:                        11:47:05   Log-Likelihood:                -4073.2
No. Observations:               53797   AIC:                             8150.
Df Residuals:                   53795   BIC:                             8168.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.4525      0.001   6193.432      0.0

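A 1% increase in carat is associated with an approximately 1.68% increase in price. Because both price and carat are in logs, the slope coefficient is read as an elasticity (a percent change in price per percent change in carat).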

**Regression 2:** 

In [7]:
# Draw 400 diamonds from each of the Ideal and Fair cuts so both groups carry
# equal weight in the regression. `random_state` pins the draw: without it,
# every run samples different rows and the coefficients (and any numbers
# quoted in the write-up) change from run to run.
subsample_of_equal_amounts = (
    diamonds2
    .query('cut in ["Ideal","Fair"]')
    # `cut` is categorical; observed=True groups only on the cuts that
    # survive the query instead of every declared category.
    .groupby('cut', observed=True)
    .apply(lambda x: x.sample(400, random_state=0))
)

# Note: `lcarat*ideal` already expands to both main effects plus the
# interaction, so the explicit `lcarat + ideal` terms are redundant but harmless.
sm_ols('lprice ~ lcarat + ideal + lcarat*ideal', data=subsample_of_equal_amounts).fit().summary()

0,1,2,3
Dep. Variable:,lprice,R-squared:,0.899
Model:,OLS,Adj. R-squared:,0.898
Method:,Least Squares,F-statistic:,2349.0
Date:,"Tue, 14 Apr 2020",Prob (F-statistic):,0.0
Time:,11:51:10,Log-Likelihood:,-120.09
No. Observations:,800,AIC:,248.2
Df Residuals:,796,BIC:,266.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.1902,0.014,567.176,0.000,8.162,8.218
ideal[T.True],0.3507,0.026,13.617,0.000,0.300,0.401
lcarat,1.5201,0.032,48.239,0.000,1.458,1.582
lcarat:ideal[T.True],0.2178,0.042,5.222,0.000,0.136,0.300

0,1,2,3
Omnibus:,7.889,Durbin-Watson:,1.994
Prob(Omnibus):,0.019,Jarque-Bera (JB):,8.682
Skew:,-0.168,Prob(JB):,0.013
Kurtosis:,3.385,Cond. No.,6.58


This shows that a 1% increase in carats is associated with a 1.52% increase in price for fair diamonds, but a 1.74% increase for ideal diamonds (1.52 + 0.22). Exact values depend on the random subsample drawn.