# DS-SF-25 | Codealong 07 | Introduction to Regression and Model Fit, Part 2

In [1]:
import os

import numpy as np
import pandas as pd
# Keep DataFrame displays compact: cap them at 20 rows / 10 columns and render as HTML.
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

# TODO: import the scikit-learn pieces used in Part B1.
# NOTE(review): `summary()` further down calls `feature_selection.f_regression`, but
# nothing in this cell binds the name `feature_selection` -- presumably
# `from sklearn import feature_selection, linear_model` belongs here; confirm.

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

## Part A - Model's F-statistic

In [2]:
# Load the Zillow data from ../datasets (relative to the notebook), using the `ID` column as the index.
df = pd.read_csv(os.path.join('..', 'datasets', 'zillow-07.csv'), index_col = 'ID')

> ### `SalePrice` as a function of `Size`

In [3]:
# TODO
# NOTE(review): the heading for this cell reads "`SalePrice` as a function of `Size`",
# but the formula below regresses SalePrice on IsAStudio (the regressor named by the
# next heading). Confirm which regressor this cell is meant to use.
model = smf.ols(formula = 'SalePrice ~ IsAStudio', data = df).fit()


# Last expression of the cell: the fitted model's summary tables are displayed as output.
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.07775
Date:,"Wed, 03 Aug 2016",Prob (F-statistic):,0.78
Time:,19:00:03,Log-Likelihood:,-1847.4
No. Observations:,986,AIC:,3699.0
Df Residuals:,984,BIC:,3709.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.3811,0.051,27.088,0.000,1.281 1.481
IsAStudio,0.0829,0.297,0.279,0.780,-0.501 0.666

0,1,2,3
Omnibus:,1682.807,Durbin-Watson:,1.488
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1342290.714
Skew:,10.942,Prob(JB):,0.0
Kurtosis:,182.425,Cond. No.,5.92


> ### `SalePrice` as a function of `IsAStudio`

In [None]:
# TODO: fit the OLS model for SalePrice ~ IsAStudio and bind the result to `model`.
# Until that is filled in, `model` is whatever an earlier cell left in the kernel.

model.summary()

### Model's F-value (with significance level of `5%`)

In [None]:
# F-statistic of the fitted `model` (the value the summary table labels "F-statistic").
model.fvalue

### Corresponding p-value

In [None]:
# p-value of that F-statistic (the value the summary table labels "Prob (F-statistic)").
model.f_pvalue

## Part B1 - Linear Regression Modeling with `sklearn`

In [None]:
def summary(X, y, model):
    """Print per-regressor F-tests, R^2, and the coefficients of a fitted linear model.

    Parameters
    ----------
    X : object with a ``columns`` attribute (e.g. a DataFrame of regressors)
        Passed to ``feature_selection.f_regression`` and ``model.score``;
        ``X.columns[i]`` labels the i-th coefficient.
    y : the response values
        Passed to ``feature_selection.f_regression`` and ``model.score``.
    model : fitted estimator
        Must expose ``score(X, y)``, ``intercept_`` and an iterable ``coef_``.

    Returns
    -------
    None. Everything is written to standard output.

    Notes
    -----
    The name ``feature_selection`` must already be bound in the notebook
    (presumably via ``from sklearn import feature_selection`` -- confirm against
    the imports cell). Each ``print`` call takes a single pre-formatted string so
    the output is the same under Python 2 and Python 3.
    """
    fvalues, f_pvalues = feature_selection.f_regression(X, y)
    print('F-statistic (not joint but instead done sequentially for each regressor)')
    print('- F-value {}'.format(fvalues))
    print('- p-value {}'.format(f_pvalues))
    print('')

    print('R^2 = {}'.format(model.score(X, y)))
    print('')

    print('Coefficients')
    print('- beta_0 (Intercept) = {}'.format(model.intercept_))
    # Coefficients are numbered from 1 because beta_0 is the intercept.
    for i, coef in enumerate(model.coef_):
        print('- beta_{} ({}) = {}'.format(i + 1, X.columns[i], coef))

> ### Remove samples with `NaN` in `IsAStudio`, `Size`, or `LotSize`

In [None]:
# TODO

### SalePrice ~ IsAStudio with `statsmodels`

In [None]:
# Fit and display in one expression; the fitted result is not kept in a variable.
smf.ols(formula = 'SalePrice ~ IsAStudio', data = df).fit().summary()

> ### SalePrice ~ IsAStudio with `sklearn`

In [None]:
# Double brackets keep X a one-column DataFrame, so `X.columns` is available to summary().
X = df[ ['IsAStudio'] ]
y = df.SalePrice

# TODO: fit an sklearn linear regression on (X, y) and bind it to `model`;
# summary() reads `model.score`, `model.intercept_` and `model.coef_`.

summary(X, y, model)

### SalePrice ~ Size + LotSize with `statsmodels`

In [None]:
# Two-regressor fit, displayed directly; the fitted result is not kept in a variable.
smf.ols(formula = 'SalePrice ~ Size + LotSize', data = df).fit().summary()

> ### SalePrice ~ Size + LotSize with `sklearn`

In [None]:
# TODO

## Part B2 - Linear Regression Modeling with `sklearn` (cont.)

In [None]:
# Rebinds `df`: from this cell on it holds the advertising data, not the Zillow data.
df = pd.read_csv(os.path.join('..', 'datasets', 'advertising.csv'))

In [None]:
# Display the frame; the row/column caps set with pd.set_option in the first cell apply.
df

## Plots

> ### Sales ~ TV

In [None]:
# TODO

> ### Sales ~ Radio

In [None]:
# TODO

> ### Sales ~ Newspaper

In [None]:
# TODO

## Simple linear regressions

> ### Sales ~ TV

In [None]:
# TODO: fit Sales ~ TV and bind the result to `model_tv` (it is displayed below).

model_tv.summary()

> ### Sales ~ Radio

In [None]:
# TODO: fit Sales ~ Radio and bind the result to `model_radio` (it is displayed below).

model_radio.summary()

> ### Sales ~ Newspaper

In [None]:
# TODO: fit Sales ~ Newspaper and bind the result to `model_newspaper` (it is displayed below).

model_newspaper.summary()

## Residuals

> ### Sales ~ TV

In [None]:
# TODO

> ### Sales ~ Radio

In [None]:
# TODO

> ### Sales ~ Newspaper

In [None]:
# TODO

> ### Sales ~ TV + Radio + Newspaper

In [None]:
# TODO: fit Sales ~ TV + Radio + Newspaper and bind the result to `model`.

model.summary()

> ### Sales ~ TV + Radio

In [None]:
# TODO: fit Sales ~ TV + Radio and bind the result to `model`;
# the diagnostic-plot cells that follow read this `model`.

model.summary()

In [None]:
# Q-Q plot of the model's residuals; line = 's' requests the standardized reference line.
sm.qqplot(model.resid, line = 's')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

In [None]:
# Regression diagnostic plots for `model` against the TV regressor.
sm.graphics.plot_regress_exog(model, 'TV')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

In [None]:
# Regression diagnostic plots for `model` against the Radio regressor.
sm.graphics.plot_regress_exog(model, 'Radio')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

## Part C - Interaction Effects

### Sales ~ TV + Radio + TV * Radio

In [None]:
# In the formula language `TV * Radio` already expands to `TV + Radio + TV:Radio`, so the
# explicit `TV + Radio` is redundant but harmless; the interaction term is the one named
# 'TV:Radio' in the plot cell further down.
model = smf.ols(formula = 'Sales ~ TV + Radio + TV * Radio', data = df).fit()

model.summary()

In [None]:
# Q-Q plot of the interaction model's residuals; line = 's' requests the standardized reference line.
sm.qqplot(model.resid, line = 's')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

In [None]:
# Regression diagnostic plots for the interaction model against the TV regressor.
sm.graphics.plot_regress_exog(model, 'TV')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

In [None]:
# Regression diagnostic plots for the interaction model against the Radio regressor.
sm.graphics.plot_regress_exog(model, 'Radio')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

In [None]:
# Regression diagnostic plots against the interaction term, which the fitted model names 'TV:Radio'.
sm.graphics.plot_regress_exog(model, 'TV:Radio')

# End the cell on a statement rather than an expression so no return value is echoed.
pass

## Part D - One-hot encoding for categorical variables

In [None]:
# Rebinds `df` back to the Zillow data (it held the advertising data in Parts B2 and C).
df = pd.read_csv(os.path.join('..', 'datasets', 'zillow-07.csv'), index_col = 'ID')

In [None]:
# Remove, in place, every row whose index label belongs to a studio (IsAStudio == 1).
df.drop(df.index[(df.IsAStudio == 1).values], inplace = True)

In [None]:
# BathCount enters this fit as a single regressor column; displayed directly, result not kept.
smf.ols(formula = 'SalePrice ~ BathCount', data = df).fit().summary()

> ### What's the bathrooms' distribution in the dataset?

In [None]:
# TODO

> ### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [None]:
# TODO

> ### Let's use `pandas`'s `get_dummies` to create our one-hot encoding

In [None]:
# TODO

In [None]:
# `baths_df` is not defined in any visible cell; it is expected to come from the get_dummies TODO above.
baths_df

In [None]:
# Strip the trailing ".0" from the four dummy-column names, modifying baths_df in place.
baths_df.rename(
    columns = dict([
        ('Bath_1.0', 'Bath_1'),
        ('Bath_2.0', 'Bath_2'),
        ('Bath_3.0', 'Bath_3'),
        ('Bath_4.0', 'Bath_4'),
    ]),
    inplace = True,
)

In [None]:
# Display again to check the renamed columns.
baths_df

In [None]:
# Attach the dummy columns to `df` (baths_df is passed as a one-element list) and rebind `df`.
# NOTE(review): running this cell twice would join the same columns a second time -- confirm how that behaves.
df = df.join([baths_df])

In [None]:
# List the column labels, e.g. to check that the Bath_* columns were added by the join.
df.columns

## Activity | One-hot encoding for categorical variables

> ### `SalePrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [None]:
# TODO

> ### `SalePrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [None]:
# TODO

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [None]:
# TODO

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [None]:
# TODO