# DS-SF-34 | 09 | Linear Regression, Part 3 | Codealong | Starter Code

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

## Part A | Model Fit and Customer Retention

In [4]:
# Retention rate recorded for each year; year 0 starts at a rate of 1
years = [0, 1, 2, 3, 4, 5, 6, 7]
retention_rates = [1, .869, .743, .653, .593, .551, .517, .491]

train_df = pd.DataFrame({'Year': years, 'Retention_Rate': retention_rates})

train_df

Unnamed: 0,Retention_Rate,Year
0,1.0,0
1,0.869,1
2,0.743,2
3,0.653,3
4,0.593,4
5,0.551,5
6,0.517,6
7,0.491,7


In [8]:
# Fit an ordinary-least-squares model with `Year` as the dependent variable and
# `Retention_Rate` as the single predictor, then display the fit summary.
# NOTE(review): given this section's topic (customer retention), the intended
# model may be 'Retention_Rate ~ Year' — confirm before changing, since the
# summary recorded below was produced with the current formula.
smf.ols(formula = 'Year ~ Retention_Rate', data = train_df).fit().summary()

0,1,2,3
Dep. Variable:,Year,R-squared:,0.922
Model:,OLS,Adj. R-squared:,0.909
Method:,Least Squares,F-statistic:,70.91
Date:,"Mon, 15 May 2017",Prob (F-statistic):,0.000153
Time:,18:48:18,Log-Likelihood:,-7.7811
No. Observations:,8,AIC:,19.56
Df Residuals:,6,BIC:,19.72
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,12.3003,1.077,11.418,0.000,9.664 14.936
Retention_Rate,-12.9966,1.543,-8.421,0.000,-16.773 -9.220

0,1,2,3
Omnibus:,0.955,Durbin-Watson:,0.59
Prob(Omnibus):,0.62,Jarque-Bera (JB):,0.641
Skew:,0.313,Prob(JB):,0.726
Kurtosis:,1.763,Cond. No.,8.67


## Part B | One-Hot Encoding for Categorical Variables and SF Housing

In [None]:
# Load the Zillow CSV from the sibling `datasets` folder, indexed by its `ID` column
zillow_csv = os.path.join('..', 'datasets', 'dataset-09-zillow.csv')
df = pd.read_csv(zillow_csv, index_col = 'ID')

In [None]:
# Drop (in place) the rows flagged as studios, i.e. those with `IsAStudio == 1`
studio_ids = df.index[df.IsAStudio == 1]
df.drop(studio_ids, inplace = True)

In [None]:
# Regress `SalePrice` on the numeric `Baths` column and show the fit summary
model_baths = smf.ols(formula = 'SalePrice ~ Baths', data = df).fit()

model_baths.summary()

> ### What's the distribution of the number of bathrooms in the dataset?

In [None]:
# TODO

> ### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [None]:
# TODO

> ### Let's use `pandas`'s `get_dummies` to create a one-hot encoding of the `Baths` categorical feature (using `Bath` as the column prefix)

In [None]:
# TODO: one-hot encode the bathroom count with `pd.get_dummies` and store the
# result in `baths_df` — the cells below expect that name, with columns
# `Bath_1.0`, `Bath_2.0`, `Bath_3.0` and `Bath_4.0`.

In [None]:
# Inspect the one-hot encoded frame before its columns are renamed
baths_df

In [None]:
# Strip the trailing `.0` from the dummy column names (`Bath_1.0` -> `Bath_1`, ...)
bath_column_names = {'Bath_{}.0'.format(n): 'Bath_{}'.format(n) for n in (1, 2, 3, 4)}
baths_df.rename(columns = bath_column_names, inplace = True)

In [None]:
# Inspect `baths_df` again, now with the renamed columns
baths_df

In [None]:
# Add the one-hot bathroom columns from `baths_df` to `df`.
# NOTE(review): `df` is rebound here, so re-running this cell joins the dummy
# columns a second time — re-run from the cell that loads `df` instead.
df = df.join([baths_df])

In [None]:
# List the columns of `df` after joining `baths_df`
df.columns

### Activity | Linear modeling using `Bath`'s one-hot encoding

> ### `SalePrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [None]:
# TODO

> ### `SalePrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [None]:
# TODO

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [None]:
# TODO

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [None]:
# TODO

## Part C | The Advertising Dataset

In [None]:
# Load the advertising CSV from the sibling `datasets` folder.
# From here on `df` refers to this frame; the housing frame previously bound to `df` is replaced.
advertising_csv = os.path.join('..', 'datasets', 'dataset-09-advertising.csv')
df = pd.read_csv(advertising_csv)

In [None]:
# Display the advertising frame (pandas truncates long frames per the
# `display.max_rows` option set in the first cell)
df

### Plots

> ### Sales ~ TV

In [None]:
# Scatter plot of `Sales` against `TV` with a fitted regression line
sns.lmplot(data = df, x = 'TV', y = 'Sales')

> ### Sales ~ Radio

In [None]:
# Scatter plot of `Sales` against `Radio` with a fitted regression line
sns.lmplot(data = df, x = 'Radio', y = 'Sales')

> ### Sales ~ Newspaper

In [None]:
# Scatter plot of `Sales` against `Newspaper` with a fitted regression line
sns.lmplot(data = df, x = 'Newspaper', y = 'Sales')

### Simple linear regressions

> ### Sales ~ TV

In [None]:
# Simple linear regression of `Sales` on `TV`; the fitted result is kept in
# `model_tv` for the residual diagnostics further down
model_tv = (
    smf.ols(formula = 'Sales ~ TV', data = df)
       .fit()
)

model_tv.summary()

> ### Sales ~ Radio

In [None]:
# Simple linear regression of `Sales` on `Radio`; the fitted result is kept in
# `model_radio` for the residual diagnostics further down
model_radio = (
    smf.ols(formula = 'Sales ~ Radio', data = df)
       .fit()
)

model_radio.summary()

> ### Sales ~ Newspaper

In [None]:
# Simple linear regression of `Sales` on `Newspaper`; the fitted result is kept
# in `model_newspaper` for the residual diagnostics further down
model_newspaper = (
    smf.ols(formula = 'Sales ~ Newspaper', data = df)
       .fit()
)

model_newspaper.summary()

### Residuals

> ### Sales ~ TV

In [None]:
# Q-Q plot of the `Sales ~ TV` residuals with a standardized reference line
# (`line = 's'`); the trailing `;` keeps the returned figure from being echoed
# as the cell's output
sm.qqplot(model_tv.resid, line = 's');

In [None]:
# Regression diagnostic plots of `model_tv` for its `TV` regressor; the
# trailing `;` keeps the returned figure from being echoed as the cell's output
sm.graphics.plot_regress_exog(model_tv, 'TV');

> ### Sales ~ Radio

In [None]:
# Q-Q plot of the `Sales ~ Radio` residuals with a standardized reference line
# (`line = 's'`); the trailing `;` keeps the returned figure from being echoed
# as the cell's output
sm.qqplot(model_radio.resid, line = 's');

In [None]:
# Regression diagnostic plots of `model_radio` for its `Radio` regressor; the
# trailing `;` keeps the returned figure from being echoed as the cell's output
sm.graphics.plot_regress_exog(model_radio, 'Radio');

> ### Sales ~ Newspaper

In [None]:
# Q-Q plot of the `Sales ~ Newspaper` residuals with a standardized reference
# line (`line = 's'`); the trailing `;` keeps the returned figure from being
# echoed as the cell's output
sm.qqplot(model_newspaper.resid, line = 's');

In [None]:
# Regression diagnostic plots of `model_newspaper` for its `Newspaper`
# regressor; the trailing `;` keeps the returned figure from being echoed as
# the cell's output
sm.graphics.plot_regress_exog(model_newspaper, 'Newspaper');

> ### Sales ~ TV + Radio + Newspaper

In [None]:
# TODO: fit the multiple regression named in the heading above
# ('Sales ~ TV + Radio + Newspaper') and assign the fitted result to `model`.
# Until that assignment exists, the `model.summary()` call below has no
# `model` from this cell to report on.

model.summary()

> ### Sales ~ TV + Radio

In [None]:
# TODO: fit the regression named in the heading above ('Sales ~ TV + Radio')
# and assign the fitted result to `model`. The summary below and the residual
# plots in the following cells all read this `model`.

model.summary()

In [None]:
# Q-Q plot of the residuals of the current `model` with a standardized
# reference line (`line = 's'`); the trailing `;` keeps the returned figure
# from being echoed as the cell's output
sm.qqplot(model.resid, line = 's');

In [None]:
# Regression diagnostic plots of the current `model` for its `TV` regressor;
# the trailing `;` keeps the returned figure from being echoed as the cell's output
sm.graphics.plot_regress_exog(model, 'TV');

In [None]:
# Regression diagnostic plots of the current `model` for its `Radio` regressor;
# the trailing `;` keeps the returned figure from being echoed as the cell's output
sm.graphics.plot_regress_exog(model, 'Radio');

## Part D | Interaction Effects and Advertising

### Sales ~ TV + Radio + TV * Radio

In [None]:
# Fit `Sales` on `TV`, `Radio` and their interaction, rebinding `model` (the
# diagnostic cells below read this new fit).
# In the formula language `TV * Radio` is shorthand for
# `TV + Radio + TV:Radio`, so the main effects are listed redundantly here;
# the interaction term is named 'TV:Radio', which is the name used when
# plotting it further down.
model = smf.ols(formula = 'Sales ~ TV + Radio + TV * Radio', data = df).fit()

model.summary()

In [None]:
# Q-Q plot of the interaction model's residuals with a standardized reference
# line (`line = 's'`); the trailing `;` keeps the returned figure from being
# echoed as the cell's output
sm.qqplot(model.resid, line = 's');

In [None]:
# Regression diagnostic plots of the interaction model for its `TV` regressor;
# the trailing `;` keeps the returned figure from being echoed as the cell's output
sm.graphics.plot_regress_exog(model, 'TV');

In [None]:
# Regression diagnostic plots of the interaction model for its `Radio`
# regressor; the trailing `;` keeps the returned figure from being echoed as
# the cell's output
sm.graphics.plot_regress_exog(model, 'Radio');

In [None]:
# Regression diagnostic plots of the interaction model for the `TV:Radio`
# interaction term; the trailing `;` keeps the returned figure from being
# echoed as the cell's output
sm.graphics.plot_regress_exog(model, 'TV:Radio');