# OLS Regression - Simple Train and Test

## Required Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Notebook Settings

In [None]:
# set working directory
# NOTE: "." is the current directory, so this call leaves the working directory
# unchanged; replace "." with a real path if the notebook must run from elsewhere.
os.chdir(".")

# make sure it is set right: print the directory that relative paths
# (such as the data file loaded below) will be resolved against
print(os.getcwd())

# make sure plots display in notebook (IPython magic; only valid inside a notebook/IPython session)
%matplotlib inline

## Data Load

Weekly marketing spend (in thousands) by channel and the corresponding product sales (in millions).

In [None]:
# Load the advertising data; the first CSV column holds the row labels,
# so it becomes the DataFrame index rather than a feature.
ad_df = pd.read_csv(
    filepath_or_buffer="../data/Advertising.csv",
    index_col=[0],
)

# Preview the first five rows.
ad_df.head(n=5)

## EDA

In [None]:
# describe the dataset: per-column summary statistics
# (count, mean, std, min, quartiles, max) for the numeric columns
ad_df.describe()

In [None]:
# look at the data types: prints each column's dtype and non-null count,
# which also reveals whether any values are missing
ad_df.info()

In [None]:
# Pairwise relationships: scatter plots for every pair of columns,
# with each column's own distribution on the diagonal.
sns.pairplot(data=ad_df)

In [None]:
# Distribution of the target variable.
# `sns.distplot` is deprecated (seaborn >= 0.11) and slated for removal; `histplot`
# with a density-scaled histogram and a KDE overlay is the documented replacement.
# `cut=3` extends the KDE curve past the data range the way `distplot` did.
sns.histplot(ad_df["sales"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Correlation between the features only (the target column is dropped),
# useful for spotting collinear predictors before fitting.
feature_corr = ad_df.drop(columns="sales").corr()

# Annotated, square-celled heatmap of the correlation matrix.
sns.heatmap(feature_corr, annot=True, cmap="YlGnBu", square=True)

## Training

In [None]:
# NOTE: `train_test_split` is imported in the notebook's import cell at the top,
# so that all dependencies are visible in one place.

# feature set: every column except the target
X = ad_df.drop(columns="sales")

# target
y = ad_df["sales"]

# creating training / testing datasets: hold out 33% of the rows for testing;
# the fixed random_state makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# sanity check: the split must partition the data without losing or duplicating rows
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# specifying the regression model (ordinary least squares); it is fitted in a later cell
ols = linear_model.LinearRegression()

In [None]:
# Row counts of the two partitions, one per line: training first, then test.
print(len(X_train.index), len(X_test.index), sep="\n")

In [None]:
# Estimate the regression coefficients from the training partition only;
# the test partition stays untouched until evaluation.
ols.fit(X=X_train, y=y_train)

## Assumption Checking

Errors should be independently and identically normally distributed with a mean of 0 and a fixed variance.

### Residuals vs. Fitted Values

In [None]:
# Fitted values on the training data (predict once and reuse the result).
fitted_values = ols.predict(X_train)

# Residual = observed - fitted. With this sign convention a positive residual
# means the model under-predicted the observation.
residuals = y_train - fitted_values

# Residuals vs. fitted values: under the OLS assumptions the points should form
# a structureless band of constant width around the zero line.
fig, ax = plt.subplots()
ax.scatter(fitted_values, residuals)
ax.axhline(0, color="grey", linestyle="--", linewidth=1)  # reference line at zero error
ax.set(
    xlabel="Fitted values",
    ylabel="Residuals (actual - fitted)",
    title="Residuals vs. Fitted Values",
);

### Distribution of Residuals

In [None]:
# Distribution of the training residuals; it should look roughly normal and be
# centred on zero if the error assumptions hold.
# `sns.distplot` is deprecated (seaborn >= 0.11) and slated for removal; `histplot`
# with a density-scaled histogram and a KDE overlay is the documented replacement.
sns.histplot(residuals, kde=True, stat="density", kde_kws=dict(cut=3))

## Model Interpretation

### Feature Importance?

In [None]:
# Pair each feature name with its fitted coefficient and print the pair as a tuple.
# The coefficients follow the column order of the training frame.
for feature, coefficient in zip(X_train.columns, ols.coef_):
    print((feature, coefficient))

### Intercept?

In [None]:
# fitted intercept: the constant term of the regression equation,
# i.e. the model's prediction when every feature equals zero
ols.intercept_

## Testing

In [None]:
# Model predictions for the held-out rows.
predicted = ols.predict(X_test)

# Actual and predicted values side by side, keeping the index of the test rows.
validate = y_test.to_frame(name="actual").assign(predicted=predicted)

# Preview the first ten comparisons.
validate.head(n=10)

## Quality of Fit

In [None]:
# mean squared error on the test set, computed by hand:
# the average of the squared differences between actual and predicted values
mse = np.sum((validate['actual'] - validate['predicted'])**2) / len(validate)

# cross-check the manual calculation against scikit-learn's implementation
# (metric signature is (y_true, y_pred))
assert np.isclose(mse, mean_squared_error(y_test, predicted))

print("The Mean Squared Error is " + str(mse))

# r squared: the share of variance in the test target explained by the model.
# Argument order matters here: r2_score(y_true, y_pred) is NOT symmetric, so the
# actual values must come first or the reported score is wrong.
r2 = r2_score(y_test, predicted)

print("The R-Squared is " + str(r2))

## Next Steps

Can you spot any ways to improve the model? Hint: Check the assumptions!!!
What about p-values?

## References

Data sourced from *An Introduction to Statistical Learning with Applications in R*
by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.