# Import File and Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import datetime as dt

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
# Load the house data CSV into the working DataFrame and preview the first rows

dfh = pd.read_csv(filepath_or_buffer='kc3_house_data.csv')
dfh.head(n=5)

# Linear Regression Using Statsmodels

In [None]:
# Single-feature OLS: regress price on one predictor at a time (here, grade)
# to see how much that feature contributes on its own.

formula = 'price ~ grade'
model = ols(formula, data=dfh).fit()
model.summary()

In [None]:
# Remove the 'renovated' column from the working frame in place
# (presumably judged uninformative by the single-feature regressions -- confirm).
dfh.drop(columns=['renovated'], inplace=True)

In [None]:
# Remove the 'SD_VashonIsland' column from the working frame in place
# (presumably judged uninformative by the single-feature regressions -- confirm).
dfh.drop(columns=['SD_VashonIsland'], inplace=True)

In [None]:
# Build the formula for a regression of price on every remaining column:
# "<outcome>~<col1>+<col2>+...".

outcome = 'price'
predictors = dfh.drop(columns=[outcome])
pred_sum = '+'.join(predictors.columns)
formula = '~'.join([outcome, pred_sum])

In [None]:
# Fit the multi-feature OLS model described by `formula` and show its summary table
model = ols(formula, data=dfh).fit()
model.summary()

## Model Validation

### Validate the model's mean squared error and its ability to predict values using a train-test split

In [None]:
# Discard, in place, every row that contains at least one missing value
# so the scikit-learn steps below receive complete rows.
dfh.dropna(axis=0, how='any', inplace=True)

In [None]:
# Sanity check: number of missing values per column
dfh.isna().sum()

In [None]:
# Separate the feature matrix (every column except price) from the target.
# The target is kept as a one-column DataFrame rather than a Series.
X = dfh.drop(columns=['price'])
y = dfh.loc[:, ['price']]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Row counts of the four splits, in the order X_train, X_test, y_train, y_test
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training split only (fit() returns the estimator itself) ...
linreg = LinearRegression().fit(X_train, y_train)

# ... then predict for both splits so train and test error can be compared.
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Compare in-sample and out-of-sample error of the fitted linear regression:
# a test MSE far above the train MSE would point to overfitting.
train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squared Error:', train_mse)
print('Test Mean Squared Error:', test_mse)

In [None]:
# Residuals of the fitted statsmodels OLS results currently bound to `model`,
# kept for the diagnostic plots that follow.
resid = model.resid

In [None]:
# Q-Q plot of the residuals against a normal distribution whose parameters are
# fitted to the data (fit=True); line='45' adds the 45-degree reference line.
fig = sm.graphics.qqplot(
    resid,
    fit=True,
    dist=stats.norm,
    line='45',
)

In [None]:
# Regression diagnostic plots for the "grade" predictor, drawn on a 15x8 figure
fig = plt.figure(figsize=(15, 8))
fig = sm.graphics.plot_regress_exog(results=model, exog_idx="grade", fig=fig)
plt.show()

## Cross Validation

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Mean cross-validated score for 5, 10 and 20 folds on the full X / y.
# The scorer is the *negated* MSE, so these values are <= 0 and closer to zero is better.
cv_5_results, cv_10_results, cv_20_results = (
    np.mean(cross_val_score(linreg, X, y, cv=n_folds, scoring='neg_mean_squared_error'))
    for n_folds in (5, 10, 20)
)

In [None]:
# One averaged (negated-MSE) score per line: 5-fold, 10-fold, 20-fold
print(cv_5_results, cv_10_results, cv_20_results, sep='\n')