### This notebook provides supporting information for the project presentation

In [1]:
import pickle
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.preprocessing
import statsmodels.api as sm
from pandas.plotting import scatter_matrix
from scipy.stats import stats
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import warnings
warnings.filterwarnings("ignore")

### Define onetime_ols function for access to model

In [2]:
def onetime_ols(X_train, y_train):
    """Fit an ordinary least squares model on the training data.

    A constant column is added to ``X_train`` with ``sm.add_constant`` and
    ``y_train`` is then regressed on the result with ``sm.OLS``.

    Parameters
    ----------
    X_train
        Predictors handed to ``sm.add_constant``.
    y_train
        Response handed to ``sm.OLS`` as its first argument.

    Returns
    -------
    tuple
        ``(est, results)``: ``est`` is the object returned by ``fit()`` and
        ``results`` is the object returned by ``est.summary()``.
    """
    # Add the intercept column and fit in one chained expression.
    est = sm.OLS(y_train, sm.add_constant(X_train), hasconst=True).fit()
    return est, est.summary()

### Retrieve data

In [3]:
# Feature matrices are read from CSV; the first column is used as the row index.
# NOTE(review): training features come from 'X_train2.csv' while test features
# come from 'X_test.csv' -- confirm both files share the same columns.
X_train = pd.read_csv('../data/X_train2.csv', index_col=0)
X_test = pd.read_csv('../data/X_test.csv', index_col=0)

# Targets are read from pickle files. The context managers close each file
# handle as soon as it has been read instead of leaving it open.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project.
with open('../data/y_train.pickle', 'rb') as pickle_in:
    y_train = pickle.load(pickle_in)

with open('../data/y_test.pickle', 'rb') as pickle_in:
    y_test = pickle.load(pickle_in)

### Calculate log of dependent variables

In [4]:
# Natural log of both target sets; the model below is fitted on the training one.
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

### Recreate model from training data and capture in dataframe

In [38]:
# Fit OLS on the log-transformed training target; keep the fitted model and its summary.
model, results = onetime_ols(X_train=X_train, y_train=y_train_log)

In [39]:
# The summary table at index 1 holds the per-feature coefficient statistics
# (coef, std err, t, P>|t|, confidence bounds).
results_as_html = results.tables[1].as_html()
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated since pandas 2.1.
model_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]

### Invert the log function applied to the dependent variable

In [40]:
# Display the coefficient table parsed from the OLS summary.
model_df

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,11.6295,0.174,66.758,0.0,11.288,11.971
zipcode_98002,-0.0526,0.016,-3.249,0.001,-0.084,-0.021
zipcode_98004,1.07,0.013,80.196,0.0,1.044,1.096
zipcode_98005,0.6811,0.017,39.63,0.0,0.647,0.715
zipcode_98006,0.5911,0.011,52.294,0.0,0.569,0.613
zipcode_98007,0.613,0.018,33.537,0.0,0.577,0.649
zipcode_98008,0.6282,0.014,45.558,0.0,0.601,0.655
zipcode_98010,0.2029,0.022,9.065,0.0,0.159,0.247
zipcode_98011,0.4339,0.016,27.391,0.0,0.403,0.465
zipcode_98014,0.2979,0.02,14.555,0.0,0.258,0.338


In [41]:
# model_df.sort_values(by=['coef'], ascending=False)[:10]

In [42]:
# pd.set_option('display.max_rows', None)
# The model was fitted on the log of the target, so exp(coef) turns each
# coefficient into a multiplicative factor on the untransformed target.
# np.exp is applied to the whole column at once rather than element by element.
model_df['coef_exp'] = np.exp(model_df['coef'])
model_df.sort_values(by=['coef_exp'], ascending=False)

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975],coef_exp
const,11.6295,0.174,66.758,0.0,11.288,11.971,112364.125798
zipcode_98039,1.1806,0.03,39.041,0.0,1.121,1.24,3.256327
zipcode_98004,1.07,0.013,80.196,0.0,1.044,1.096,2.915379
zipcode_98112,1.0104,0.015,67.344,0.0,0.981,1.04,2.746699
zipcode_98109,0.9376,0.022,43.136,0.0,0.895,0.98,2.553845
zipcode_98119,0.9339,0.017,54.858,0.0,0.901,0.967,2.544413
zipcode_98102,0.9242,0.021,43.512,0.0,0.883,0.966,2.519852
zipcode_98105,0.8982,0.016,57.581,0.0,0.868,0.929,2.45518
zipcode_98040,0.8206,0.014,57.011,0.0,0.792,0.849,2.271863
zipcode_98199,0.82,0.013,61.052,0.0,0.794,0.846,2.2705


In [44]:
# Move the feature names out of the index into a 'feature' column, leaving a
# default integer index. This step used to be commented out, so the displayed
# table could not be reproduced from a fresh kernel. The guard keeps the cell
# idempotent: re-running it does not reset the index a second time.
if 'feature' not in model_df.columns:
    model_df = model_df.rename_axis('feature').reset_index()
model_df

Unnamed: 0,feature,coef,std err,t,P>|t|,[0.025,0.975],coef_exp
0,const,11.6295,0.174,66.758,0.0,11.288,11.971,112364.125798
1,zipcode_98002,-0.0526,0.016,-3.249,0.001,-0.084,-0.021,0.948759
2,zipcode_98004,1.07,0.013,80.196,0.0,1.044,1.096,2.915379
3,zipcode_98005,0.6811,0.017,39.63,0.0,0.647,0.715,1.97605
4,zipcode_98006,0.5911,0.011,52.294,0.0,0.569,0.613,1.805974
5,zipcode_98007,0.613,0.018,33.537,0.0,0.577,0.649,1.845961
6,zipcode_98008,0.6282,0.014,45.558,0.0,0.601,0.655,1.874234
7,zipcode_98010,0.2029,0.022,9.065,0.0,0.159,0.247,1.22495
8,zipcode_98011,0.4339,0.016,27.391,0.0,0.403,0.465,1.543265
9,zipcode_98014,0.2979,0.02,14.555,0.0,0.258,0.338,1.347027


In [10]:
# Display the log-transformed training target.
y_train_log

array([14.1006895 , 13.59111622, 11.98292909, ..., 12.5776362 ,
       13.05408454, 13.47302025])