Links from Marisa:

RFE: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

OHE options: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

### Import the required packages

In [7]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt 
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import sklearn.preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from scipy.stats import stats

import warnings
warnings.filterwarnings("ignore")

### Function to loop ols iterations, will be run later on...

In [8]:
def ols_run(X, y, run):
    Xconst = sm.add_constant(X)
    
    ols_model = sm.OLS(y, Xconst, hasconst= True)
    est = ols_model.fit()
    
    results_as_html = results.tables[1].as_html()
    ols_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
    ols_df = ols_df.reset_index()
    ols_df = ols_df.rename(columns={'index':'feature'});

    kill_cols = ols_df['feature'][(ols_df['t'] > -1) & (ols_df['t'] < 1)]
    
    sum_residuals = sum(est.resid) - sum(y_train)
    r2_adj = est.rsquared_adj

    run_dict = {'ols_run': run, \
                'sum_resid': sum_residuals, \
                'r2_adj': r2_adj, \
                'cols': Xconst.shape[1] - 1,\
                'kill_cols': kill_cols.shape}
    run_df = pd.DataFrame.from_dict([run_dict])
    
    return [kill_cols, run_df]

### Import the data and create X df for independant variables and y df for dependant variables

In [9]:
data = pd.read_csv('data_cleaned.csv', index_col=0)
data = data.drop(columns=['id', 'long', 'lat', 'date'])
X_cols = data.drop(columns=['price'])
y = data.price.values

### Review histogram from the dependant variables

In [10]:
# pd.DataFrame.hist(X_cols, figsize = [15,15]);

# residuals of overall model are important not normality of variables

### Prepare data for the one hot encoding and column transformation
Need to create data frame and lists of categorical variables and continuous variables

In [11]:
categories = X_cols[['bathrooms', 'bedrooms', 'floors', 'condition', 'grade', 'attic', \
                     'zipcode', 'view', 'yr_built']]
categories_ls = list(categories.columns)
x_cols_ls = list(X_cols.columns)
numeric_vars = [x for x in x_cols_ls if x not in categories]

### Create a minimum/maximum value dictionary
This dictionary records the minimum/maximum values for the continuous, independant variables to return to normal values after normalization.  Convert the dicitonary to a dataframe and save as a csv.

In [12]:
# min_max_dict = {}

# for c in X_cols:
#     if c in categories.columns:
#         pass
#     else:
#         min_max_dict.update({c: [X_cols[c].min(), X_cols[c].max()]})

# min_max_df = pd.DataFrame.from_dict(min_max_dict)
# min_max_df.to_csv('min_max_vals')

In [13]:
### Create ohe headers (based on categorical values) and continuous variable headers

### Min/max scaling for independant, continuous variables and convert categorical variables to strings

In [14]:
# Create dictionary of category variable names, counts, and a sorted list of values
category_counts = {}

for cat in categories:
    l = len(list(X_cols[cat].unique())) - 1
    vals = sorted(list(X_cols[cat].unique().round(3)))
    category_counts.update({cat: [l, vals[1:]]})

ohe_cols = []
for cc in category_counts:
    for r in range(0, category_counts[cc][0]):
        column = '{}_{}'.format(cc, category_counts[cc][1][r])
        ohe_cols.append(column)

In [15]:
min_max_scaler = sklearn.preprocessing.MinMaxScaler()

for c in X_cols.columns:
    if c in categories:
        X_cols[c] = X_cols[c].astype(str)
    else:
        x = np.array(X_cols[c]).reshape(-1, 1)
        X_cols[c] = min_max_scaler.fit_transform(x)

### One hot encoding for housing data

In [16]:
## create an encoder object. This will help us to convert
## categorical variables to new columns
encoder = OneHotEncoder(handle_unknown= 'error',
                        drop='first',
                        categories= 'auto',
                        sparse=True)

## Create an columntransformer object.
## This will help us to merge transformed columns
## with the rest of the dataset.

ct = ColumnTransformer(transformers =[('ohe', encoder, categories_ls)], remainder= 'passthrough')
ct.fit(X_cols[x_cols_ls])
X = ct.transform(X_cols)
X = X.todense()

In [17]:
# Create columns for X df
columns = ohe_cols + numeric_vars

In [18]:
X = pd.DataFrame(X, columns=columns)

### Split data into train and test groups

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train.to_csv('X_train.csv')

In [20]:
import pickle
pickle_out = open('y_train.pickle','wb')
pickle.dump(y_train, pickle_out)
pickle_out.close()

### Loop OLS Sumary data tracking r2 adjusted, total residual value, total columns, and run number

In [21]:
run1 = ols_run(X_train, y_train, 1)
X_train_chop = X_train.drop(columns=run1[0])
ols_df = run1[1]

NameError: name 'results' is not defined

In [22]:
for run in range(2, 100):
    ols_data = ols_run(X_train_chop, y_train, run)
    ols_df = pd.concat([ols_df, ols_data[1]], axis=0)
    X_train_chop = X_train.drop(columns=ols_data[0])

NameError: name 'X_train_chop' is not defined

In [23]:
# ols_df

In [24]:
# run = ols_run(X_train_chop, y_train, 2, ols_df)
# ols_df = pd.concat([ols_df, run[1]], axis=0)
# ols_df

### Code for a one-time OLS stats model summary, used to populate the function and loop above

In [32]:
Xconst = sm.add_constant(X_train)

ols_model = sm.OLS(y_train, Xconst, hasconst= True)
est = ols_model.fit()
est.summary()

results = est.summary();
results


0,1,2,3
Dep. Variable:,y,R-squared:,0.837
Model:,OLS,Adj. R-squared:,0.834
Method:,Least Squares,F-statistic:,325.7
Date:,"Tue, 24 Mar 2020",Prob (F-statistic):,0.0
Time:,21:20:06,Log-Likelihood:,-215180.0
No. Observations:,16149,AIC:,430900.0
Df Residuals:,15898,BIC:,432800.0
Df Model:,250,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.363e+05,1.16e+05,1.178,0.239,-9.05e+04,3.63e+05
bathrooms_0.75,-2.272e+04,1.09e+05,-0.209,0.834,-2.36e+05,1.9e+05
bathrooms_1.0,532.7872,1.06e+05,0.005,0.996,-2.08e+05,2.09e+05
bathrooms_1.25,-1.309e+05,1.23e+05,-1.067,0.286,-3.72e+05,1.1e+05
bathrooms_1.5,-5709.2518,1.06e+05,-0.054,0.957,-2.14e+05,2.03e+05
bathrooms_1.75,-3779.8792,1.06e+05,-0.036,0.972,-2.12e+05,2.05e+05
bathrooms_2.0,-3646.3081,1.06e+05,-0.034,0.973,-2.12e+05,2.05e+05
bathrooms_2.25,1.154e+04,1.06e+05,0.108,0.914,-1.97e+05,2.2e+05
bathrooms_2.5,9650.7432,1.06e+05,0.091,0.928,-1.99e+05,2.18e+05

0,1,2,3
Omnibus:,13271.129,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2657494.269
Skew:,3.135,Prob(JB):,0.0
Kurtosis:,65.531,Cond. No.,2.65e+16


In [4]:
results_as_html = results.tables[1].as_html()
ols_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
ols_df = ols_df.reset_index()
ols_df = ols_df.rename(columns={'index':'feature'});

NameError: name 'results' is not defined

In [5]:
pd.set_option('display.max_rows', None)
ols_df.sort_values(by='t', ascending=False)

kill_cols = ols_df['feature'][(ols_df['t'] > -1) & (ols_df['t'] < 1)]

NameError: name 'pd' is not defined

In [6]:
sum_residuals = sum(est.resid) - sum(y_train)
r2_adj = est.rsquared_adj

ols_dict = {'ols_run': 1, 'sum_resid': sum_residuals, 'r2_adj': r2_adj, 'cols': Xconst.shape[1] - 1}
ols_df = pd.DataFrame.from_dict([ols_dict])

NameError: name 'est' is not defined

In [255]:
ols_df

Unnamed: 0,ols_run,sum_resid,r2_adj,cols
0,1,-8754851000.0,0.834061,252


In [256]:
X_train_chop = X_train.drop(columns=kill_cols)

In [257]:
X_train_chop.shape == X_train.shape

False

In [258]:
ols_run(X_train_chop, y_train, 2)


[1      bathrooms_0.75
 2       bathrooms_1.0
 4       bathrooms_1.5
 5      bathrooms_1.75
 6       bathrooms_2.0
 7      bathrooms_2.25
 8       bathrooms_2.5
 9      bathrooms_2.75
 10      bathrooms_3.0
 11     bathrooms_3.25
 12      bathrooms_3.5
 13     bathrooms_3.75
 14      bathrooms_4.0
 24      bathrooms_6.5
 26      bathrooms_7.5
 30         bedrooms_3
 31         bedrooms_4
 32         bedrooms_5
 33         bedrooms_6
 39         floors_1.5
 43         floors_3.5
 58            attic_1
 59      zipcode_98002
 60      zipcode_98003
 70      zipcode_98022
 76      zipcode_98030
 78      zipcode_98032
 84      zipcode_98042
 126     zipcode_98198
 135     yr_built_1904
 137     yr_built_1906
 139     yr_built_1908
 140     yr_built_1909
 141     yr_built_1910
 142     yr_built_1911
 143     yr_built_1912
 145     yr_built_1914
 146     yr_built_1915
 147     yr_built_1916
 148     yr_built_1917
 150     yr_built_1919
 151     yr_built_1920
 152     yr_built_1921
 153     yr

### Loop ols regressions

In [2]:
def ols_run(X, y, run):
    Xconst = sm.add_constant(X)
    
    ols_model = sm.OLS(y, Xconst, hasconst= True)
    est = ols_model.fit()
    
    results_as_html = results.tables[1].as_html()
    ols_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
    ols_df = ols_df.reset_index()
    ols_df = ols_df.rename(columns={'index':'feature'});

    kill_cols = ols_df['feature'][(ols_df['t'] > -1) & (ols_df['t'] < 1)]
    
    sum_residuals = sum(est.resid) - sum(y_train)
    r2_adj = est.rsquared_adj

    run_dict = {'ols_run': run, 'sum_resid': sum_residuals, 'r2_adj': r2_adj, 'cols': Xconst.shape[1] - 1}
    run_df = pd.DataFrame.from_dict([run_dict])
    
    return [kill_cols, run_df]