# Feature Analysis

In [None]:
# TODO: scikit-learn's recursive feature elimination (sklearn.feature_selection.RFE) could automate the manual feature selection done below

In [9]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [10]:
# Load the cleaned data set (the first CSV column becomes the index) and
# discard the columns that are excluded from this analysis.
data = pd.read_csv('data_cleaned.csv', index_col=0).drop(
    columns=['id', 'zipcode', 'long', 'lat', 'sqft_above', 'date', 'view']
)

# Candidate predictors: every remaining column except the target, price.
features = data.drop(columns=['price'])

In [11]:
# Fit one simple OLS regression (price ~ const + feature) per candidate
# feature and record its p-values and goodness of fit.
y = data.price.values  # the response vector is identical for every fit
feature_tests = {}

for column in features:
    X = features[column].values
    Xconst = sm.add_constant(X)  # design matrix: [const, feature]

    fitted_model = sm.OLS(y, Xconst, hasconst=True).fit()

    # pvalues follows the column order of Xconst: index 0 is the constant,
    # index 1 is the feature itself.
    p_const = fitted_model.pvalues[0]
    p_x = fitted_model.pvalues[1]

    feature_tests[column] = [
        p_const,
        p_x,
        fitted_model.rsquared,
        fitted_model.rsquared_adj,
    ]

In [12]:
# Tabulate the per-feature regression statistics (one row per feature) and
# display them ranked by the R^2 of each single-feature model.
result_cols = ['p_const', 'p_x', 'r2', 'r2_adj']
results = pd.DataFrame(
    list(feature_tests.values()),
    index=list(feature_tests.keys()),
    columns=result_cols,
)
results.sort_values('r2', ascending=False)

Unnamed: 0,p_const,p_x,r2,r2_adj
sqft_living,1.121644e-21,0.0,0.492241,0.492217
grade,0.0,0.0,0.446652,0.446626
sqft_living15,5.7198430000000004e-39,0.0,0.34261,0.34258
bathrooms,0.0776788,0.0,0.275691,0.275657
sqft_basement,0.0,0.0,0.10181,0.101769
bedrooms,4.285961e-33,0.0,0.099373,0.099332
floors,0.0,0.0,0.066085,0.066041
sqft_lot,0.0,2.899312e-40,0.008161,0.008115
sqft_lot15,0.0,2.191635e-34,0.006921,0.006875
yr_built,2.093523e-06,1.571686e-15,0.002947,0.0029


In [13]:
# Pairwise correlation matrix of all remaining columns (computed once and
# kept in `corr` for reuse).
corr = data.corr()

# Rank every column by its correlation with price, strongest first.
corr['price'].sort_values(ascending=False)

price            1.000000
sqft_living      0.701599
grade            0.668320
sqft_living15    0.585329
bathrooms        0.525063
sqft_basement    0.319077
bedrooms         0.315236
floors           0.257069
sqft_lot         0.090341
sqft_lot15       0.083194
yr_built         0.054284
attic            0.051796
condition        0.035367
Name: price, dtype: float64

### Conclusions from the initial single-variable analysis
- sqft_living is the feature most strongly correlated with price, with a correlation coefficient (r) of 0.702 and an R² of 0.492
- Second is grade, with a correlation coefficient of 0.668 and an R² of 0.447
- Third is sqft_living15, with a correlation coefficient of 0.585 and an R² of 0.343
- Fourth is bathrooms, with a correlation coefficient of 0.525 and an R² of 0.276

### Multiple linear regression analysis

In [14]:
# Forward selection, step 2: keep sqft_living as the first regressor and try
# each remaining feature as the second one.
sqft_live = features.drop(columns=['sqft_living'])
mlr_results = {}

# The response vector is the same for every candidate, so build it once.
y = data.price.values

for column in sqft_live:
    X = data[['sqft_living', column]].values
    Xconst = sm.add_constant(X)  # design matrix: [const, sqft_living, column]

    model = sm.OLS(y, Xconst, hasconst=True)
    mm_fitted = model.fit()
    # (The per-iteration summary() call was removed: its return value was
    # discarded, so it produced no output and only cost time.)

    # pvalues follows the column order of Xconst.
    # NOTE: 'p_coef' is the p-value of the constant (intercept) term.
    p_coef = mm_fitted.pvalues[0]
    px1 = mm_fitted.pvalues[1]  # sqft_living
    px2 = mm_fitted.pvalues[2]  # candidate feature
    r2_adj = mm_fitted.rsquared_adj

    mlr_results[column] = [p_coef, px1, px2, r2_adj]

# One row per candidate feature, ranked by adjusted R^2 of the two-regressor model.
mlr_df_cols = ['p_coef', 'px1', 'px2', 'r2_adj']
mlr_df = pd.DataFrame.from_dict(mlr_results, orient='index', columns=mlr_df_cols)
mlr_df.sort_values(by='r2_adj', ascending=False)

Unnamed: 0,p_coef,px1,px2,r2_adj
grade,0.0,0.0,0.0,0.53471
yr_built,1.092243e-298,0.0,1.9746990000000003e-304,0.523964
bedrooms,2.407631e-43,0.0,1.0888929999999999e-148,0.507855
sqft_living15,4.064362e-72,0.0,4.90748e-67,0.499202
condition,1.433475e-76,0.0,1.544376e-57,0.498187
attic,6.508034e-32,0.0,1.669505e-48,0.497221
sqft_lot15,6.078376e-21,0.0,1.938843e-22,0.494429
sqft_lot,4.787515e-22,0.0,3.980467e-11,0.493222
sqft_basement,8.2022549999999995e-19,0.0,1.240722e-05,0.492644
floors,9.297806e-18,0.0,0.04652254,0.492287


### With grade added as the second regressor, search for the third

In [17]:
# Forward selection, step 3: keep sqft_living and grade as the first two
# regressors and try each remaining feature as the third one.
gr = features.drop(columns=['sqft_living', 'grade'])
mlr2_results = {}

# The response vector is the same for every candidate, so build it once.
y = data.price.values

for column in gr:
    X = data[['sqft_living', 'grade', column]].values
    Xconst = sm.add_constant(X)  # design matrix: [const, sqft_living, grade, column]

    model = sm.OLS(y, Xconst, hasconst=True)
    mm_fitted = model.fit()
    # (The per-iteration summary() call was removed: its return value was
    # discarded, so it produced no output and only cost time.)

    # pvalues follows the column order of Xconst.
    # NOTE: 'p_coef' is the p-value of the constant (intercept) term.
    p_coef = mm_fitted.pvalues[0]
    px1 = mm_fitted.pvalues[1]  # sqft_living
    px2 = mm_fitted.pvalues[2]  # grade
    px3 = mm_fitted.pvalues[3]  # candidate feature
    r2_adj = mm_fitted.rsquared_adj

    mlr2_results[column] = [p_coef, px1, px2, px3, r2_adj]

# One row per candidate feature, ranked by adjusted R^2 of the three-regressor model.
mlr2_df_cols = ['p_coef', 'px1', 'px2', 'px3', 'r2_adj']
mlr2_df = pd.DataFrame.from_dict(mlr2_results, orient='index', columns=mlr2_df_cols)
mlr2_df.sort_values(by='r2_adj', ascending=False)

Unnamed: 0,p_coef,px1,px2,px3,r2_adj
yr_built,0.0,0.0,0.0,0.0,0.603625
condition,0.0,0.0,0.0,2.5507080000000003e-128,0.547073
attic,0.0,0.0,0.0,5.08389e-92,0.54355
bedrooms,8.22702e-201,0.0,0.0,1.96191e-88,0.543201
sqft_basement,0.0,0.0,0.0,5.104108e-69,0.541305
floors,0.0,0.0,0.0,1.933985e-35,0.538013
bathrooms,0.0,0.0,0.0,3.456395e-29,0.537399
sqft_lot15,0.0,0.0,0.0,1.10442e-18,0.536368
sqft_lot,0.0,0.0,0.0,1.252743e-08,0.535388
sqft_living15,0.0,0.0,0.0,0.0001074079,0.535013
