In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats
from yellowbrick.regressor import ResidualsPlot, PredictionError

import math

## Read in Pickle

In [None]:
# Load the dataset from a pickle file at a path relative to the notebook's
# working directory (presumably the output of an earlier cleaning step).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
cleaned_KC = pd.read_pickle('./data/cleaned_KC.pkl')

## Scale the price target with a logarithmic transformation

In [None]:
# Replace `price` with its natural log using one vectorized NumPy call rather
# than invoking a Python lambda once per row.
# NOTE: this overwrites `price` in place, so running the cell a second time on
# the same kernel would log-transform the column twice — reload the pickle first.
cleaned_KC['price'] = np.log(cleaned_KC['price'])

In [None]:
# Living area available per bedroom, rounded to two decimals.
# NOTE(review): a row with 0 bedrooms would yield inf here — presumably such
# rows were removed during cleaning; confirm.
cleaned_KC['sqft_living/bedroom'] = (cleaned_KC['sqft_living'] / cleaned_KC['bedrooms']).round(2)

# Round the number of bathrooms to the nearest integer to reduce noise when
# creating dummy variables. Halves are rounded UP (1.5 -> 2, 2.5 -> 3), which is
# why np.round (round-half-to-even) is not used. Done on the whole column at
# once: floor, then add 1 wherever the fractional part is at least 0.5.
whole_bathrooms = np.floor(cleaned_KC['bathrooms'])
round_up = (cleaned_KC['bathrooms'] - whole_bathrooms) >= .5
# Cast to int64 so the dummy columns are labelled e.g. `bathrooms_2`, not `bathrooms_2.0`.
cleaned_KC['bathrooms'] = (whole_bathrooms + round_up).astype('int64')

# Drop all columns not relevant to our further analysis.
# NOTE: the drop makes this cell non-idempotent — re-running it without
# reloading the data raises a KeyError because the columns are already gone.
cleaned_KC = cleaned_KC.drop(columns=['id','date','zipcode','yr_built','grade','condition','view','floors','sqft_lot','sqft_living15','sqft_lot15'])
cleaned_KC.head()

In [None]:
# Correlation of every column with `price`, listed from the most positive
# to the most negative.
(
    cleaned_KC
    .corr()
    .loc[:, 'price']
    .sort_values(ascending=False)
)

In [None]:
# Columns to one-hot encode.
categorial_columns = ['bathrooms','bedrooms']
# NOTE(review): this list is not used anywhere in the visible cells.
non_categorial_columns = ['price','sqft_living', 'sqft_living/bedroom']

# One-hot encode the categorical columns, then remove only the target (`price`).
# Every other column — including the non-categorical ones — is kept as a
# candidate feature.
dummies_df = pd.get_dummies(cleaned_KC, columns=categorial_columns).drop(columns=['price'])

# Preview the first rows instead of rendering the whole frame.
dummies_df.head()

We loop over each feature, fit a separate single-feature Ridge regression on it, and compare the resulting R-squared values.

In [None]:
# The target is the same for every feature, so build it once outside the loop.
# Reshaped to a single column, matching the shape of each X below.
y = cleaned_KC['price'].values.reshape(-1, 1)

for column in dummies_df.columns:

    # Single-feature design matrix: scikit-learn estimators expect 2-D input,
    # hence the reshape to one column.
    X = dummies_df[column].values.reshape(-1, 1)

    # Fit a Ridge regression (default settings) on this one feature alone.
    ridge = Ridge()
    ridge.fit(X, y)

    # There is no train/test split here: predictions are made on the same rows
    # the model was fitted on, so the score below is an in-sample R-squared.
    y_pred = ridge.predict(X)

    # Use the built-in round() rather than the `.round()` method: the method
    # exists only on NumPy scalars, while round() also works if r2_score
    # returns a plain Python float.
    r_squared = round(r2_score(y, y_pred), 3)
    print('r-squared value for {}: {}'.format(column,r_squared))

Having 2 bathrooms is the least predictive of price compared with the other bathroom counts.

It would have been valuable to predict price using zipcodes + `sqft_lot15`, or even by geomapping the lat/long data, which we dropped. It would also be worth running categorical tests for the number of bathrooms and bedrooms.

Things we can do to improve the model:
- Feature Engineering
- Train/test split at the beginning of data cleaning to prevent data leakage
- Properly remove outliers from the dataset - not using z-scores, since we can't assume a normal distribution of the features
- Dummy variables (`pd.get_dummies`) vs `OneHotEncoder`
- Cross Validation