In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Data Exploration

In this section, I will be taking steps to explore the data and see what I am working with. I look for null values, value counts of specific values, and column names.

In [None]:
# Load the kc_house_data.csv house-sales file into a DataFrame.
# The path is relative to the notebook's working directory.
house= pd.read_csv('../../../data/kc_house_data.csv')
#reading the data file into the learn-env

In [None]:
# Preview the first rows to get a feel for the columns and their values.
house.head()

In [None]:
# Show the column names as a plain Python list, which is easier to read than the Index repr.
house.columns.to_list()

In [None]:
# Summarise missing values: absolute null count per column and the share of
# rows that are null, both ordered from most to least missing.
null_mask = house.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()

In [None]:
# Inspect the dtype pandas inferred for every column.
house.dtypes
#seeing what types of data we are working with

In [None]:
# Summary statistics for the numeric columns. `describe` is a method, so it
# must be called; without the parentheses the cell only shows the bound method.
house.describe()

## Data Cleaning

In this part of the notebook, I am focusing on getting the data cleaned and easier to manipulate so that we can get a baseline model and feature engineer to create more models.

In [None]:
#Getting rid of the question marks in the sqft_basement so that I can make a new column turning those values into booleans
# errors='coerce' converts every entry that cannot be parsed as a number into NaN.
house['sqft_basement'] = pd.to_numeric(house.sqft_basement, errors='coerce')

In [None]:
#Any values in the column sqft_basement greater than zero are set to True in the new column and those with 0 are set to false.
# NaN compares as False here, so rows whose basement size could not be parsed also end up False.
house['basement'] = house['sqft_basement'] > 0

In [None]:
# Display the frame to check the newly added `basement` column.
house

In [None]:
# Replace any remaining '?' placeholder strings with '0'.
# NOTE(review): sqft_basement was already coerced with pd.to_numeric above, so no '?'
# strings should remain and this line likely has no effect - confirm and consider removing.
house.sqft_basement = house.sqft_basement.replace('?','0')

In [None]:
# Cast the waterfront flag to int. Missing values are treated as "not waterfront" (0)
# first, because astype(int) raises on NaN; this matches the fill value (0.0) used
# for this column later in the notebook.
house.waterfront = house.waterfront.fillna(0).astype(int)

In [None]:
# Collapse basement square footage into a 0/1 indicator.
# The comparison is False for NaN, so unknown sizes become 0. Using an integer for
# both outcomes keeps the column numeric (a string '1' mixed with an int 0 would
# turn it into an object column that cannot be averaged or correlated).
house['sqft_basement'] = house['sqft_basement'].astype(float)
house['sqft_basement'] = (house['sqft_basement'] >= 1).astype(int)

This function grabs the zipcodes and returns the city associated with those zipcodes so that we can group them to make more sense of them.

In [None]:
# Number of rows per zipcode, most frequent first.
house['zipcode'].value_counts()

In [None]:
# Zipcodes grouped by the city / community label they are reported under.
_CITY_ZIPCODES = {
    'Auburn': (98002, 98092),
    'Baring': (98224,),
    'Bellevue': (98004, 98005, 98006, 98007, 98008),
    'Black Diamond': (98010,),
    'Bothell': (98011,),
    'Bryn Mawr-Skyway': (98178,),
    'Burien': (98148, 98166),
    'Carnation': (98014,),
    'Cottage Lake': (98077,),
    'Covington': (98042,),
    'Des Moines': (98198,),
    'Duvall': (98019,),
    'East Hill-Meridian': (98031,),
    'Enumclaw': (98022,),
    'Fairwood': (98058,),
    'Fall city': (98024,),
    'Federal Way': (98003, 98023),
    'Issaquah': (98027, 98029),
    'Kenmore': (98028,),
    'Kent': (98032, 98030),
    'Kirkland': (98033, 98034),
    'Lakeland North': (98001,),
    'Maple Valley': (98038,),
    'Medina': (98039,),
    'Mercer Island': (98040,),
    'North Bend': (98045,),
    'Pacific': (98047,),
    'Preston': (98050,),
    'Ravensdale': (98051,),
    'Redmond': (98052,),
    'Renton': (98055, 98056, 98057, 98059),
    'Sammamish': (98074, 98075),
    'SeaTac': (98188,),
    'Shoreline': (98155, 98177),
    'Skykomish': (98288,),
    'Snoqualmie': (98065,),
    'Tukwila': (98168,),
    'Union Hill-Novelty Hill': (98053,),
    'Univ Of Washington': (98195,),
    'Vashon': (98070,),
    'White Center': (98146,),
    'Woodinville': (98072,),
}

# Flattened zipcode -> city lookup built once from the grouping above.
_ZIPCODE_TO_CITY = {
    code: name
    for name, codes in _CITY_ZIPCODES.items()
    for code in codes
}


def getcity(zipcode):
    """Return the city label for ``zipcode``.

    Zipcodes listed in the lookup table map to their city name; every other
    value falls back to ``'Seattle'``.
    """
    try:
        return _ZIPCODE_TO_CITY.get(zipcode, 'Seattle')
    except TypeError:
        # Unhashable inputs can never equal a listed zipcode, so they take the default.
        return 'Seattle'
# Map every row's zipcode to its city label and store the result as a new column.
city = house.zipcode.apply(getcity)
house['city'] = city
# Display the frame to confirm the new `city` column.
house

In [None]:
# Number of rows per city label, most frequent first.
house['city'].value_counts()

In this step, we are turning 

In [None]:
# One-hot encode the city labels: one indicator column per distinct city.
dummy = pd.get_dummies(house['city'])
dummy

In [None]:
# Append the city indicator columns to the main frame (column-wise, aligned on the index).
house = pd.concat([house, dummy], axis=1)
house.head()

In [None]:
# The city indicator columns were already concatenated onto `house` in the cell
# above. Merging `dummy` a second time would duplicate every city column (pandas
# disambiguates them with `_x` / `_y` suffixes), so only attach indicator columns
# that are not present yet. This also makes the cell safe to re-run.
new_dummy_cols = dummy.columns.difference(house.columns)
house = house.join(dummy[new_dummy_cols])
house.head()

In [None]:
# Drop the columns that are no longer needed: zipcode and city are now represented
# by the one-hot city columns, and date is not used as a feature.
# `columns=` is used because the positional `axis` argument of DataFrame.drop is
# no longer accepted by current pandas.
house = house.drop(columns=['zipcode', 'city', 'date'])

In [None]:
# Correlation of every numeric/boolean column with every other, ordered by the
# correlation with price. Non-numeric columns are excluded explicitly because
# DataFrame.corr raises on them in current pandas.
numeric_house = house.select_dtypes(include=['number', 'bool'])
numeric_house.corr().sort_values('price')
#Since we are using price as our dependent (target) variable, I am looking to see what the highest correlated columns are

In [None]:
# Candidate predictor columns for the regression models.
# NOTE(review): the identical list is assigned again in the first Baseline Model cell.
house_features = [ 'sqft_living','grade','bedrooms', 'lat', 'waterfront','view','floors','long','yr_renovated', 
                  'condition', 'yr_built', 'sqft_lot', 'sqft_above','sqft_living15', 'sqft_lot15' ] 

# Baseline Model

In [None]:
# Predictor columns for the baseline model.
house_features = [ 'sqft_living','grade','bedrooms', 'lat', 'waterfront','view','floors','long','yr_renovated', 
                  'condition', 'yr_built', 'sqft_lot', 'sqft_above','sqft_living15', 'sqft_lot15' ]
# X holds the selected predictors, y the target (sale price).
X= house[house_features]
y= house.price

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for each selected feature.
X.describe()
#gives the specifics of each column that we are looking at

In [None]:
# Impute missing values.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify `house` itself.
for col in ('waterfront', 'view', 'yr_renovated'):
    house[col] = house[col].fillna(0.0)

# Fill sqft_basement with its mean. `.mean` has to be *called*; passing the bound
# method would use the method object itself as the fill value. The column is
# coerced to numeric first so the mean is well defined.
basement_numeric = pd.to_numeric(house['sqft_basement'], errors='coerce')
house['sqft_basement'] = basement_numeric.fillna(basement_numeric.mean())

# X and y were sliced from `house` before the imputation above, so rebuild them;
# otherwise the models below would still be fitted on data containing NaNs.
X = house[house_features]
y = house.price

print(house.isnull().sum())
#changing null values to the mean to see the difference in the correlation

In [None]:
#split train test and linear regression
reg= LinearRegression()
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)
# score() on a LinearRegression returns R^2 on the given data, not a classification accuracy.
reg.score(X_test, y_test)
#gives us an R^2 of about 0.68 on the test set

In [None]:
#bin zipcodes into towns (reduces dimensions)
#encode (one-hot encode)
#create booleans for basement

In [None]:
# Baseline model: linear regression on a twelve-column subset of the split data,
# evaluated with mean squared error on both the train and the test portion.
model_1_features = [
    'sqft_living', 'grade', 'bedrooms', 'lat', 'waterfront', 'view',
    'floors', 'long', 'yr_renovated', 'condition', 'yr_built', 'sqft_lot',
]
model_1_train = X_train[model_1_features]
model_1_test = X_test[model_1_features]

model1 = LinearRegression()
model1.fit(model_1_train, y_train)

# Predict first, then score each split against its true prices.
train_preds = model1.predict(model_1_train)
train_error = mean_squared_error(y_train, train_preds)

test_preds = model1.predict(model_1_test)
test_error = mean_squared_error(y_test, test_preds)

print('Train error:', train_error)
print('Test error:', test_error)

This is our baseline error which gives us a target to beat for our other models.

In [None]:
# Scatter of price (x-axis) against living-area square footage (y-axis).
plt.scatter(house.price, house.sqft_living)
plt.title('Square Feet')

In [None]:
# Scatter of price (x-axis) against latitude (y-axis).
plt.scatter(house.price, house.lat)
plt.title('Location and Price')

In [None]:
#shows us where the most houses are located in the dataset
# jointplot creates its own figure, so no separate plt.figure() call is needed.
# `height` is the current name of the figure-size argument (formerly `size`).
location_grid = sns.jointplot(x=house.lat.values, y=house.long.values, height=12)
# Label through the returned grid. Assigning to plt.xlabel / plt.ylabel would
# overwrite the pyplot functions instead of labelling the axes. Latitude is on
# the x-axis and longitude on the y-axis, matching the data passed above.
location_grid.set_axis_labels('Latitude', 'Longitude')
plt.show()

In [None]:
#seeing how many bedrooms most houses have since my mean code isn't working
# Label through the Axes returned by pandas. Assigning strings to plt.title /
# plt.xlabel / plt.ylabel would replace the pyplot functions (breaking later
# plt.title(...) calls) without labelling anything.
bedrooms_ax = house['bedrooms'].value_counts().plot(kind='bar')
bedrooms_ax.set_title('# of Bedrooms')
bedrooms_ax.set_xlabel('Bedrooms')
bedrooms_ax.set_ylabel('Count');

# Secondary Models

In this model I am getting rid of price outliers that are more than 2 standard deviations away from the mean to see how it affects the score of my model.

In [None]:
# Distribution of sale prices before outlier removal.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot are the replacements.
sns.distplot(house['price']);

In [None]:
# Keep only the rows whose price lies within 2 standard deviations of the mean.
# A single |z| < 2 filter is sufficient: an absolute value is never below zero,
# so an additional `|z| > -2` filter can never exclude anything.
price_z = stats.zscore(house['price'])
house = house[np.abs(price_z) < 2]
house

In [None]:
# Distribution of sale prices after the outlier filter above.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot are the replacements.
sns.distplot(house['price']);

In [None]:
# Rebuild X and y from the outlier-filtered frame, using the same predictor list as the baseline.
house_features = [ 'sqft_living','grade','bedrooms', 'lat', 'waterfront','view','floors','long','yr_renovated', 
                  'condition', 'yr_built', 'sqft_lot', 'sqft_above','sqft_living15', 'sqft_lot15' ]
X= house[house_features]
y= house.price

In [None]:
# Refit the linear regression on the filtered data and report R^2 on the held-out split.
reg= LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from yellowbrick.regressor import PredictionError


In [None]:
# Prediction-error plot for a Lasso model with default settings,
# drawn with yellowbrick's PredictionError visualizer.
model = Lasso()
visualizer = PredictionError(model)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()               

Getting rid of outliers increased our R² score by about .02, which is a good start. The next step below will be calculating the train and test error.

In [None]:
# Model 2: same twelve-column linear regression as the baseline, but fitted on
# the current (outlier-filtered) split; report train and test mean squared error.
model_2_features = [
    'sqft_living', 'grade', 'bedrooms', 'lat', 'waterfront', 'view',
    'floors', 'long', 'yr_renovated', 'condition', 'yr_built', 'sqft_lot',
]
model_2_train, model_2_test = X_train[model_2_features], X_test[model_2_features]

model2 = LinearRegression()
model2.fit(model_2_train, y_train)

train_preds = model2.predict(model_2_train)
test_preds = model2.predict(model_2_test)

train_error = mean_squared_error(y_train, train_preds)
test_error = mean_squared_error(y_test, test_preds)

print('Train error:', train_error)
print('Test error:', test_error)

The train and test errors are significantly lower and closer to each other than in the baseline model!

## Model 3

In [None]:
#based this model on the top 5 correlated columns in the df
house_corr = ['sqft_living15', 'sqft_living', 'sqft_above', 'yr_built', 'view']
X= house[house_corr]
y= house.price

In [None]:
# Fit on the five-column feature set and report R^2 on the held-out split.
reg= LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)
#Score of .56 was much worse than the baseline model

In [None]:
# Model 3: linear regression on the five selected columns; report train and
# test mean squared error.
model_3_features = ['sqft_living15', 'sqft_living', 'sqft_above', 'yr_built', 'view']
model_3_train = X_train[model_3_features]
model_3_test = X_test[model_3_features]

model3 = LinearRegression()
model3.fit(model_3_train, y_train)

train_preds = model3.predict(model_3_train)
test_preds = model3.predict(model_3_test)

train_error = mean_squared_error(y_train, train_preds)
test_error = mean_squared_error(y_test, test_preds)

for split_label, split_error in (('Train error:', train_error), ('Test error:', test_error)):
    print(split_label, split_error)

This model gave us an even lower R² than the baseline, but it still has a smaller error, which is a good quality.

# Model 4

In this model, I will test whether adding another column that tells us whether or not a house has a basement has an impact on our model's accuracy.

In [None]:
# Feature set for Model 4: eleven of the earlier predictors plus the boolean `basement` flag.
with_basement= ['sqft_living','grade','bedrooms', 'lat', 'waterfront','view','floors','long','yr_renovated', 
                  'condition', 'yr_built', 'basement']
X= house[with_basement]
y= house.price

In [None]:
# Fit on the basement-augmented feature set and report R^2 on the held-out split.
reg= LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# Model 4: linear regression including the `basement` flag; report train and
# test mean squared error.
model_4_features = [
    'sqft_living', 'grade', 'bedrooms', 'lat', 'waterfront', 'view',
    'floors', 'long', 'yr_renovated', 'condition', 'yr_built', 'basement',
]
model_4_train = X_train[model_4_features]
model_4_test = X_test[model_4_features]

model4 = LinearRegression()
model4.fit(model_4_train, y_train)

# Score each split against its true prices.
train_preds = model4.predict(model_4_train)
train_error = mean_squared_error(y_train, train_preds)
test_preds = model4.predict(model_4_test)
test_error = mean_squared_error(y_test, test_preds)

print('Train error:', train_error)
print('Test error:', test_error)

This model has a tad higher R2 than the previous model, but it is still lower than model 2. The train and test errors stayed constant as well.

In [None]:
# Scatter of a single predictor against price with its least-squares line.
# `X` is a multi-column feature frame and cannot be scattered directly, and the
# names `Y` / `regression_line` are not defined anywhere above, so the plot is
# built from explicit columns and a line fitted here.
# NOTE(review): sqft_living is used as the illustrative predictor - swap in
# another column if a different relationship was intended.
x_vals = house['sqft_living']
y_vals = house['price']
slope, intercept = np.polyfit(x_vals, y_vals, 1)
regression_line = slope * x_vals + intercept

plt.scatter(x_vals, y_vals, color='#003F72', label="Data points")
plt.plot(x_vals, regression_line, label= "Regression Line")
plt.legend()

## Model 5

In [None]:
# Add the natural log of price as a new target column.
house['log_price'] = np.log(house['price'])

In [None]:
# Feature set for Model 5, predicting log_price instead of price.
# NOTE(review): despite its name, this list does not include the 'basement' column - confirm whether that is intended.
log_with_basement= ['sqft_living','grade','bedrooms', 'lat', 'waterfront','view','floors','long','yr_renovated', 
                  'condition', 'yr_built']
X= house[log_with_basement]
y= house['log_price']

In [None]:
# Fit on the log-price target and report R^2 (measured on the log scale) for the held-out split.
reg= LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# Model 5: linear regression on the log-transformed price.
mlr_split = LinearRegression()

#fit (y_train holds log prices, so the model predicts on the log scale)
mlr_split.fit(X_train, y_train)

# calculate predictions on training and test sets (still on the log scale)
y_hat_train = mlr_split.predict(X_train)
y_hat_test = mlr_split.predict(X_test)

#compare MSE for the predicted training and test values
# Both the true values and the predictions are back-transformed with exp() so the
# error is measured in price units on both sides. Mixing scales (for example a
# price against a log prediction) produces a meaningless error.
train_mse = mean_squared_error(np.exp(y_train), np.exp(y_hat_train))
test_mse = mean_squared_error(np.exp(y_test), np.exp(y_hat_test))
print('Train Mean Squarred Error:', train_mse)
print('Test Mean Squarred Error:', test_mse)

# R^2 on the log scale for both splits.
print('Train score:', mlr_split.score(X_train, y_train))
print('Test score:', mlr_split.score(X_test, y_test))

## Cross Validation 

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Display the current feature frame that cross validation will run on.
X

In [None]:
from sklearn.model_selection import KFold 
# Repeated 5-fold cross validation (10 repeats). The seed is fixed so the folds,
# and therefore the scores, are reproducible between runs.
kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

# The target (log price) is continuous, so this is a regression problem: use
# LinearRegression scored with R^2. A classifier such as LogisticRegression with
# 'accuracy' scoring rejects continuous targets.
model = LinearRegression()

model.fit(X,y)

# Pass the repeated splitter as `cv` so all 5 x 10 folds are actually evaluated.
scores = cross_val_score(model, X, y, scoring='r2', cv=kf, n_jobs=-1)
scores
        

In [None]:
# Model 6: linear regression on every remaining column, predicting log price.
model6 = LinearRegression()
# Both price columns are removed from the predictors so the target does not leak into X.
X = house.drop(['price', 'log_price'], axis = 1)
y = house['log_price']

# Create a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1000)
# Fit and score the model created in this cell (model6), not an earlier one:
# refitting model3 here would silently overwrite Model 3 and leave model6 unfitted.
model6.fit(X_train, y_train)

model6.score(X_test, y_test)