In [96]:
#Importing some necessary packages

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

### Define functions

In [115]:
# A function that allows us to drop rows based on values we think are suspect

def drop_rows(df, colname, val):
    return df[df[colname] != val]

In [116]:
# A function to take columns with years (yr_built, yr_renovated) and bin them by decade

from sklearn.preprocessing import LabelEncoder

def bin_by_decade(df, colname):
    bins = pd.IntervalIndex.from_tuples([(1900, 1909), (1910, 1919), (1920, 1929), (1930, 1939), (1940, 1949), (1950, 1959), (1960, 1969), (1970, 1979), (1980, 1989), (1990, 1999), (2000, 2010), (2010, 2020)])
    series = pd.cut(df[colname], bins)
    df[colname] = series
    return df

In [128]:
# Contains all of our data cleaning operations

def clean():
    
# Load the data
    
    house = pd.read_csv('https://raw.githubusercontent.com/learn-co-students/dsc-v2-mod1-final-project-dc-ds-career-042219/master/kc_house_data.csv')
    
# Get rid of rows
    # The row with 33 bedrooms is a suspicious outlier, so we decided to drop it
    # The 454 ?s in sqft_basement needed to be remedied. We chose to drop them--454 seemed like too many to impute. 

    clean_house = house[house['bedrooms'] != 33]
    clean_house = clean_house[clean_house['sqft_basement'] != "?"]
    
# Bin decades
    
    clean_house = bin_by_decade(clean_house, 'yr_built')
    clean_house = bin_by_decade(clean_house, 'yr_renovated')
    
    
# Get dummies
    # Note: Originally, we dropped waterfront because it had a lot of null values. Get dummies gives you the option to create a dummy for nulls or not.
    # We did that, without the null column
    clean_house = pd.get_dummies(clean_house, columns=(['waterfront', 'view', 'floors', 'bedrooms', 'condition', 'zipcode', 'yr_built', 'yr_renovated']))
    
# Drop id, date, waterfront

    clean_house = clean_house.drop(['id', 'date'], axis=1)
    
# Convert sqft_basement from object to float

    clean_house['sqft_basement'] = clean_house['sqft_basement'].astype(float, inplace=True)

# Return clean_house
    
    return clean_house

### Get clean dataframe

In [129]:
clean_house = clean()


In [135]:
clean_house.head()

Unnamed: 0,price,bathrooms,sqft_living,sqft_lot,grade,sqft_above,sqft_basement,lat,long,sqft_living15,...,"yr_renovated_(1920, 1929]","yr_renovated_(1930, 1939]","yr_renovated_(1940, 1949]","yr_renovated_(1950, 1959]","yr_renovated_(1960, 1969]","yr_renovated_(1970, 1979]","yr_renovated_(1980, 1989]","yr_renovated_(1990, 1999]","yr_renovated_(2000, 2010]","yr_renovated_(2010, 2020]"
0,221900.0,1.0,1180,5650,7,1180,0.0,47.5112,-122.257,1340,...,0,0,0,0,0,0,0,0,0,0
1,538000.0,2.25,2570,7242,7,2170,400.0,47.721,-122.319,1690,...,0,0,0,0,0,0,0,1,0,0
2,180000.0,1.0,770,10000,6,770,0.0,47.7379,-122.233,2720,...,0,0,0,0,0,0,0,0,0,0
3,604000.0,3.0,1960,5000,7,1050,910.0,47.5208,-122.393,1360,...,0,0,0,0,0,0,0,0,0,0
4,510000.0,2.0,1680,8080,8,1680,0.0,47.6168,-122.045,1800,...,0,0,0,0,0,0,0,0,0,0


In [13]:
clean_house.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21143 entries, 0 to 21596
Columns: 135 entries, price to yr_renovated_(2010, 2020]
dtypes: float64(5), int64(6), uint8(124)
memory usage: 4.4 MB


### Split the data into training and test sets

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Split data into train and test sets

X = clean_house.drop(["price"], axis=1) #predictors
y = clean_house["price"] #target

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=3)
print(len(X_test), len(X_train), len(y_test), len(y_train))

4229 16914 4229 16914


### Regression 1: Manual scaling and feature selection

In [19]:
# copied function from Model Fit Linear Regression Lab

def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [82]:
target = clean_house["price"]
predictors = clean_house.drop(["price"], axis=1)

result = stepwise_selection(predictors, target, verbose = True)
print('resulting features:')
print(result)

  return ptp(axis=axis, out=out, **kwargs)
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Add  waterfront_1.0                 with p-value 0.0
Add  bathrooms                      with p-value 0.0
Add  sqft_living                    with p-value 0.0
Drop bathrooms                      with p-value 0.611508


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


Add  zipcode_98004                  with p-value 0.0
Add  lat                            with p-value 0.0
Add  grade                          with p-value 0.0
Add  zipcode_98039                  with p-value 6.2004e-297
Add  view_0.0                       with p-value 3.39298e-249
Add  zipcode_98112                  with p-value 1.18467e-241
Add  zipcode_98040                  with p-value 1.67766e-195
Add  zipcode_98105                  with p-value 5.57944e-84
Add  condition_3                    with p-value 1.02155e-79
Add  zipcode_98119                  with p-value 4.35741e-80
Add  view_4.0                       with p-value 8.65977e-76
Add  zipcode_98199                  with p-value 2.17883e-61
Add  zipcode_98109                  with p-value 1.53737e-61
Add  zipcode_98102                  with p-value 1.05391e-62
Add  zipcode_98033                  with p-value 2.59056e-56
Add  zipcode_98103                  with p-value 8.23958e-56
Add  zipcode_98115                  with p-va

In [28]:
import statsmodels.api as sm
from sklearn import preprocessing

# Used Scikitlearn's Standard Scaler
    # Takes each data point, subtracts the mean of the sample, then divides by the standard deviation of the sample

ss_scaler = preprocessing.StandardScaler()
clean_house_ss = ss_scaler.fit_transform(clean_house)

# Standard scaler returns a numpy array, so we converted it back to a DF

clean_house_ss = pd.DataFrame(clean_house_ss, columns=list(clean_house.columns))

# We generated a list of features to use using a stepwise selection function from Flatiron's learn.co lessons
    # Finds the best p values from your features
    # Note: The resulting list contains 79 of the original 135 features

clean_house_selections_list = ['price', 'waterfront_1.0', 'sqft_living', 'zipcode_98004', 'lat', 'grade', 'zipcode_98039', 'view_0.0', 'zipcode_98112', 'zipcode_98040', 'zipcode_98105', 'condition_3', 'zipcode_98119', 'view_4.0', 'zipcode_98199', 'zipcode_98109', 'zipcode_98102', 'zipcode_98033', 'zipcode_98103', 'zipcode_98115', 'zipcode_98122', 'zipcode_98117', 'zipcode_98006', 'zipcode_98116', 'zipcode_98107', 'zipcode_98144', 'sqft_above', 'sqft_basement', 'floors_2.0', 'zipcode_98136', 'floors_3.0', 'zipcode_98005', 'yr_renovated_(2000, 2010]', 'zipcode_98126', 'bedrooms_4', 'zipcode_98029', 'zipcode_98118', 'zipcode_98008', 'view_3.0', 'condition_5', 'zipcode_98027', 'zipcode_98052', 'bedrooms_2', 'zipcode_98007', 'bedrooms_5', 'zipcode_98019', 'zipcode_98028', 'bedrooms_1', 'bedrooms_3', 'bathrooms', 'zipcode_98106', 'yr_built_(1930, 1939]', 'yr_built_(1940, 1949]', 'zipcode_98075', 'zipcode_98011', 'zipcode_98155', 'zipcode_98077', 'sqft_lot', 'yr_built_(1920, 1929]', 'yr_built_(2010, 2020]', 'yr_built_(1910, 1919]', 'floors_1.5', 'zipcode_98072', 'zipcode_98014', 'zipcode_98133', 'zipcode_98022', 'zipcode_98010', 'yr_built_(1970, 1979]', 'zipcode_98178', 'bedrooms_6', 'yr_built_(1900, 1909]', 'zipcode_98002', 'sqft_living15', 'yr_renovated_(2010, 2020]', 'bedrooms_8', 'zipcode_98058', 'zipcode_98070', 'zipcode_98065', 'yr_built_(1960, 1969]']
print("The number of features used in the model is:", len(clean_house_selections_list))

# Made a new DF with just the features from our selections list
clean_house_selections = clean_house_ss[clean_house_selections_list]

# # Used statsmodels OLS so we could get a handy summary of the regression results
target = clean_house_selections["price"]
predictors = clean_house_selections.drop(["price"], axis=1)

predictors_int = sm.add_constant(predictors)
model = sm.OLS(target, predictors_int).fit()
model.summary()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


The number of features used in the model is: 79


0,1,2,3
Dep. Variable:,price,R-squared:,0.812
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,1180.0
Date:,"Wed, 08 May 2019",Prob (F-statistic):,0.0
Time:,19:48:12,Log-Likelihood:,-12341.0
No. Observations:,21143,AIC:,24840.0
Df Residuals:,21065,BIC:,25460.0
Df Model:,77,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.172e-14,0.003,-3.92e-12,1.000,-0.006,0.006
waterfront_1.0,0.1332,0.004,36.163,0.000,0.126,0.140
sqft_living,0.2455,0.004,69.193,0.000,0.239,0.252
zipcode_98004,0.2056,0.003,65.660,0.000,0.200,0.212
lat,0.2086,0.005,44.311,0.000,0.199,0.218
grade,0.1814,0.006,31.525,0.000,0.170,0.193
zipcode_98039,0.1497,0.003,49.436,0.000,0.144,0.156
view_0.0,-0.0606,0.004,-15.437,0.000,-0.068,-0.053
zipcode_98112,0.1396,0.003,43.791,0.000,0.133,0.146

0,1,2,3
Omnibus:,20376.422,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4363973.19
Skew:,4.146,Prob(JB):,0.0
Kurtosis:,72.892,Cond. No.,4580000000000000.0


### Model Validation

In [177]:
# We needed a special train, test, split for our altered data set
target = clean_house_selections["price"]
predictors = clean_house_selections.drop(["price"], axis=1)

X1_train,X1_test,y1_train,y1_test=train_test_split(predictors,target,test_size=0.2,random_state=3)

from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X1_train, y1_train)

y_hat_train = linreg.predict(X1_train)
y_hat_test = linreg.predict(X1_test)

# predictors_int = sm.add_constant(X1_train)
# model = sm.OLS(y1_train, predictors_int).fit()
# model.summary()

# model.predict(X1_train)


In [178]:
from sklearn.metrics import r2_score

print(r2_score(y1_train, y_hat_train))
print(r2_score(y1_test, y_hat_test))

0.8125209264458079
0.8059988504539424


In [181]:
linreg.coef_

array([ 1.27643174e-01,  1.52081982e+11,  2.18822969e-01,  2.10873805e-01,
        1.78818319e-01,  1.55415007e-01, -6.01122408e-02,  1.36977481e-01,
        1.26140626e-01,  7.67521858e-02, -3.35788727e-02,  8.56208801e-02,
        8.44116211e-02,  7.19585419e-02,  6.06153011e-02,  7.02047348e-02,
        6.78505898e-02,  6.88409805e-02,  6.01840019e-02,  6.15675449e-02,
        5.10954857e-02,  5.94900846e-02,  5.25407791e-02,  4.93001938e-02,
        5.09954095e-02, -1.37158058e+11, -7.32635160e+10, -8.89739990e-02,
        3.81517410e-02, -5.09977341e-02,  3.74053717e-02,  4.04899120e-02,
        2.84709930e-02,  2.22304821e-01,  3.08794975e-02,  2.63133049e-02,
        3.00521851e-02,  3.66535187e-02,  2.93531418e-02,  2.70595551e-02,
        1.96321011e-02,  2.10474014e-01,  2.41613388e-02,  1.11457825e-01,
       -3.26247215e-02, -3.38352919e-02,  6.71615601e-02,  2.84205437e-01,
        4.32243347e-02,  1.27162933e-02,  2.90718079e-02,  2.73823738e-02,
        1.17168427e-02, -

In [182]:
 # convert standardized coefficients to unstandardized

# for ind, val in enumerate(model.params):
#     print(predictors.loc[ind], val * (target.std()/predictors.loc[ind].std()))

for ind, val in enumerate(linreg.coef_):
        print(list(predictors.columns)[ind], (val * ((target.std())/(predictors.iloc[:,ind].std()))))

waterfront_1.0 0.12764317425871036
sqft_living 152081982001.27536
zipcode_98004 0.21882296901494322
lat 0.21087380468786113
grade 0.17881831851983734
zipcode_98039 0.15541500741277253
view_0.0 -0.06011224076703965
zipcode_98112 0.13697748079047065
zipcode_98040 0.12614062628579278
zipcode_98105 0.07675218582154494
condition_3 -0.033578872680663875
zipcode_98119 0.08562088012696674
view_4.0 0.08441162109373475
zipcode_98199 0.0719585418701187
zipcode_98109 0.060615301132197465
zipcode_98102 0.07020473480225091
zipcode_98033 0.06785058975218555
zipcode_98103 0.06884098052979802
zipcode_98115 0.06018400192259406
zipcode_98122 0.06156754493713499
zipcode_98117 0.051095485687257275
zipcode_98006 0.05949008464812183
zipcode_98116 0.052540779113771564
zipcode_98107 0.04930019378661292
zipcode_98144 0.05099540948867346
sqft_above -137158057707.55954
sqft_basement -73263516041.15254
floors_2.0 -0.0889739990234375
zipcode_98136 0.03815174102783735
floors_3.0 -0.050997734069823164
zipcode_98005 0

In [183]:
# Check the mean squared error

from math import sqrt

mse_train = sqrt(np.sum((y1_train-y_hat_train)**2)/len(y_train))
mse_test = sqrt(np.sum((y1_test-y_hat_test)**2)/len(y_test))
print('Train Root Mean Squared Error:', mse_train)
print('Test Root Mean Squared Error:', mse_test)

Train Root Mean Squared Error: 0.43609773337921043
Test Root Mean Squared Error: 0.42756431725017735


In [184]:
# Mean absolute error

print(mean_absolute_error(y1_train, y_hat_train))
print(mean_absolute_error(y1_test, y_hat_test))

0.26022572635218943
0.2599615881998187


In [185]:
# Cross validation scores
    # Took the mean cv score because cross_val_score returns a score for each fold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

cv_5_results = np.mean(cross_val_score(linreg, X, y, cv=5, scoring="neg_mean_squared_error"))
cv_10_results = np.mean(cross_val_score(linreg, X, y, cv=10, scoring="neg_mean_squared_error"))
cv_20_results = np.mean(cross_val_score(linreg, X, y, cv=20, scoring="neg_mean_squared_error"))

In [186]:
print(cv_5_results)
print(cv_10_results)
print(cv_20_results)

-36000388037.17259
-3460842866955.49
-126057102063.8781


### Regression 2: Lasso

In [187]:
from sklearn.linear_model import Lasso

In [188]:
# Lasso Regression
    # Lasso helps prevent overfitting by reducing the influence of some of the predictors by penalizing their coefficients or reducing them to zero (effectively doing predictor selection)

lasso = Lasso()
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [189]:
# Checking the training/test scores (R2) of our model 

train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)

In [190]:
print("training score:", train_score)
print("test score: ", test_score)

training score: 0.8136391869874172
test score:  0.8069445302396594


In [191]:
# Checking to see how many features were used and how many were axed

coeff_used = np.sum(lasso.coef_!=0)
print("number of features used:", coeff_used)
print("number eliminated:", len(list(clean_house.columns)) - coeff_used)

number of features used: 129
number eliminated: 5


In [192]:
# Mean Squared Error

y_hat_train = lasso.predict(X_train)
y_hat_test = lasso.predict(X_test)

mse_train = sqrt(np.sum((y_train-y_hat_train)**2)/len(y_train))
mse_test = sqrt(np.sum((y_test-y_hat_test)**2)/len(y_test))
print('Train Root Mean Squarred Error:', mse_train)
print('Test Root Mean Squarred Error:', mse_test)

Train Root Mean Squarred Error: 160033.48186579844
Test Root Mean Squarred Error: 156988.00967939573


In [193]:
# Mean absolute error

mae = mean_absolute_error(y_hat_test, y1_test)
mae

544216.838030465

In [194]:
# Cross Validation

cv_5_results = np.mean(cross_val_score(lasso, X, y, cv=5, scoring="neg_mean_squared_error"))
cv_10_results = np.mean(cross_val_score(lasso, X, y, cv=10, scoring="neg_mean_squared_error"))
cv_20_results = np.mean(cross_val_score(lasso, X, y, cv=20, scoring="neg_mean_squared_error"))

In [195]:
print(cv_5_results)
print(cv_10_results)
print(cv_20_results)

-26203816865.915962
-26122960477.60771
-25962216612.195366


In [196]:

for ind, val in enumerate(lasso.coef_):
    print(list(X.columns)[ind], val) 

bathrooms 20576.64470432306
sqft_living 246.57896528317033
sqft_lot 0.2156348677902889
grade 55919.18177448795
sqft_above -37.72427184322255
sqft_basement -114.22604044666072
lat 263478.5566217207
long -176399.08838326155
sqft_living15 13.30150515347995
sqft_lot15 -0.10995235248684124
waterfront_0.0 4534.475296178721
waterfront_1.0 577265.9298961193
view_0.0 -71321.0657851416
view_1.0 4278.430582188568
view_2.0 1607.4039619812659
view_3.0 91224.83783165965
view_4.0 257733.14195529322
floors_1.0 57395.25296130182
floors_1.5 27577.812217750452
floors_2.0 -10028.332133930036
floors_2.5 64776.84981088967
floors_3.0 -53129.69455685694
floors_3.5 76132.12561648565
bedrooms_1 87941.96079600413
bedrooms_2 58125.940332100814
bedrooms_3 34453.317311733925
bedrooms_4 3.1833108165377886
bedrooms_5 -19123.461246494746
bedrooms_6 -13389.822779437198
bedrooms_7 -164557.16333545296
bedrooms_8 25699.73598016449
bedrooms_9 -233116.6536711736
bedrooms_10 -311272.1848758644
bedrooms_11 -88564.28594945547


### Regression 3: Ridge

In [136]:
from sklearn.linear_model import Ridge

In [137]:
# An important difference between Lasso and Ridge is that Ridge doesn't reduce any features' coefficient to 0 (though they can get very, very small)

ridge = Ridge()
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [138]:
# Getting the scores

rtrain_score=ridge.score(X_train,y_train)
rtest_score=ridge.score(X_test,y_test)

In [139]:
print("training score:", rtrain_score)
print("test score: ", rtest_score)

training score: 0.8136008450949628
test score:  0.80715057788118


In [140]:
# Mean Squared Error

y_hat_train = ridge.predict(X_train)
y_hat_test = ridge.predict(X_test)

mse_train = sqrt(np.sum((y_train-y_hat_train)**2)/len(y_train))
mse_test = sqrt(np.sum((y_test-y_hat_test)**2)/len(y_test))
print('Train Root Mean Squarred Error:', mse_train)
print('Test Root Mean Squarred Error:', mse_test)

Train Root Mean Squarred Error: 160049.94367147383
Test Root Mean Squarred Error: 156904.21085006962


In [141]:
# Mean absolute error

mse = mean_absolute_error(y_hat_test, y1_test)
mse

544201.3174486278

In [142]:
# Cross Validation

cv_5_results = np.mean(cross_val_score(ridge, X, y, cv=5, scoring="neg_mean_squared_error"))
cv_10_results = np.mean(cross_val_score(ridge, X, y, cv=10, scoring="neg_mean_squared_error"))
cv_20_results = np.mean(cross_val_score(ridge, X, y, cv=20, scoring="neg_mean_squared_error"))

In [143]:
print(cv_5_results)
print(cv_10_results)
print(cv_20_results)

-26192705525.218136
-26113909169.27703
-25955490873.20748


In [146]:
for ind, val in enumerate(ridge.coef_):
    print(list(X.columns)[ind], val)

bathrooms 20645.240308053115
sqft_living 113.68119809913051
sqft_lot 0.21383672047948665
grade 56346.1496849234
sqft_above 94.92928994000084
sqft_basement 18.75585356002015
lat 306546.063938398
long -172995.89865972966
sqft_living15 13.603555478754759
sqft_lot15 -0.11356059052918564
waterfront_0.0 4217.688799187426
waterfront_1.0 570823.9778817579
view_0.0 -75874.81332346423
view_1.0 149.33368284135608
view_2.0 -3176.5374329974084
view_3.0 86304.63239931272
view_4.0 254120.45007308022
floors_1.0 30474.051185736116
floors_1.5 809.938299763196
floors_2.0 -36722.366225963524
floors_2.5 37968.67828656634
floors_3.0 -78914.53447813322
floors_3.5 46384.232933097635
bedrooms_1 128861.52592821856
bedrooms_2 99603.53763032352
bedrooms_3 75807.9078156232
bedrooms_4 41275.96119859125
bedrooms_5 22406.452422001872
bedrooms_6 27876.82213166577
bedrooms_7 -119032.25544788536
bedrooms_8 64181.40781197446
bedrooms_9 -147008.53785474095
bedrooms_10 -205073.66834276923
bedrooms_11 -32729.101583405973
be

### Try Gridsearch for best alpha

In [176]:
from sklearn.model_selection import GridSearchCV

In [177]:
param = {'alpha': [1, 5, 10, 20, 25, 30, 35]}

In [178]:
lasso_regressor = GridSearchCV(lasso, param, scoring='neg_mean_squared_error', cv=10)
lasso_regressor.fit(X, y)

print('Best parameters:', lasso_regressor.best_params_)
print('Best score:', lasso_regressor.best_score_)

Best parameters: {'alpha': 20}
Best score: -26113231427.35375


In [147]:
lasso = Lasso(alpha = 20)
lasso.fit(X_train, y_train)

Lasso(alpha=20, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [180]:
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)

In [181]:
print("training score:", train_score)
print("test score: ", test_score)

training score: 0.8134989690934248
test score:  0.8068284942439361


In [182]:
coeff_used = np.sum(lasso.coef_!=0)
print("number of features used:", coeff_used)
print("number eliminated:", len(list(clean_house.columns)) - coeff_used)

number of features used: 123
number eliminated: 12


In [183]:
# Mean Squared Error
y_hat_train = lasso.predict(X_train)
y_hat_test = lasso.predict(X_test)

mse_train = sqrt(np.sum((y_train-y_hat_train)**2)/len(y_train))
mse_test = sqrt(np.sum((y_test-y_hat_test)**2)/len(y_test))
print('Train Mean Squared Error:', mse_train)
print('Test Mean Squared Error:', mse_test)

Train Mean Squared Error: 160093.67514929673
Test Mean Squared Error: 157035.18141621255


In [184]:
# Mean absolute error

mse = mean_absolute_error(y_hat_test, y1_test)
mse

544243.4361244787

In [185]:
# Cross Validation

cv_5_results = np.mean(cross_val_score(lasso, X, y, cv=5, scoring="neg_mean_squared_error"))
cv_10_results = np.mean(cross_val_score(lasso, X, y, cv=10, scoring="neg_mean_squared_error"))
cv_20_results = np.mean(cross_val_score(lasso, X, y, cv=20, scoring="neg_mean_squared_error"))

In [186]:
print(cv_5_results)
print(cv_10_results)
print(cv_20_results)

-26199746025.946964
-26113016000.716778
-25954632087.912758


In [149]:
for ind, val in enumerate(lasso.coef_):
    print(list(X.columns)[ind], val)

bathrooms 20242.194637827975
sqft_living 246.6121881419235
sqft_lot 0.21367732893976263
grade 56694.01943090996
sqft_above -38.62395400036627
sqft_basement -114.07280941727669
lat 381454.3846168642
long -111741.02845579524
sqft_living15 13.629440486468004
sqft_lot15 -0.11652887386181787
waterfront_0.0 4089.152119933989
waterfront_1.0 574778.2495667314
view_0.0 -75116.37738892905
view_1.0 0.0
view_2.0 -1694.4992519664966
view_3.0 86964.61041146726
view_4.0 253440.95917508088
floors_1.0 28844.402082017292
floors_1.5 -0.4713290888180398
floors_2.0 -37829.94429013399
floors_2.5 34389.731346296794
floors_3.0 -78987.03692394609
floors_3.5 1555.894537956023
bedrooms_1 94137.77520992466
bedrooms_2 66671.98290862046
bedrooms_3 42953.901774508464
bedrooms_4 8576.444687742327
bedrooms_5 -9673.319921212624
bedrooms_6 -2153.456766509594
bedrooms_7 -141351.11889161155
bedrooms_8 12652.32817399831
bedrooms_9 -111010.28940388949
bedrooms_10 -192525.50533957462
bedrooms_11 -0.0
bedrooms_33 0.0
conditio