In [1]:
import pandas as pd
import numpy as np

### Upload data and Drop NaN

In [2]:
X = pd.read_csv('housing.csv')
# NaN value exist! -> X_train.isnull().sum()
X.dropna(inplace=True) 
X.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


### Split predictors and responses (predictors into train/test)

In [3]:
Y = X.pop("median_house_value")
Y /= 10000
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5, shuffle=True)

### Encode categorical variable into integers ("ocean_proximity")

In [5]:
category = list(set(X["ocean_proximity"]))
category_idx = range(len(category))
X_train = X_train.replace(category, category_idx)
X_test = X_test.replace(category, category_idx)
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
2998,-119.02,35.32,14.0,2927.0,588.0,1821.0,561.0,3.3529,2
7892,-118.07,33.88,18.0,2436.0,375.0,1303.0,386.0,6.1968,0
18704,-122.38,40.56,23.0,2281.0,408.0,1164.0,420.0,3.5347,2
10706,-117.72,33.61,26.0,2653.0,621.0,774.0,584.0,2.4900,0
5593,-118.25,33.80,36.0,1697.0,394.0,1274.0,396.0,3.3500,0
...,...,...,...,...,...,...,...,...,...
1038,-120.93,38.50,15.0,1248.0,234.0,529.0,216.0,3.3393,2
5577,-118.30,33.84,37.0,1241.0,226.0,621.0,255.0,4.9196,0
3070,-119.25,35.79,8.0,3271.0,797.0,2700.0,688.0,1.7418,2
18817,-120.48,39.66,32.0,1516.0,289.0,304.0,131.0,1.8839,2


### Unregularized Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
LinReg = LinearRegression()
LinReg.fit(X_train, Y_train)
LinReg.intercept_, LinReg.coef_

(-349.7089836643659,
 array([-4.10540151e+00, -3.92033284e+00,  1.22286393e-01, -7.63915890e-04,
         1.12995074e-02, -3.80646960e-03,  4.61740290e-03,  3.98313341e+00,
        -6.15890157e-01]))

In [7]:
Rsquared = LinReg.score(X_train, Y_train)
train_RSS = sum((Y_train - LinReg.predict(X_train))**2) / len(Y_train)
test_RSS = sum((Y_test - LinReg.predict(X_test))**2) / len(Y_test)
print("Unregularized Linear Regression")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared, train_RSS, test_RSS))

Unregularized Linear Regression
R^2: 0.6407817584939959, train RSS: 47.66238747100593, test RSS: 49.70798372832744


### Ridge Regression

In [8]:
from sklearn.linear_model import Ridge
LinRegRidge = Ridge(alpha=1)
LinRegRidge.fit(X_train, Y_train)
LinRegRidge.intercept_, LinRegRidge.coef_

(-349.62629050557337,
 array([-4.10441533e+00, -3.91935832e+00,  1.22303321e-01, -7.63965147e-04,
         1.12982728e-02, -3.80653133e-03,  4.61939758e-03,  3.98316881e+00,
        -6.16107242e-01]))

In [9]:
Rsquared_ridge = LinRegRidge.score(X_train, Y_train)
train_RSS_ridge = sum((Y_train - LinRegRidge.predict(X_train))**2) / len(Y_train)
test_RSS_ridge = sum((Y_test - LinRegRidge.predict(X_test))**2) / len(Y_test)
print("Ridge Regression")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared_ridge, train_RSS_ridge, test_RSS_ridge))

Ridge Regression
R^2: 0.6407817549942958, train RSS: 47.662387935358794, test RSS: 49.70802880419137


### Lasso Regression

In [10]:
from sklearn.linear_model import Lasso
LinRegLasso = Lasso(alpha=1)
LinRegLasso.fit(X_train, Y_train)
LinRegLasso.intercept_, LinRegLasso.coef_

(-33.31742607511385,
 array([-4.31786707e-01, -5.54262631e-01,  1.75055880e-01, -1.10940056e-03,
         6.28623989e-03, -3.89239802e-03,  1.28596113e-02,  4.17862542e+00,
        -3.79628511e-01]))

In [11]:
Rsquared_lasso = LinRegLasso.score(X_train, Y_train)
train_RSS_lasso = sum((Y_train - LinRegLasso.predict(X_train))**2) / len(Y_train)
test_RSS_lasso = sum((Y_test - LinRegLasso.predict(X_test))**2) / len(Y_test)
print("Lasso Regression")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared_lasso, train_RSS_lasso, test_RSS_lasso))

Lasso Regression
R^2: 0.5878436270087826, train RSS: 54.68641198674636, test RSS: 57.15031165652484


### Elastic Net

In [12]:
from sklearn.linear_model import ElasticNet
LinRegElastic = ElasticNet(alpha=1, l1_ratio=0.5)
LinRegElastic.fit(X_train, Y_train)
LinRegElastic.intercept_, LinRegElastic.coef_

(-71.71394060955004,
 array([-8.99810402e-01, -9.61469476e-01,  1.68146280e-01, -1.66214402e-04,
         1.99819041e-03, -4.29145546e-03,  1.35318820e-02,  3.53718909e+00,
        -6.69133635e-01]))

In [13]:
Rsquared_elastic = LinRegElastic.score(X_train, Y_train)
train_RSS_elastic = sum((Y_train - LinRegElastic.predict(X_train))**2) / len(Y_train)
test_RSS_elastic = sum((Y_test - LinRegElastic.predict(X_test))**2) / len(Y_test)
print("Elastic Net")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared_elastic, train_RSS_elastic, test_RSS_elastic))

Elastic Net
R^2: 0.5888725411168073, train RSS: 54.549891907238234, test RSS: 56.97270256484672
