### Upload data and Drop NaN

In [1]:
import pandas as pd
X = pd.read_csv('housing.csv')
# NaN values exist! -> X_train.isnull().sum()
X.dropna(inplace=True) 
X.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


### Split predictors and responses (predictors into train/test)

In [2]:
Y = X.pop("median_house_value")
Y /= 10000
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=5, shuffle=True)

### Pop out categorical variable

In [3]:
categorical_var_train = X_train.pop("ocean_proximity")
categorical_var_test = X_test.pop("ocean_proximity")

### Normalize "continuous" features

In [4]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)

X_train.loc[:,:] = scaler.transform(X_train)
X_test.loc[:,:] = scaler.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,:] = scaler.transform(X_train)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[:,:] = scaler.transform(X_

### Encode categorical variable into one-hot vector ("ocean_proximity")

In [5]:
list(set(categorical_var_train))

['NEAR OCEAN', 'NEAR BAY', 'ISLAND', '<1H OCEAN', 'INLAND']

In [6]:
category = list(set(categorical_var_train))
for ocean_proximity in category:
    train_new_column = (categorical_var_train == ocean_proximity).map(int)
    test_new_column = (categorical_var_test == ocean_proximity).map(int)
    X_train.loc[:,ocean_proximity] = train_new_column
    X_test.loc[:,ocean_proximity] = test_new_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


### Unregularized Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression
LinReg = LinearRegression()
LinReg.fit(X_train, Y_train)
LinReg.intercept_, LinReg.coef_

(24.207397655791397,
 array([-5.28695042, -5.34710353,  1.3651657 , -1.41598326,  4.22797226,
        -4.19832371,  1.8809481 ,  7.4979804 , -1.88295899, -2.66511338,
        13.08346369, -2.2750731 , -6.26031821]))

In [8]:
Rsquared = LinReg.score(X_train, Y_train)
train_RSS = sum((Y_train - LinReg.predict(X_train))**2) / len(Y_train)
test_RSS = sum((Y_test - LinReg.predict(X_test))**2) / len(Y_test)
print("Unregularized Linear Regression")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared, train_RSS, test_RSS))

Unregularized Linear Regression
R^2: 0.6485863280507143, train RSS: 46.8280413412322, test RSS: 49.65728944101335


### Ridge Regression

In [9]:
from sklearn.linear_model import Ridge
LinRegRidge = Ridge(alpha=1)
LinRegRidge.fit(X_train, Y_train)
LinRegRidge.intercept_, LinRegRidge.coef_

(23.757154559721414,
 array([-5.28332151, -5.34433538,  1.36579776, -1.41248693,  4.22560594,
        -4.1975706 ,  1.87909515,  7.49667514, -1.43116886, -2.21180336,
        11.27787916, -1.8242206 , -5.81068633]))

In [10]:
Rsquared_ridge = LinRegRidge.score(X_train, Y_train)
train_RSS_ridge = sum((Y_train - LinRegRidge.predict(X_train))**2) / len(Y_train)
test_RSS_ridge = sum((Y_test - LinRegRidge.predict(X_test))**2) / len(Y_test)
print("Ridge Regression")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared_ridge, train_RSS_ridge, test_RSS_ridge))

Ridge Regression
R^2: 0.6485759169011653, train RSS: 46.829428691185804, test RSS: 49.65678878320347


### Lasso Regression

In [11]:
from sklearn.linear_model import Lasso
LinRegLasso = Lasso(alpha=1)
LinRegLasso.fit(X_train, Y_train)
LinRegLasso.intercept_, LinRegLasso.coef_

(21.870092698978507,
 array([-0.        , -0.        ,  0.59786902,  0.        ,  0.        ,
        -0.        ,  0.        ,  6.6193785 ,  0.        ,  0.        ,
         0.        ,  0.        , -3.7567174 ]))

In [12]:
Rsquared_lasso = LinRegLasso.score(X_train, Y_train)
train_RSS_lasso = sum((Y_train - LinRegLasso.predict(X_train))**2) / len(Y_train)
test_RSS_lasso = sum((Y_test - LinRegLasso.predict(X_test))**2) / len(Y_test)
print("Lasso Regression")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared_lasso, train_RSS_lasso, test_RSS_lasso))

Lasso Regression
R^2: 0.5574057614876473, train RSS: 58.97841476537289, test RSS: 62.40550841153678


### Elastic Net

In [13]:
from sklearn.linear_model import ElasticNet
LinRegElastic = ElasticNet(alpha=1, l1_ratio=0.5)
LinRegElastic.fit(X_train, Y_train)
LinRegElastic.intercept_, LinRegElastic.coef_

(21.150388936137105,
 array([-0.23738846, -0.40798104,  0.78042065,  0.16872808,  0.05856723,
        -0.        ,  0.10440154,  4.83145813,  0.        ,  0.        ,
         0.        ,  0.31549791, -1.93729771]))

In [14]:
Rsquared_elastic = LinRegElastic.score(X_train, Y_train)
train_RSS_elastic = sum((Y_train - LinRegElastic.predict(X_train))**2) / len(Y_train)
test_RSS_elastic = sum((Y_test - LinRegElastic.predict(X_test))**2) / len(Y_test)
print("Elastic Net")
print("R^2: {}, train RSS: {}, test RSS: {}".format(Rsquared_elastic, train_RSS_elastic, test_RSS_elastic))

Elastic Net
R^2: 0.4849563535985497, train RSS: 68.63274565397879, test RSS: 71.4377037808179
