In [8]:
# Load the raw housing table, then work on an independent copy so the
# original frame (`housing`) stays untouched by later steps.
import pandas as pd

housing = pd.read_csv('housing.csv')
df = housing.copy(deep=True)

# Column names, non-null counts and dtypes at a glance
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [9]:
# Preview the first rows of the working copy to sanity-check the load
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
# Prepare the data: `y` is the target column, `X` is every other column.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# Confirm the target is no longer among the predictors
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the training and test feature frames, one per line
print(X_train.shape, X_test.shape, sep='\n')

(16512, 9)
(4128, 9)


In [12]:
# Summary statistics of the numeric training columns (spread, extremes)
X_train.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.58229,35.643149,28.608285,2642.004784,538.496851,1426.453004,499.986919,3.880754
std,2.005654,2.136665,12.602499,2174.646744,419.007096,1137.05638,380.967964,1.904294
min,-124.35,32.55,1.0,2.0,1.0,3.0,1.0,0.4999
25%,-121.81,33.93,18.0,1454.0,296.75,789.0,280.0,2.5667
50%,-118.51,34.26,29.0,2129.0,437.0,1167.0,410.0,3.5458
75%,-118.01,37.72,37.0,3160.0,647.0,1726.0,606.0,4.773175
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001


In [13]:
# Choose the model's input columns, grouped by the preprocessing they need.
# Numeric columns:
num_features = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
# Categorical columns:
cat_features = ['ocean_proximity']

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch
# step 1: fill missing values with the column median (median is used here
#         since there are outliers in some columns)
# step 2: standardize each feature (zero mean, unit variance)
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch
# step 1: fill missing values with the most frequent category
# step 2: one-hot encode. handle_unknown='ignore' makes a category that was
#         never seen during fit encode as all zeros instead of raising at
#         predict time; a rare category can end up only in the test split
#         under a different random seed.
cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Assign the feature lists to their pipelines and combine both branches into
# a single preprocessor. Columns of X that appear in neither list are not
# passed on to the model (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)



In [15]:
from sklearn.linear_model import LinearRegression

# End-to-end model: preprocessing followed by an ordinary least-squares
# regressor, so raw feature frames can be passed straight to fit/predict.
lin_reg_full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lin_reg', LinearRegression()),
])

In [16]:
# Train the whole pipeline (preprocessing + regression) on all training data.
lin_reg_full_pipeline.fit(X_train, y_train)

# named_steps[...] gives access to any step of the pipeline by its name;
# here it is used to show the fitted intercept and coefficients.
print(
    lin_reg_full_pipeline.named_steps['lin_reg'].intercept_,
    lin_reg_full_pipeline.named_steps['lin_reg'].coef_,
)

247932.31716241175 [ 15267.88893679 -15996.39181019  34533.9746986  -42361.34918236
  29284.69540693  77424.96092511 -21000.17790807 -89604.75617669
 136877.16226383 -17771.93062699  -8500.29755208]


In [17]:
# Report how many rows the training set contains
print(f'the total number of rows in the traing set is {X_train.shape[0]}')

the total number of rows in the traing set is 16512


In [18]:
# Predict house values for the held-out test features with the fitted pipeline
housing_y_pred = lin_reg_full_pipeline.predict(X_test)

In [19]:
# First three predictions, to compare by eye with the true values
housing_y_pred[:3]

array([ 54967.7468184 ,  95862.59498781, 265036.56872988])

In [20]:
# First three true target values of the test set
y_test[:3]

20046     47700.0
3024      45800.0
15663    500001.0
Name: median_house_value, dtype: float64

In [21]:
# Evaluate on the test set: mean squared error first, then its square root.
# NOTE: the RMSE is measured on the same scale with the same units as y.
import numpy as np
from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(y_true=y_test, y_pred=housing_y_pred)
lin_rmse = np.sqrt(lin_mse)

# Display the RMSE as the cell's output
lin_rmse

70925.36950193123

* RMSE (Root Mean Square Error) is a metric for measuring the performance of a regression model.  
It is measured on the same scale and in the same units as y, because it summarizes  
the difference between the values we predict and the actual values. Therefore,  
a better linear regression model has a smaller RMSE. 
* The formula to define the RMSE is: 
$\large RMSE(\vec{X}, h_{\vec{\theta}}) = \sqrt{\frac{1}{m} \sum_{i=1}^m \left(\vec{\theta}^\top\vec{x}^{(i)} - y^{(i)}\right)^2}$
* In the model above, the RMSE is about 70,925, which is not good relative to the house values being predicted (see `y.describe()` below). I should do more data preprocessing in the future.  
For example, remove the outliers.

In [22]:
# Summary statistics of the target; useful context for judging the RMSE above
y.describe()

count     20640.000000
mean     206855.816909
std      115395.615874
min       14999.000000
25%      119600.000000
50%      179700.000000
75%      264725.000000
max      500001.000000
Name: median_house_value, dtype: float64