In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
#load data
housing_df = pd.read_csv('HousingData.csv')
housing_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [4]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      486 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    486 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [3]:
#drop null values
housing_df = housing_df.dropna()

In [10]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394 entries, 0 to 504
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     394 non-null    float64
 1   ZN       394 non-null    float64
 2   INDUS    394 non-null    float64
 3   CHAS     394 non-null    float64
 4   NOX      394 non-null    float64
 5   RM       394 non-null    float64
 6   AGE      394 non-null    float64
 7   DIS      394 non-null    float64
 8   RAD      394 non-null    int64  
 9   TAX      394 non-null    int64  
 10  PTRATIO  394 non-null    float64
 11  B        394 non-null    float64
 12  LSTAT    394 non-null    float64
 13  MEDV     394 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 46.2 KB


In [4]:
# declare X for the PREDICTOR columns and Y for the TARGET column
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [5]:
#Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
#Create the regressor: reg
reg = LinearRegression()

In [7]:
#Fit the regressor to the training data
reg.fit(X_train, y_train)

LinearRegression()

In [15]:
#Predict on the test data: y_pred
y_pred = reg.predict(X_test)

In [17]:
# Compute and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 4.363639996769479


In [8]:
def regression_model(model):
    # Create training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Create the regressor: reg_all
    reg_all = model
    # Fit the regressor to the training data
    reg_all.fit(X_train, y_train)
    # Predict on the test data: y_pred
    y_pred = reg_all.predict(X_test)
    # Compute and print RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Root Mean Squared Error: {}".format(rmse))  

In [9]:
regression_model(LinearRegression())

Root Mean Squared Error: 5.356978393107106


In [20]:
regression_model(LinearRegression())

Root Mean Squared Error: 4.207472266652231


In [21]:
regression_model(LinearRegression())

Root Mean Squared Error: 4.950813820189801


In [10]:
from sklearn.model_selection import cross_val_score

In [11]:
def regression_model_cv(model, k=5):
    scores = cross_val_score(model, X, y,
scoring='neg_mean_squared_error', cv=k)
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean())

In [12]:
regression_model_cv(LinearRegression())

Reg rmse: [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]
Reg mean: 5.337868962878355


In [27]:
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 3.72504914  6.01655701 23.20863933]
Reg mean: 10.983415161090733


In [28]:
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]
Reg mean: 5.086590810801078


In [29]:
regression_model_cv(LinearRegression(), k=20)

Reg rmse: [ 3.42700247  1.62018539  3.7804628   3.44292074  3.16072275  2.76230741
  5.96924055  4.46443859  5.6446658   4.2779664   4.83888657  4.21531387
  2.32804778  2.7529636  14.91149379  5.17911663  6.51747111  2.87261021
  2.22906632  3.44450476]
Reg mean: 4.391969376682711


In [16]:
from sklearn.linear_model import Ridge
regression_model_cv(Ridge())

Reg rmse: [3.17202127 4.54972372 5.36604368 8.03715216 5.03988501]
Reg mean: 5.232965166251767


In [17]:
from sklearn.linear_model import Lasso
regression_model_cv(Lasso())

Reg rmse: [3.52318747 5.70083491 7.82318757 6.9878025  3.97229348]
Reg mean: 5.60146118538429


In [13]:
from sklearn.neighbors import KNeighborsRegressor
regression_model_cv(KNeighborsRegressor())

Reg rmse: [ 8.24568226  8.81322798 10.58043836  8.85643441  5.98100069]
Reg mean: 8.495356738515685


In [14]:
regression_model_cv(KNeighborsRegressor(n_neighbors=4))

Reg rmse: [ 8.44659788  8.99814547 10.97170231  8.86647969  5.72114135]
Reg mean: 8.600813339223432


In [16]:
regression_model_cv(KNeighborsRegressor(n_neighbors=7))

Reg rmse: [ 7.99710601  8.68309183 10.66332898  8.90261573  5.51032355]
Reg mean: 8.351293217401393


In [15]:
regression_model_cv(KNeighborsRegressor(n_neighbors=10))

Reg rmse: [ 7.47549287  8.62914556 10.69543822  8.91330686  6.52982222]
Reg mean: 8.448641147609868


In [17]:
from sklearn.model_selection import GridSearchCV
neighbors = np.linspace(1,20,20)
k = neighbors.astype(int)
param_grid = {'n_neighbors': k}
knn = KNeighborsRegressor()
knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
knn_tuned.fit(X,y)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])},
             scoring='neg_mean_squared_error')

In [18]:
k = knn_tuned.best_params_
print("Best n_neighbors: {}".format(k))
score = knn_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'n_neighbors': 7}
Best score: 8.516767055977628


In [19]:
from sklearn import tree
regression_model_cv(tree.DecisionTreeRegressor(random_state=0))

Reg rmse: [3.7647936  7.26184759 7.78346186 6.48142428 4.79234165]
Reg mean: 6.016773796161434


In [20]:
from sklearn.ensemble import RandomForestRegressor
regression_model_cv(RandomForestRegressor(random_state=0))

Reg rmse: [3.21859405 3.76199072 4.96431026 6.55950671 3.7700697 ]
Reg mean: 4.454894289804201


In [21]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100, 
                                          random_state=0))

Reg rmse: [3.21859405 3.76199072 4.96431026 6.55950671 3.7700697 ]
Reg mean: 4.454894289804201


In [22]:
from sklearn.model_selection import RandomizedSearchCV
#Set up the hyperparameter grid
param_grid = {'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3],
             'max_features': ['auto', 'sqrt']}

In [23]:
reg = RandomForestRegressor(n_jobs=-1, random_state=0)
reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')
reg_tuned.fit(X,y)

RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(n_jobs=-1, random_state=0),
                   param_distributions={'max_depth': [None, 10, 30, 50, 70, 100,
                                                      200, 400],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 3, 4, 5]},
                   scoring='neg_mean_squared_error')

In [24]:
p = reg_tuned.best_params_
print("Best n_neighbors: {}".format(p))
score = reg_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 200}
Best score: 4.584175183678625


In [25]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500,
                                          random_state=0))

Reg rmse: [3.17084646 3.7593559  4.8534035  6.49732743 3.94043004]
Reg mean: 4.4442726650747915
