## Regularization (L1 & L2): Melbourne Homes

### Preface

**Summary**
- Model fit issues: underfitting and overfitting
- Underfitting: addressed by tuning model parameters
- Overfitting: addressed with L1 (Lasso) and L2 (Ridge) regularization

**Acknowledgements**
- Code Basics Machine Learning Course - Regularization Lesson


### Initialization

**Packages**

In [1]:
import pandas as pkg_pandas
import math as pkg_math
import warnings as pkg_warnings
import datetime as pkg_datetime
import matplotlib.pyplot as pkg_plot
import seaborn as pkg_seaborn
import sklearn.linear_model as pkg_linear_model
import sklearn.model_selection as pkg_model_selection
import sklearn.preprocessing as pkg_preprocessing
import sklearn.tree as pkg_tree
import sklearn.metrics as pkg_metrics
import sklearn.datasets as pkg_datasets
import sklearn.ensemble as pkg_ensemble
import sklearn.svm as pkg_svm
import sklearn.naive_bayes as pkg_naive_bayes

**Common**

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this suppresses every warning category for the rest of the
# notebook, including any pandas / scikit-learn warnings raised by later
# cells -- consider narrowing the filter to specific categories.
pkg_warnings.filterwarnings("ignore")

**Fill NaN Values**

In [3]:
def fillna_column_mean_int(df, column_name):
    """Fill the NaN entries of one column with the column mean truncated to int.

    The frame is modified in place: the filled column is assigned back to
    ``df[column_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is filled.
    column_name : str
        Name of the column to fill.

    Returns
    -------
    int
        The fill value that was used (``int(mean)``, i.e. truncated).

    Raises
    ------
    ValueError
        If the column mean is NaN (no non-missing values), because
        ``int(nan)`` cannot be computed.
    """
    value = int(df[column_name].mean())
    # Assign the result back instead of calling fillna(inplace=True) on the
    # df[column_name] intermediate: the chained in-place form is deprecated
    # and does not update the frame when pandas Copy-on-Write is enabled.
    df[column_name] = df[column_name].fillna(value)
    return value

In [4]:
def fillna_column_median_int(df, column_name):
    """Fill the NaN entries of one column with the column median truncated to int.

    The frame is modified in place: the filled column is assigned back to
    ``df[column_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is filled.
    column_name : str
        Name of the column to fill.

    Returns
    -------
    int
        The fill value that was used (``int(median)``, i.e. truncated).

    Raises
    ------
    ValueError
        If the column median is NaN (no non-missing values), because
        ``int(nan)`` cannot be computed.
    """
    value = int(df[column_name].median())
    # Assign the result back instead of calling fillna(inplace=True) on the
    # df[column_name] intermediate: the chained in-place form is deprecated
    # and does not update the frame when pandas Copy-on-Write is enabled.
    df[column_name] = df[column_name].fillna(value)
    return value

In [5]:
def fillna_columns(df, nan_column_names, filler):
    """Run ``filler`` over each named column and report the fill values used.

    Parameters
    ----------
    df : object
        Frame handed unchanged to ``filler`` as its first argument.
    nan_column_names : iterable of str
        Columns to process, in order.
    filler : callable
        Called as ``filler(df, column_name)``; its return value is recorded
        as the fill value for that column.

    Returns
    -------
    list of dict
        One ``{"column_name": ..., "fill_value": ...}`` entry per column, in
        the order the columns were given.
    """
    return [
        {"column_name": name, "fill_value": filler(df, name)}
        for name in nan_column_names
    ]

In [6]:
def fillna_df(df, filler):
    """Apply ``filler`` to every column of ``df`` that contains a NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan and fill.
    filler : callable
        Per-column fill function, forwarded to ``fillna_columns``.

    Returns
    -------
    list of dict
        The per-column fill report produced by ``fillna_columns``.
    """
    # Boolean mask over the columns: True where the column has any NaN.
    has_nan = df.isna().any()
    return fillna_columns(df, df.columns[has_nan], filler)

**Load Data**

In [7]:
# Read the raw listings CSV; the path is relative to the notebook's working directory.
data_df = pkg_pandas.read_csv("../data/cbex-melbourne-homes.csv")
# Display the frame's shape alongside the number of distinct values per column.
data_df.shape, data_df.nunique()

((34857, 21),
 suburb_name            351
 address              34009
 rooms                   12
 type                     3
 price                 2871
 method                   9
 seller_name            388
 date                    78
 distance               215
 postal_code            211
 bedroom2                15
 bathrooms               11
 car                     15
 land_size             1684
 building_area          740
 year_built             160
 council_area_name       33
 lattitude            13402
 longtitude           14524
 region_name              8
 property_count         342
 dtype: int64)

**Curate Data**

In [8]:
# Initialize
# baseline_df starts out as another name for data_df; no copy is made here.
baseline_df = data_df

In [9]:
# Columns kept for modelling; "price" is the column later used as the target.
useful_columns = ["suburb_name", "rooms", "type", "method", "seller_name", "distance", "bedroom2", "bathrooms", 
    "car", "land_size", "building_area", "year_built", "council_area_name", "region_name", "property_count", "price"]

# Take an explicit copy of the column subset: later cells assign to and
# mutate baseline_df, and an explicit copy makes it unambiguous that those
# writes target baseline_df and never the loaded data_df.
baseline_df = baseline_df[useful_columns].copy()
# Display the reduced shape and the number of missing values per column.
baseline_df.shape, baseline_df.isna().sum()

((34857, 16),
 suburb_name              0
 rooms                    0
 type                     0
 method                   0
 seller_name              0
 distance                 1
 bedroom2              8217
 bathrooms             8226
 car                   8728
 land_size            11810
 building_area        21115
 year_built           19306
 council_area_name        3
 region_name              3
 property_count           3
 price                 7610
 dtype: int64)

In [10]:
# Preview ten rows by position (rows 1005 through 1014).
baseline_df.iloc[1005:1015]

Unnamed: 0,suburb_name,rooms,type,method,seller_name,distance,bedroom2,bathrooms,car,land_size,building_area,year_built,council_area_name,region_name,property_count,price
1005,Balwyn,3,h,SP,Noel,9.7,3.0,2.0,2.0,404.0,190.0,2011.0,Boroondara City Council,Southern Metropolitan,5682.0,1500000.0
1006,Balwyn,3,h,S,Fletchers,9.7,3.0,1.0,1.0,607.0,,1950.0,Boroondara City Council,Southern Metropolitan,5682.0,1850000.0
1007,Balwyn,5,h,S,Marshall,9.7,6.0,3.0,4.0,876.0,323.0,1980.0,Boroondara City Council,Southern Metropolitan,5682.0,
1008,Balwyn,4,h,VB,Marshall,9.7,3.0,1.0,1.0,672.0,115.0,1935.0,Boroondara City Council,Southern Metropolitan,5682.0,1700000.0
1009,Balwyn,2,h,PI,Marshall,9.7,2.0,0.0,0.0,1611.0,,,Boroondara City Council,Southern Metropolitan,5682.0,1010000.0
1010,Balwyn North,3,t,S,hockingstuart,9.2,3.0,2.0,2.0,260.0,167.0,2000.0,Boroondara City Council,Southern Metropolitan,7809.0,1315000.0
1011,Balwyn North,3,h,PI,Jellis,9.2,3.0,2.0,1.0,778.0,154.0,1950.0,Boroondara City Council,Southern Metropolitan,7809.0,1500000.0
1012,Balwyn North,4,h,S,Jellis,9.2,4.0,4.0,3.0,1135.0,275.0,1960.0,Boroondara City Council,Southern Metropolitan,7809.0,3230000.0
1013,Balwyn North,2,h,S,Jellis,9.2,2.0,1.0,1.0,253.0,,,Boroondara City Council,Southern Metropolitan,7809.0,1080000.0
1014,Balwyn North,4,u,S,hockingstuart,9.2,,,,,,,Boroondara City Council,Southern Metropolitan,7809.0,1450000.0


In [11]:
# Replace missing values in these columns with 0.
# NOTE(review): 0 is a questionable stand-in for a missing distance, bedroom
# or bathroom count (a later preview shows a 4-room house with 0 bedrooms and
# 0 bathrooms after this step) -- confirm this is intended rather than, e.g.,
# median imputation.
zero_value_columns = ["car", "property_count", "distance", "bedroom2", "bathrooms"]
baseline_df[zero_value_columns] = baseline_df[zero_value_columns].fillna(0)
# Display the remaining missing-value counts per column.
baseline_df.isna().sum()

suburb_name              0
rooms                    0
type                     0
method                   0
seller_name              0
distance                 0
bedroom2                 0
bathrooms                0
car                      0
land_size            11810
building_area        21115
year_built           19306
council_area_name        3
region_name              3
property_count           0
price                 7610
dtype: int64

In [12]:
# Fill the remaining numeric gaps with each column's (truncated) mean.
# NOTE(review): the means are computed over all rows, before the train/test
# split, so the imputed values also reflect rows that end up in the test set.
fills = fillna_columns(baseline_df, ["land_size", "building_area", "year_built"], fillna_column_mean_int)

# Derive the building age against a fixed reference year instead of today's
# date, so the feature (and every recorded output) does not drift with the
# day the notebook is run. 2022 reproduces the recorded values
# (mean year_built 1965 -> age 57).
reference_year = 2022
baseline_df["age"] = reference_year - baseline_df["year_built"]
# year_built is now redundant with age; rebind rather than drop in place.
baseline_df = baseline_df.drop(columns=["year_built"])
# Display the fill report and the remaining missing-value counts.
fills, baseline_df.isna().sum()

([{'column_name': 'land_size', 'fill_value': 593},
  {'column_name': 'building_area', 'fill_value': 160},
  {'column_name': 'year_built', 'fill_value': 1965}],
 suburb_name             0
 rooms                   0
 type                    0
 method                  0
 seller_name             0
 distance                0
 bedroom2                0
 bathrooms               0
 car                     0
 land_size               0
 building_area           0
 council_area_name       3
 region_name             3
 property_count          0
 price                7610
 age                     0
 dtype: int64)

In [13]:
# Discard every row that still has a missing value in any column, then
# display the per-column missing counts.
baseline_df = baseline_df.dropna()
baseline_df.isna().sum()

suburb_name          0
rooms                0
type                 0
method               0
seller_name          0
distance             0
bedroom2             0
bathrooms            0
car                  0
land_size            0
building_area        0
council_area_name    0
region_name          0
property_count       0
price                0
age                  0
dtype: int64

In [14]:
# Preview ten rows by position (rows 1015 through 1024).
baseline_df.iloc[1015:1025]

Unnamed: 0,suburb_name,rooms,type,method,seller_name,distance,bedroom2,bathrooms,car,land_size,building_area,council_area_name,region_name,property_count,price,age
1311,Bentleigh,3,h,S,Buxton,13.0,3.0,1.0,2.0,0.0,160.0,Glen Eira City Council,Southern Metropolitan,6795.0,895000.0,57.0
1313,Bentleigh,4,h,S,Woodards,13.0,4.0,2.0,1.0,292.0,160.0,Glen Eira City Council,Southern Metropolitan,6795.0,1041000.0,57.0
1314,Bentleigh,4,h,S,hockingstuart,13.0,4.0,2.0,2.0,611.0,168.0,Glen Eira City Council,Southern Metropolitan,6795.0,1430000.0,52.0
1315,Bentleigh,2,h,S,Buxton,13.0,2.0,1.0,2.0,274.0,96.0,Glen Eira City Council,Southern Metropolitan,6795.0,910000.0,52.0
1316,Bentleigh,4,h,PI,Buxton,13.0,4.0,4.0,2.0,579.0,361.0,Glen Eira City Council,Southern Metropolitan,6795.0,2520000.0,10.0
1317,Bentleigh,4,h,S,Buxton,13.0,0.0,0.0,0.0,593.0,160.0,Glen Eira City Council,Southern Metropolitan,6795.0,1860000.0,57.0
1318,Bentleigh,3,h,S,C21,13.0,3.0,1.0,1.0,617.0,160.0,Glen Eira City Council,Southern Metropolitan,6795.0,1350000.0,57.0
1319,Bentleigh,2,u,S,hockingstuart,13.0,2.0,1.0,2.0,118.0,160.0,Glen Eira City Council,Southern Metropolitan,6795.0,635000.0,57.0
1320,Bentleigh,3,h,S,Buxton,13.0,3.0,2.0,4.0,587.0,160.0,Glen Eira City Council,Southern Metropolitan,6795.0,1280000.0,57.0
1321,Bentleigh,5,h,S,Woodards,13.0,5.0,2.0,2.0,567.0,250.0,Glen Eira City Council,Southern Metropolitan,6795.0,1900000.0,96.0


In [15]:
shape_before = baseline_df.shape
non_numerical_columns=["suburb_name", "type", "method", "seller_name", "council_area_name", "region_name"]
# One-hot encode the categorical columns; drop_first omits the first level of
# each column. The indicator dtype is pinned to uint8 so the encoded columns
# are 0/1 integers (as in the recorded preview) regardless of pandas version;
# newer pandas releases default to bool indicators.
baseline_df = pkg_pandas.get_dummies(
    baseline_df, columns=non_numerical_columns, drop_first=True, dtype="uint8"
)
shape_after = baseline_df.shape
# Display the shapes before and after encoding.
shape_before, shape_after

((27244, 16), (27244, 746))

In [16]:
# Preview ten rows of the encoded frame by position (rows 1005 through 1014).
baseline_df.iloc[1005:1015]

Unnamed: 0,rooms,distance,bedroom2,bathrooms,car,land_size,building_area,property_count,price,age,...,council_area_name_Wyndham City Council,council_area_name_Yarra City Council,council_area_name_Yarra Ranges Shire Council,region_name_Eastern Victoria,region_name_Northern Metropolitan,region_name_Northern Victoria,region_name_South-Eastern Metropolitan,region_name_Southern Metropolitan,region_name_Western Metropolitan,region_name_Western Victoria
1300,5,13.0,5.0,3.0,2.0,772.0,160.0,6795.0,1600000.0,36.0,...,0,0,0,0,0,0,0,1,0,0
1301,3,13.0,3.0,1.0,2.0,694.0,125.0,6795.0,1165000.0,72.0,...,0,0,0,0,0,0,0,1,0,0
1303,4,13.0,4.0,2.0,3.0,591.0,160.0,6795.0,1355000.0,57.0,...,0,0,0,0,0,0,0,1,0,0
1304,4,13.0,4.0,2.0,4.0,736.0,284.0,6795.0,1910000.0,72.0,...,0,0,0,0,0,0,0,1,0,0
1305,4,13.0,0.0,0.0,0.0,593.0,160.0,6795.0,1250000.0,57.0,...,0,0,0,0,0,0,0,1,0,0
1306,3,13.0,3.0,2.0,3.0,568.0,160.0,6795.0,1448000.0,57.0,...,0,0,0,0,0,0,0,1,0,0
1307,4,13.0,0.0,0.0,0.0,593.0,160.0,6795.0,1400000.0,57.0,...,0,0,0,0,0,0,0,1,0,0
1308,2,13.0,0.0,0.0,0.0,593.0,160.0,6795.0,520000.0,57.0,...,0,0,0,0,0,0,0,1,0,0
1309,2,13.0,2.0,1.0,3.0,336.0,109.0,6795.0,820000.0,62.0,...,0,0,0,0,0,0,0,1,0,0
1310,3,13.0,0.0,0.0,0.0,593.0,160.0,6795.0,740000.0,57.0,...,0,0,0,0,0,0,0,1,0,0


**Split Data**

In [17]:
# Initialize
output_column_name = "price"
# Target: the price column, kept as a pandas Series.
baseline_outputs = baseline_df[output_column_name]
# Features: every other column as a single float64 matrix. The dtype is
# requested explicitly so that a mix of numeric and bool indicator columns
# cannot produce an object-dtype array.
baseline_inputs = baseline_df.drop(columns=[output_column_name]).to_numpy(dtype="float64")
# Display the frame, feature-matrix and target shapes.
baseline_df.shape, baseline_inputs.shape, baseline_outputs.shape

((27244, 746), (27244, 745), (27244,))

In [18]:
# Hold out 30% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
train_inputs, test_inputs, train_outputs, test_outputs = pkg_model_selection.train_test_split(
    baseline_inputs, baseline_outputs, test_size=0.3, random_state=2
)
# Display the shapes of the four resulting pieces.
train_inputs.shape, test_inputs.shape, train_outputs.shape, test_outputs.shape

((19070, 745), (8174, 745), (19070,), (8174,))

### Process

**Common**

In [19]:
def perform_grid_search(model, model_params, fold_count, X_baseline, y_baseline):
    """Fit an exhaustive grid search for one estimator.

    Parameters
    ----------
    model : estimator
        Estimator instance to tune.
    model_params : dict
        Parameter grid passed as ``param_grid``.
    fold_count : int
        Passed as ``cv``.
    X_baseline, y_baseline : array-like
        Data the search is fitted on.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on it.
    """
    search = pkg_model_selection.GridSearchCV(
        estimator=model,
        param_grid=model_params,
        cv=fold_count,
        return_train_score=False,
    )
    search.fit(X=X_baseline, y=y_baseline)
    return search

In [20]:
def perform_random_search(model, model_params, fold_count, X_baseline, y_baseline, num_iterations,
        random_state=None):
    """Fit a randomized parameter search for one estimator.

    Parameters
    ----------
    model : estimator
        Estimator instance to tune.
    model_params : dict
        Parameter distributions passed as ``param_distributions``.
    fold_count : int
        Passed as ``cv``.
    X_baseline, y_baseline : array-like
        Data the search is fitted on.
    num_iterations : int
        Number of parameter settings sampled (``n_iter``).
    random_state : int, RandomState instance or None, default None
        Seed for the parameter sampling. Pass an int to make the search
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    RandomizedSearchCV
        The search object after ``fit`` has been called on it.
    """
    classifier = pkg_model_selection.RandomizedSearchCV(
        estimator=model,
        param_distributions=model_params,
        cv=fold_count,
        n_iter=num_iterations,
        return_train_score=False,
        random_state=random_state,
    )
    classifier.fit(X=X_baseline, y=y_baseline)
    return classifier
    

In [21]:
def perform_grid_searches(model_config, fold_count, X_baseline, y_baseline):
    """Run a grid search for every configured model and tabulate the best results.

    Parameters
    ----------
    model_config : iterable of dict
        Each entry provides ``"name"``, ``"instance"`` (the estimator) and
        ``"params"`` (its parameter grid).
    fold_count : int
        Number of cross-validation folds, forwarded to ``perform_grid_search``.
    X_baseline, y_baseline : array-like
        Data each search is fitted on.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``name``, ``score`` (``best_score_``)
        and ``params`` (``best_params_``). Empty, with the same columns, when
        ``model_config`` is empty.
    """
    # Collect plain records and build the frame once, rather than growing a
    # DataFrame one row at a time.
    records = []
    for mc in model_config:
        classifier = perform_grid_search(
            model=mc["instance"], model_params=mc["params"],
            fold_count=fold_count, X_baseline=X_baseline, y_baseline=y_baseline)
        records.append({
            "name": mc["name"],
            "score": classifier.best_score_,
            "params": classifier.best_params_,
        })

    return pkg_pandas.DataFrame(records, columns=["name", "score", "params"])

In [22]:
def perform_random_searches(model_config, fold_count, X_baseline, y_baseline, num_iterations):
    """Run a randomized search for every configured model and tabulate the best results.

    Parameters
    ----------
    model_config : iterable of dict
        Each entry provides ``"name"``, ``"instance"`` (the estimator) and
        ``"params"`` (its parameter distributions).
    fold_count : int
        Number of cross-validation folds, forwarded to ``perform_random_search``.
    X_baseline, y_baseline : array-like
        Data each search is fitted on.
    num_iterations : int
        Number of parameter settings sampled per model.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``name``, ``score`` (``best_score_``)
        and ``params`` (``best_params_``). Empty, with the same columns, when
        ``model_config`` is empty.
    """
    # Collect plain records and build the frame once, rather than growing a
    # DataFrame one row at a time.
    records = []
    for mc in model_config:
        classifier = perform_random_search(
            model=mc["instance"], model_params=mc["params"],
            fold_count=fold_count, X_baseline=X_baseline, y_baseline=y_baseline,
            num_iterations=num_iterations)
        records.append({
            "name": mc["name"],
            "score": classifier.best_score_,
            "params": classifier.best_params_,
        })

    return pkg_pandas.DataFrame(records, columns=["name", "score", "params"])

**Models: Regularization**

In [23]:
# Baseline: plain linear regression fitted on the training split.
# Note that `model` is rebound to a different estimator by later cells.
model = pkg_linear_model.LinearRegression()
model.fit(train_inputs, train_outputs)
model 

In [24]:
# Report the score of the current `model` on the training and test splits.
print("Scores (LinearRegression): Train = {}, Test = {}".format(
    model.score(train_inputs, train_outputs),
    model.score(test_inputs, test_outputs),
))


Scores (LinearRegression): Train = 0.685938386316607, Test = 0.10123646352254734


In [25]:
# L1-regularized linear model (Lasso) fitted on the training split; this
# rebinds `model`, replacing the estimator from the previous cells.
# NOTE(review): the features are not scaled before fitting, so the penalty
# presumably weighs columns unevenly depending on their units -- confirm
# whether scaling was intentionally skipped.
model = pkg_linear_model.Lasso(alpha=50, max_iter=100, tol=0.1)
model.fit(train_inputs, train_outputs)
model

In [26]:
# Report the score of the current `model` on the training and test splits.
print("Scores (L1 Regularization=Lasso): Train = {}, Test = {}".format(
    model.score(train_inputs, train_outputs),
    model.score(test_inputs, test_outputs),
))


Scores (L1 Regularization=Lasso): Train = 0.6799303831056884, Test = 0.6606012236909675


In [27]:
# L2-regularized linear model (Ridge) fitted on the training split; this
# rebinds `model`, replacing the estimator from the previous cells.
# NOTE(review): the features are not scaled before fitting, so the penalty
# presumably weighs columns unevenly depending on their units -- confirm
# whether scaling was intentionally skipped.
model = pkg_linear_model.Ridge(alpha=50, max_iter=100, tol=0.1)
model.fit(train_inputs, train_outputs)
model

In [28]:
# Report the score of the current `model` on the training and test splits.
print("Scores (L2 Regularization=Ridge): Train = {}, Test = {}".format(
    model.score(train_inputs, train_outputs),
    model.score(test_inputs, test_outputs),
))


Scores (L2 Regularization=Ridge): Train = 0.6658530758494113, Test = 0.6706288967076055


**Models: Cross Validation**

In [29]:
# Cross-validate a fresh LinearRegression on the full baseline data using the
# library's default fold setting, then display the per-fold scores.
scores = pkg_model_selection.cross_val_score(
    estimator=pkg_linear_model.LinearRegression(),
    X=baseline_inputs,
    y=baseline_outputs,
)
scores

array([0.67400704, 0.62875492, 0.64045914, 0.53545618, 0.67212295])

**Models: Grid Search**

In [30]:
# Search configuration consumed by the grid/random search helpers: one entry
# per estimator, giving a display name ("name"), the estimator object to tune
# ("instance") and the parameter values to try ("params").
model_config = [
    {
        "name" : "LinearRegression",
        "instance" : pkg_linear_model.LinearRegression(),
        "params": { 
            "fit_intercept" : [False, True],
            "positive" : [False, True]
        }
    },
    {
        # Same max_iter / tol as the standalone Lasso fit above; only alpha varies.
        "name" : "Lasso",
        "instance" : pkg_linear_model.Lasso(),
        "params": { 
            "alpha" : [50, 100], 
            "max_iter": [100],
            "tol" : [0.1]
        }
    },
    {
        # Same max_iter / tol as the standalone Ridge fit above; only alpha varies.
        "name" : "Ridge",
        "instance" : pkg_linear_model.Ridge(),
        "params": { 
            "alpha" : [50, 100], 
            "max_iter": [100],
            "tol" : [0.1]
        }
    }
]

In [31]:
print("Grid Search CV:: Best Results")
# Run a 5-fold grid search for every configured model. The search is fitted
# on the full baseline data (not only the training split), and the result
# holds each model's best cross-validation score and parameters.
results_df = perform_grid_searches(model_config, fold_count=5,
    X_baseline=baseline_inputs, y_baseline=baseline_outputs)
results_df

Grid Search CV:: Best Results


Unnamed: 0,name,score,params
0,LinearRegression,0.63016,"{'fit_intercept': True, 'positive': False}"
1,Lasso,0.631263,"{'alpha': 50, 'max_iter': 100, 'tol': 0.1}"
2,Ridge,0.623535,"{'alpha': 50, 'max_iter': 100, 'tol': 0.1}"
