# Cleaning data
Main takeaways:
 - In this case Z-Score provided a real advantage, resulting in the best model so far
 - Higher learning rate is preferred with lower feature values
 - MinMax didn't provide an edge over the basic dataset
 
 Known limitations of this notebook:
 - No validation against a held-out test set
 - No feature normalization

In [1]:
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from helper import charts, lib

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
# Name of the label column: the median value of a house on a specific city block.
# Defined first so the rest of the cell can refer to it.
my_label = "median_house_value"

# Load the California housing training set.
training_df = pd.read_csv(
    filepath_or_buffer="https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv"
)

# Scale the label by dividing it by 1000.
training_df[my_label] = training_df[my_label] / 1000.0

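### Removing outlying values, creating 4 versions:
Rows whose value equals the column maximum are treated as outliers.
 - `both`: outliers removed from both housing_median_age and median_house_value
 - `hma` / `mhv`: outliers removed from only one of them (housing_median_age or median_house_value, respectively)
 - `none`: no outliers removed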

In [3]:
# The dataset variants built from these results are named hma, mhv, both and none.

# Short aliases for the two column names, because they occur a lot.
hma = "housing_median_age"
mhv = "median_house_value"


def _rows_at_column_max(df, column):
  """Return the maximum of `column` and the index labels of the rows holding it."""
  peak = df[column].max()
  return peak, df[df[column] == peak].index


hma_max, _hma_drop = _rows_at_column_max(training_df, hma)
print("Dataset contains", len(_hma_drop), "elements in the '",hma,"' column with the value", hma_max, "(max value)")

mhv_max, _mhv_drop = _rows_at_column_max(training_df, mhv)
print("Dataset contains", len(_mhv_drop), "elements in the '",mhv,"' column with the value", mhv_max, "(max value)")



Dataset contains 1052 elements in the ' housing_median_age ' column with the value 52.0 (max value)
Dataset contains 814 elements in the ' median_house_value ' column with the value 500.001 (max value)


In [4]:
# Variant without the rows at the housing_median_age maximum.
hma_df = training_df.drop(index=_hma_drop)

# Variant without the rows at the median_house_value maximum.
mhv_df = training_df.drop(index=_mhv_drop)

# Variant without either group of rows; the union removes every label that
# appears in at least one of the two index sets.
both_df = training_df.drop(index=_hma_drop.union(_mhv_drop))

# Independent copy of the full dataset (copy() is deep by default).
none_df = training_df.copy()

In [5]:
def build_model(my_learning_rate, num_features=6):
  """Create and compile a simple linear regression model.

  Args:
    my_learning_rate: Learning rate handed to the Adam optimizer.
    num_features: Number of input features, i.e. the width of the input
      vector. Defaults to 6 so existing single-argument calls keep their
      behavior; pass the length of the feature list when it changes.

  Returns:
    A compiled tf.keras Sequential model consisting of one Dense layer with a
    single unit, trained on mean squared error and reporting RMSE as metric.
  """
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()
  # A single unit over all input features.
  model.add(tf.keras.layers.Dense(units=1, input_shape=(num_features,)))

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model


def train_model(model, training_df, feature, label, epochs, batch_size):
  """Fit `model` on the selected columns and return its training history.

  Args:
    model: Object exposing a Keras-style `fit` method.
    training_df: DataFrame holding both the feature and label columns.
    feature: Column name(s) used as model input.
    label: Column name used as the regression target.
    epochs: Number of passes over the data.
    batch_size: Number of rows per gradient update.

  Returns:
    A DataFrame built from the `history` dict of the object returned by
    `fit`.
  """
  inputs = training_df[feature]
  targets = training_df[label]

  # No validation_data is passed to fit. The author's note: the weights
  # shouldn't be influenced by it, but the overall accuracy should be measured
  # against all datapoints.
  fit_result = model.fit(
    x=inputs,
    y=targets,
    batch_size=batch_size,
    epochs=epochs)

  return pd.DataFrame(fit_result.history)

In [6]:
# Columns fed to the model as input features.
my_feature = [
    "longitude",
    "latitude",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
]

In [7]:
# Training configuration for the run on the full dataset (none_df).
learning_rate, epoch_count, batch_size = 0.01, 20, 64

# Build a fresh model, train it, and keep the history frame returned by train_model.
none_model = build_model(my_learning_rate=learning_rate)
none_history = train_model(
    model=none_model,
    training_df=none_df,
    feature=my_feature,
    label=my_label,
    epochs=epoch_count,
    batch_size=batch_size,
)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
# Training configuration for the run on mhv_df
# (rows at the median_house_value maximum removed).
learning_rate, epoch_count, batch_size = 0.01, 20, 64

# Build a fresh model, train it, and keep the history frame returned by train_model.
mhv_model = build_model(my_learning_rate=learning_rate)
mhv_history = train_model(
    model=mhv_model,
    training_df=mhv_df,
    feature=my_feature,
    label=my_label,
    epochs=epoch_count,
    batch_size=batch_size,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [9]:
# Training configuration for the run on hma_df
# (rows at the housing_median_age maximum removed).
learning_rate, epoch_count, batch_size = 0.01, 20, 64

# Build a fresh model, train it, and keep the history frame returned by train_model.
hma_model = build_model(my_learning_rate=learning_rate)
hma_history = train_model(
    model=hma_model,
    training_df=hma_df,
    feature=my_feature,
    label=my_label,
    epochs=epoch_count,
    batch_size=batch_size,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20

KeyboardInterrupt: 

In [None]:
# Training configuration for the run on both_df
# (rows at either column maximum removed).
learning_rate, epoch_count, batch_size = 0.01, 20, 64

# Build a fresh model, train it, and keep the history frame returned by train_model.
both_model = build_model(my_learning_rate=learning_rate)
both_history = train_model(
    model=both_model,
    training_df=both_df,
    feature=my_feature,
    label=my_label,
    epochs=epoch_count,
    batch_size=batch_size,
)

: 

In [None]:
# Examining history: gather the RMSE column of each run's history into one
# frame so the four dataset variants can be compared side by side.
rmse = "root_mean_squared_error"

# Display labels for the four variants. The mhv column is median_house_value,
# so its label says "Median" (it previously read "Mean").
none_key = "Original"
mhv_key = "Median House Value cleaned up"
hma_key = "Housing Median Age cleaned up"
both_key = "Both cleaned up"

# The order here is the column order of together_df.
keys = [none_key, mhv_key, hma_key, both_key]

together_df = pd.DataFrame({
    none_key : none_history[rmse],
    mhv_key : mhv_history[rmse],
    hma_key : hma_history[rmse],
    both_key : both_history[rmse]
})

: 

In [None]:
# Plot the RMSE training history of the four dataset variants.
charts.plot_training_losses(df=together_df, losses=keys)

: 

In [None]:
# Evaluate every model on the full dataset (training_df), regardless of which
# variant it was trained on, so the reported metrics are comparable.
for _heading, _model in (
    ("No cleaning metrics:", none_model),
    ("MHV cleaned metrics:", mhv_model),
    ("HMA cleaned metrics:", hma_model),
    ("Both cleaned metrics:", both_model),
):
  print(_heading)
  # The return value is not needed; evaluate() reports the metrics itself.
  _ = _model.evaluate(training_df[my_feature], training_df[my_label])

: 

In [None]:
#sampling data
sample_normal = lib.create_inference_sample(training_df, my_feature, basic_model)
sample_z = lib.create_inference_sample(z_df, my_feature, z_model)
sample_minmax = lib.create_inference_sample(minmax_df, my_feature, minmax_model)

: 

In [None]:
charts.plot_training_test_validat_accuracy(
    dfs=[sample_normal, sample_z, sample_minmax],
    label_key=my_label,
    titles=keys
)

: 

In [None]:
#weights per features
# One column per model trained in this notebook. The previous version referenced
# basic/z/minmax keys and models, which are not defined here.
# get_weights()[0] is taken as the first weight array of the single Dense
# layer; ravel() flattens it so it lines up with the feature list.
weights = pd.DataFrame.from_dict({
    "feature" : my_feature,
    none_key : none_model.get_weights()[0].ravel(),
    mhv_key : mhv_model.get_weights()[0].ravel(),
    hma_key : hma_model.get_weights()[0].ravel(),
    both_key : both_model.get_weights()[0].ravel(),
})

weights

: 