# Basics

### Task 1 : create a Linear Dataset

In [None]:
import numpy as np

In [None]:
# Consecutive integer feature values, starting at 6 and stopping before 21.
feature = np.arange(6, 21, 1)
print(feature)

In [None]:
# Noise-free labels lying exactly on the line y = 3x + 4.
label = 3 * feature + 4
print(label)

### Task 2 : Add some noise to the Dataset

In [None]:
# One random draw per label: np.random.random yields values in [0, 1), which
# the arithmetic below stretches and shifts onto [-2, 2).
noise = (np.random.random(label.size) * 2 - 1) * 2
print(noise)

In [None]:
# Rebuild the labels from the same line y = 3x + 4, now perturbed by the noise.
label = 3 * feature + 4 + noise

In [None]:
# Show the labels after the noise has been added.
print(label)

# Linear Regression with Synthetic Data

In [None]:
import tensorflow as tf

In [None]:
import pandas as pd

In [None]:
from matplotlib import pyplot as plt

### Functions definition :
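`build_model(the_learning_rate)` builds and compiles an untrained model.

`train_model(model, feature, label, epochs, batch_size)` trains the model on examples (features and labels) and returns the trained weight, the trained bias, the list of epochs and the RMSE at each epoch.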

In [None]:
#@title Define the functions that build and train a model
def build_model(the_learning_rate):
  """Create and compile a simple linear regression model.

  Args:
    the_learning_rate: Learning rate handed to the RMSprop optimizer.

  Returns:
    A compiled tf.keras Sequential model made of one Dense layer with a
    single unit and a single input value.
  """
  # Most simple tf.keras models are sequential.
  # A sequential model contains one or more layers.
  model = tf.keras.models.Sequential()

  # Describe the topology of the model: a simple linear regression model
  # is a single node in a single layer.
  model.add(tf.keras.layers.Dense(units=1,
                                  input_shape=(1,)))

  # Compile the model so TensorFlow can execute it efficiently, and configure
  # training to minimize the mean squared error.
  # `learning_rate` is the supported keyword; the older `lr` alias is
  # deprecated and is not accepted by recent Keras optimizers.
  model.compile(
      optimizer=tf.keras.optimizers.RMSprop(learning_rate=the_learning_rate),
      loss="mean_squared_error",
      metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model


def train_model(model, feature, label, epochs, batch_size):
  """Fit the model on the examples and report what was learned.

  The feature and label values are passed to `model.fit` together with the
  requested batch size and number of epochs.

  Returns:
    A 4-tuple: the first entry of `model.get_weights()` (the weight), the
    second entry (the bias), the list of epochs recorded by the fit history,
    and the per-epoch "root_mean_squared_error" column of that history.
  """
  fit_history = model.fit(x=feature,
                          y=label,
                          batch_size=batch_size,
                          epochs=epochs)

  # Weight first, bias second.
  learned = model.get_weights()

  # The epoch list lives on the history object itself, apart from the
  # per-epoch metric values, which are tabulated here.
  metrics_by_epoch = pd.DataFrame(fit_history.history)

  return (learned[0],
          learned[1],
          fit_history.epoch,
          metrics_by_epoch["root_mean_squared_error"])

# Report that the function definitions in this cell have been executed.
print("Defined build_model and train_model")

### Plotting functions definitions

In [None]:
#@title Define the plotting functions
def plot_the_model(trained_weight, trained_bias, feature, label):
  """Plot the trained model against the training feature and label.

  Args:
    trained_weight: Learned weight, as a scalar or a single-element array.
    trained_bias: Learned bias, as a scalar or a single-element array.
    feature: Sequence of feature values; the model line ends at its last one.
    label: Sequence of label values, one per feature value.

  Raises:
    ValueError: If the weight or bias holds more than one element.
  """
  # Label the axes.
  plt.xlabel("feature")
  plt.ylabel("label")

  # Plot the feature values vs. label values.
  plt.scatter(feature, label)

  # model.get_weights() hands back arrays rather than plain numbers. Reduce
  # them to floats so both endpoints of the line are ordinary scalars.
  weight = np.asarray(trained_weight).item()
  bias = np.asarray(trained_bias).item()

  # Create a red line representing the model. The red line starts
  # at coordinates (x0, y0) and ends at coordinates (x1, y1).
  # x1 is taken from the `feature` argument, not from a notebook global, so
  # the line always matches the data that was passed in.
  x0 = 0
  y0 = bias
  x1 = feature[-1]
  y1 = bias + (weight * x1)
  plt.plot([x0, x1], [y0, y1], c='r')

  # Render the scatter plot and the red line.
  plt.show()

def plot_the_loss_curve(epochs, rmse):
  """Draw root mean squared error against epoch on a new figure."""
  # Vertical range: slightly below the smallest error up to the largest one.
  lowest = rmse.min() * 0.97
  highest = rmse.max()

  plt.figure()
  plt.plot(epochs, rmse, label="Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")
  plt.legend()
  plt.ylim([lowest, highest])
  plt.show()

# Report that the plotting function definitions have been executed.
print("Defined the plot_the_model and plot_the_loss_curve functions.")

### Defining dataset

In [None]:
# Twelve hand-written examples: features 1.0 through 12.0 and their labels.
my_feature = [float(value) for value in range(1, 13)]
my_label = [
    5.0, 8.8, 9.6, 14.2, 18.8, 19.5,
    21.4, 26.8, 28.9, 32.0, 33.8, 38.2,
]

### Specify hyperparameters, Build, Train and Plot

In [None]:
# Hyperparameters for this run.
learning_rate = 1.0
epochs = 6
my_batch_size = 8

# Build a fresh model, fit it on the small dataset, then draw both plots.
# Note: `epochs` is rebound below to the epoch list returned by train_model.
my_model = build_model(learning_rate)
trained_weight, trained_bias, epochs, rmse = train_model(
    model=my_model,
    feature=my_feature,
    label=my_label,
    epochs=epochs,
    batch_size=my_batch_size)
plot_the_model(trained_weight, trained_bias, my_feature, my_label)
plot_the_loss_curve(epochs, rmse)

# Linear Regression with a Real Dataset

In [None]:
# Import relevant modules
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

# Keep printed DataFrames compact: at most 10 rows are shown and floats are
# formatted with one decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [None]:
# Load the training data from the hosted CSV file into a DataFrame.
training_dataset = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv")

In [None]:
# Show the first rows of the dataset as a quick sanity check of the columns
# and values that were loaded.
training_dataset.head()

In [None]:
# Scale the label (the label is the output the model predicts) so that it is
# expressed in thousands.
# Scaling helps keep loss values in a friendlier range.
# Scaling a label is usually not essential.
# Scaling features in a multi-feature model usually is essential.
training_dataset["median_house_value"] /= 1000.0

### Examining dataset is important!

In [None]:
# Summary statistics per column; compare each max against the quartiles to
# spot suspicious values.
training_dataset.describe()

The maximum values of total_rooms and total_bedrooms seem too high compared with the rest of their distributions. Are they outliers?

In [None]:
# Remove the row holding the largest total_rooms value.
# DataFrame.drop returns a new frame instead of modifying the original, so the
# result must be assigned back for the row to actually leave the dataset.
# reset_index keeps row labels equal to row positions, which later cells rely
# on when they look rows up by number.
# Note: every re-run of this cell removes the then-largest row again.
training_dataset = training_dataset.drop(
    training_dataset["total_rooms"].idxmax()).reset_index(drop=True)

In [None]:
#@title Define the functions that build and train a model
def build_model(my_learning_rate):
  """Create and compile a simple linear regression model.

  Args:
    my_learning_rate: Learning rate handed to the RMSprop optimizer.

  Returns:
    A compiled tf.keras Sequential model made of one Dense layer with a
    single unit and a single input value.
  """
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Describe the topology of the model: a simple linear regression model
  # is a single node in a single layer.
  model.add(tf.keras.layers.Dense(units=1,
                                  input_shape=(1,)))

  # Compile the model so TensorFlow can execute it efficiently, and configure
  # training to minimize the mean squared error.
  # `learning_rate` is the supported keyword; the older `lr` alias is
  # deprecated and is not accepted by recent Keras optimizers.
  model.compile(
      optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
      loss="mean_squared_error",
      metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model

def train_model(model, df, feature, label, epochs, batch_size):
  """Fit the model on two columns of a DataFrame and report what was learned.

  `df[feature]` is used as the model input and `df[label]` as the target.

  Returns:
    A 4-tuple: the first entry of `model.get_weights()` (the weight), the
    second entry (the bias), the list of epochs recorded by the fit history,
    and the per-epoch "root_mean_squared_error" column of that history.
  """
  inputs = df[feature]
  targets = df[label]
  fit_history = model.fit(x=inputs,
                          y=targets,
                          batch_size=batch_size,
                          epochs=epochs)

  # Weight first, bias second.
  learned = model.get_weights()

  # The epoch list lives on the history object itself; the per-epoch metric
  # values are tabulated so the RMSE column can be picked out by name.
  metrics_by_epoch = pd.DataFrame(fit_history.history)

  return (learned[0],
          learned[1],
          fit_history.epoch,
          metrics_by_epoch["root_mean_squared_error"])

# Report that the definitions above ran; the message names the functions as
# they are actually defined (build_model and train_model).
print("Defined the build_model and train_model functions.")

In [None]:
#@title Define the plotting functions
def plot_the_model(trained_weight, trained_bias, feature, label, training_df):
  """Plot the trained model against up to 200 random training examples.

  Args:
    trained_weight: Learned weight, as a scalar or a single-element array.
    trained_bias: Learned bias, as a scalar or a single-element array.
    feature: Name of the feature column (also used as the x-axis label).
    label: Name of the label column (also used as the y-axis label).
    training_df: DataFrame containing both columns.

  Raises:
    ValueError: If the weight or bias holds more than one element.
  """
  # Label the axes.
  plt.xlabel(feature)
  plt.ylabel(label)

  # Create a scatter plot from 200 random points of the dataset, or from all
  # of them when the dataset is smaller (sample() rejects n > number of rows).
  sample_size = min(200, len(training_df))
  random_examples = training_df.sample(n=sample_size)
  plt.scatter(random_examples[feature], random_examples[label])

  # model.get_weights() hands back arrays rather than plain numbers. Reduce
  # them to floats so both endpoints of the line are ordinary scalars.
  weight = np.asarray(trained_weight).item()
  bias = np.asarray(trained_bias).item()

  # Create a red line representing the model. The red line starts
  # at coordinates (x0, y0) and ends at coordinates (x1, y1).
  x0 = 0
  y0 = bias
  x1 = random_examples[feature].max()
  y1 = bias + (weight * x1)
  plt.plot([x0, x1], [y0, y1], c='r')

  # Render the scatter plot and the red line.
  plt.show()


def plot_the_loss_curve(epochs, rmse):
  """Draw root mean squared error against epoch on a new figure."""
  # Vertical range: slightly below the smallest error up to the largest one.
  lowest = rmse.min() * 0.97
  highest = rmse.max()

  plt.figure()
  plt.plot(epochs, rmse, label="Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")
  plt.legend()
  plt.ylim([lowest, highest])
  plt.show()

# Report that the plotting function definitions have been executed.
print("Defined the plot_the_model and plot_the_loss_curve functions.")

In [None]:
# The following variables are the hyperparameters.
learning_rate = 0.01
epochs = 20
batch_size = 30

# Specify the feature and the label.
my_feature = "total_rooms"  # the total number of rooms on a specific city block.
my_label = "median_house_value"  # the median value of a house on a specific city block.
# That is, you're going to create a model that predicts house value based
# solely on total_rooms.

# Discard any pre-existing version of the model.
my_model = None

# Invoke the functions.
# Note: `epochs` is rebound here to the epoch list returned by train_model.
my_model = build_model(learning_rate)
weight, bias, epochs, rmse = train_model(my_model, training_dataset,
                                         my_feature, my_label,
                                         epochs, batch_size)

# The weight and bias come from model.get_weights() and are arrays. Extract
# the single number explicitly: formatting a non-0-d array with %f relies on
# an implicit array-to-scalar conversion that NumPy has deprecated.
print("\nThe learned weight for your model is %.4f" % np.asarray(weight).item())
print("The learned bias for your model is %.4f\n" % np.asarray(bias).item())

plot_the_model(weight, bias, my_feature, my_label, training_dataset)
plot_the_loss_curve(epochs, rmse)

Here we trained the model to find a correlation between total_rooms and median_house_value. It gives poor results.

Let's see how this trained model does in the "real world".
Note: here the "real world" is just a sample taken from the same dataset.

In [None]:
def predict_house_values(n, feature, label, training_df, model=None,
                         start=10000):
  """Print model predictions next to the true labels for a run of examples.

  Args:
    n: Number of consecutive examples to show.
    feature: Name of the feature column fed to the model.
    label: Name of the label column the predictions are compared with.
    training_df: DataFrame containing both columns.
    model: Object exposing `predict_on_batch`. When omitted, the
      notebook-level `my_model` is used, as before.
    start: Zero-based position of the first example to show.

  Fewer than `n` rows are printed when the DataFrame ends before
  `start + n`.
  """
  if model is None:
    model = my_model

  # Select rows by position once, so the rows fed to the model are exactly
  # the rows printed, whatever the DataFrame's index looks like.
  rows = training_df.iloc[start:start + n]
  batch = rows[feature]
  predicted_values = model.predict_on_batch(x=batch)

  print("feature   label          predicted      loss")
  print("  value   value          value          (sort of...)")
  print("          in thousand$   in thousand$   ")
  print("--------------------------------------------")
  for feature_value, label_value, prediction in zip(batch, rows[label],
                                                    predicted_values):
    predicted = prediction[0]
    # "Loss (sort of...)": squared error relative to the true label.
    relative_squared_error = (label_value - predicted) ** 2 / label_value
    print("%5.0f %6.0f %15.0f %15.0f" % (feature_value,
                                         label_value,
                                         predicted,
                                         relative_squared_error))

In [None]:
# Print predictions for 10 examples, using the feature and label selected in
# the training cell.
predict_house_values(10, my_feature, my_label,training_dataset)

Loss can be important...
Let's change the feature to train the model on...
...
..
A few tries show that none of the available features predicts the price accurately.

Let's try something else then... 
A SYNTHETIC FEATURE
...


In [None]:
# Synthetic feature: rooms per person. It is stored under its own column name
# so that the dataset's real housing_median_age column is not overwritten.
training_dataset["rooms_per_person"] = training_dataset["total_rooms"] / training_dataset["population"]

my_feature = "rooms_per_person"

# Experiment with the hyperparameters.
learning_rate = 0.06
epochs = 24
batch_size = 30

# Don't change anything below this line.
my_model = build_model(learning_rate)
weight, bias, epochs, rmse = train_model(my_model, training_dataset,
                                         my_feature, my_label,
                                         epochs, batch_size)
plot_the_model(weight, bias, my_feature, my_label, training_dataset)
plot_the_loss_curve(epochs, rmse)

predict_house_values(15, my_feature, my_label, training_dataset)

It's still not that great ...
So let's use a correlation matrix to find the feature that is most correlated with the label.

In [None]:
# Pairwise correlation between the columns; read the median_house_value
# row/column to see which feature tracks the label most closely.
training_dataset.corr()

In [None]:
# Train on median income this time.
my_feature = "median_income"

# Experiment with the hyperparameters.
learning_rate = 1
epochs = 12
batch_size = 30

# Don't change anything below this line.
# Note: `epochs` is rebound here to the epoch list returned by train_model.
my_model = build_model(learning_rate)
weight, bias, epochs, rmse = train_model(
    model=my_model,
    df=training_dataset,
    feature=my_feature,
    label=my_label,
    epochs=epochs,
    batch_size=batch_size)
plot_the_model(weight, bias, my_feature, my_label, training_dataset)
plot_the_loss_curve(epochs, rmse)

predict_house_values(15, my_feature, my_label, training_dataset)