<a href="https://colab.research.google.com/github/jdhaecker/Training/blob/master/Wage_per_hour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Clone the entire repo.
# Clones cagBRT/Machine-Learning into ./cloned-repo, makes that directory the
# working directory, and lists its contents. A later cell reads
# hourly_wages.csv by relative path, presumably from this directory.
# NOTE(review): `-l -s` are git's local-clone options; they presumably have no
# effect for an https URL -- confirm before removing.
!git clone -l -s https://github.com/cagBRT/Machine-Learning.git cloned-repo
%cd cloned-repo
!ls

In [0]:
# Use seaborn for pairplot
!pip install seaborn

# **Can a person's hourly wage be predicted from a set of features?**

This model is a small neural network trained for regression:<br>
>Multiple inputs<br>
>One float output 

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Select the TensorFlow 2.x runtime
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

In [0]:
# Read the wage data into a DataFrame using pandas.
dataset = pd.read_csv("hourly_wages.csv")
# Check the data has been read in properly by displaying the first rows.
dataset.head()


wage_per_hour --> the label<br>
all other columns --> features<br>

In [0]:
# Column labels of the loaded frame (the label plus every feature).
dataset.keys()

In [0]:
!cat hourly_wages.csv


In [0]:
# Count missing values per column. `sum` has to be called; without the
# parentheses the cell only displays the bound method, not the counts.
dataset.isna().sum()


In [0]:
# Frequency of each distinct value in the `female` column.
female_counts = dataset["female"].value_counts()
female_counts

In [0]:
# Frequency of each distinct value in the `age` column.
age_counts = dataset["age"].value_counts()
age_counts

In [0]:
# Preview the first five rows. `head` must be called; the bare attribute is
# just the bound method.
dataset.head()

In [0]:
# Pairwise column correlations, drawn as a heatmap with the column names used
# as tick labels on both axes.
corr = dataset.corr()
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)
plt.show()


# **Split the dataset into training and test sets**

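The fraction of rows held out for testing is a hyperparameter that can be adjusted. <br>
Typical test fractions range from 0.05 to 0.5; the cell below samples `frac=0.8` of the rows for training, i.e. a test fraction of 0.2.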

In [0]:
# Sample 80% of the rows for training with a fixed seed; every row whose index
# was not sampled becomes the test set.
train_dataset = dataset.sample(random_state=0, frac=0.8)
test_dataset = dataset.drop(index=train_dataset.index)
print("done")

# **Separate the label from the features**

In [0]:
# Summary statistics of the training split with the label column removed,
# transposed so that each row describes one feature.
train_stats = train_dataset.describe().drop(columns="wage_per_hour").T
train_stats

In [0]:
# Same summary for the test split: label column removed, one row per feature.
test_stats = test_dataset.describe().drop(columns="wage_per_hour").T
test_stats

In [0]:
# Remove the target column from both splits (in place) and keep it as labels.
train_labels, test_labels = (
    train_dataset.pop('wage_per_hour'),
    test_dataset.pop('wage_per_hour'),
)
print("done")

# **Normalize the data**

In [0]:
def norm(x):
  """Standardize feature columns using the training-set statistics.

  Args:
    x: DataFrame whose columns match the row labels of the global
      `train_stats` table.

  Returns:
    `(x - mean) / std`, computed column-wise with the 'mean' and 'std'
    columns of `train_stats`.
  """
  mean = train_stats['mean']
  # A feature that is constant in the training sample has std == 0; dividing
  # by it would yield NaN/inf and poison training. Scale such columns by 1 so
  # they are only centered.
  std = train_stats['std'].replace(0, 1)
  return (x - mean) / std
# Standardize both splits with norm(), which uses the statistics in train_stats.
normed_train_data = train_dataset.pipe(norm)
normed_test_data = test_dataset.pipe(norm)
print("done")

# **Create the model**
Tune the hyperparameters to improve the model performance. <br>
1. Number of nodes in each layer
2. Number and kinds of layers
3. Activation functions
4. The learning rate in the optimizer (.0001 - .1)


In [0]:
# Number of columns left in the training frame, reported as the model's input count.
inputs = len(train_dataset.columns)
print("number of inputs to the model = " + str(inputs))

def build_model():
  """Build and compile the regression network.

  The network has two hidden Dense layers of 8 ReLU units followed by a single
  Dense output unit. The input width is read from the global `train_dataset`
  at call time.

  Returns:
    A compiled `keras.Sequential` model using RMSprop with learning rate
    0.001, mean-squared-error loss, and mean-absolute-error /
    mean-squared-error metrics.
  """
  n_features = len(train_dataset.keys())
  model = keras.Sequential([
    layers.Dense(8, activation=tf.nn.relu, input_shape=[n_features]),
    # Tuning ideas: uncomment to add dropout or another hidden layer.
    #layers.Dropout(0.2),
    #layers.Dense(8, activation=tf.nn.relu),
    layers.Dense(8, activation=tf.nn.relu),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss='mean_squared_error',
                optimizer=optimizer,
                metrics=['mean_absolute_error', 'mean_squared_error'])
  return model

In [0]:
# Build and compile a fresh model instance via build_model().
model = build_model()
print("done")

# **Train the Model**
Modify this hyperparameter<br>
1. Number of epochs

In [0]:
# Train a freshly built model. Per-epoch console output is silenced
# (verbose=0); the object returned by fit() is kept in `history`.
model = build_model()
EPOCHS = 1000

# Stop once val_loss has gone `patience` epochs without improving.
early_stop = keras.callbacks.EarlyStopping(patience=5, monitor='val_loss')

history = model.fit(
    x=normed_train_data,
    y=train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=0,
)


In [0]:
def plot_history(history):
  """Plot training and validation mean absolute error against the epoch.

  Args:
    history: object exposing a `history` mapping with the keys
      'mean_absolute_error' and 'val_mean_absolute_error', and an `epoch`
      sequence of matching length (as returned by `model.fit`).
  """
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  plt.figure()
  plt.xlabel('Epoch')
  # The label is wage_per_hour; the previous "[MPG]" unit was a leftover from
  # a different dataset.
  plt.ylabel('Mean Abs Error [wage_per_hour]')
  plt.plot(hist['epoch'], hist['mean_absolute_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_absolute_error'],
           label='Val Error')
  # Fixed y-range; errors above 5 are clipped from view.
  plt.ylim([0, 5])
  plt.legend()

  plt.show()

plot_history(history)

In [0]:
# Score the model on the held-out split; the result is unpacked as the loss
# followed by the two metrics passed to compile() in build_model().
loss, mae, mse = model.evaluate(
    x=normed_test_data,
    y=test_labels,
    verbose=1,
)

print("Testing set Mean Abs Error: ${:5.2f} wage_per_hour".format(mae))

In [0]:
# Predicted vs. true wage on the test split, with a y = x reference line.
test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(x=test_labels, y=test_predictions)
plt.xlabel('True Values [wage]')
plt.ylabel('Predictions [wage]')
plt.axis('equal')
plt.axis('square')
# Start both axes at zero while keeping their current upper limits.
plt.xlim(0, plt.xlim()[1])
plt.ylim(0, plt.ylim()[1])
_ = plt.plot([-100, 100], [-100, 100])
plt.show()

In [0]:
# Histogram of the signed errors (prediction minus true label) on the test split.
error = test_predictions - test_labels
plt.hist(x=error, bins=25)
plt.xlabel("Prediction Error [wage]")
_ = plt.ylabel("Count")
plt.show()