In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold

from keras.optimizers import Adam, RMSprop, SGD
from keras.models import Model, Sequential
from keras.metrics import mean_absolute_error, mae, mse
from keras.layers import Dense, Dropout, Input
from keras.regularizers import l2, l1_l2
from keras.wrappers.scikit_learn import KerasRegressor

from datetime import datetime

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Define some useful functions**

In [0]:
def load_data(url, columns=[1, 2, 4, 6, 11]):
  """ Read the raw CSV and return a fully numeric DataFrame.

      Only the columns listed in `columns` are read (the file is parsed
      without a header row). They are renamed to Price, PurchaseDate,
      PropertyType, LeaseDuration and City, the categorical codes are
      swapped for small integers, and every column is cast to a compact
      dtype.

      Args:
        url: anything `pd.read_csv` accepts (path, URL or file-like object).
        columns: positional indices of the five columns to keep.

      Returns:
        The encoded DataFrame.
  """
  # integer codes used for the two categorical columns
  property_codes = {'F':0, 'D':1, 'S':2, 'T':3, 'O':4}
  lease_codes = {'L':0, 'F':1, 'U':2}

  # read only the selected columns
  df = pd.read_csv(url, header=None, usecols=columns)
  print('Original Data shape: ', df.shape)

  # give the columns meaningful names
  df.columns = ['Price', 'PurchaseDate', 'PropertyType', 'LeaseDuration', 'City']

  # replace the categorical codes by their integer values
  df['PropertyType'] = df['PropertyType'].replace(property_codes)
  df['LeaseDuration'] = df['LeaseDuration'].replace(lease_codes)

  # City becomes a binary flag: 0 for LONDON, 1 for everything else
  in_london = df['City'] == 'LONDON'
  df.loc[in_london, 'City'] = 0
  df.loc[~in_london, 'City'] = 1

  # cast to compact dtypes (to save memory)
  df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])
  for name in ('Price', 'PropertyType', 'LeaseDuration', 'City'):
    df[name] = pd.to_numeric(df[name], downcast='integer')

  return df

In [0]:
def split_train_test(df):
  """ Split the data chronologically into training and test datasets.

      Purchases strictly before 1 Jan 2016 form the training (and
      validation) set; purchases on or after that date form the test set.
      The PurchaseDate column is dropped from both outputs and exact
      duplicate rows are removed within each split. Rows whose
      PurchaseDate is missing end up in neither split.

      Args:
        df: DataFrame with Price, PurchaseDate, PropertyType,
            LeaseDuration and City columns.

      Returns:
        (train_df, test_df)
  """
  # purchases prior to 1/1/2016 as training, the cutoff day itself is test
  cutoff = datetime(2016, 1, 1)
  column_sels = ['Price', 'PropertyType', 'LeaseDuration', 'City']
  before_cutoff = df['PurchaseDate'] < cutoff
  from_cutoff = df['PurchaseDate'] >= cutoff

  # select rows and columns in a single .loc call and de-duplicate without
  # mutating a derived frame in place
  train_df = df.loc[before_cutoff, column_sels].drop_duplicates(keep='first')
  test_df = df.loc[from_cutoff, column_sels].drop_duplicates(keep='first')
  print("Train (and val) shape: ", train_df.shape)
  print("Test shape: ", test_df.shape)

  return train_df, test_df

In [0]:
def split_train_val(train_df, perc=0.2):
  """ Randomly split the training data into training and validation sets.

      Args:
        train_df: DataFrame to split.
        perc: fraction of rows assigned to the validation set.

      Returns:
        (train_df, val_df); the split uses a fixed random_state of 2019.
  """
  train_df, val_df = train_test_split(train_df, test_size=perc, random_state=2019)
  print("Train shape : ", train_df.shape)
  # report the validation split produced here, not the notebook-level test_df
  print("Val shape : ", val_df.shape)

  return train_df, val_df

In [0]:
def prep(train_df, val_df, test_df):
  """ Build (inputs, targets) pairs for the training, validation and test sets.

      The three categorical input columns are one-hot encoded with an
      encoder fitted on the training inputs only (categories unseen during
      fitting are ignored at transform time). The Price targets are
      compressed with log1p(price * 1e-3).

      Returns:
        ((train_X, train_y), (val_X, val_y), (test_X, test_y))
  """
  feature_cols = ['PropertyType', 'LeaseDuration', 'City']
  frames = (train_df, val_df, test_df)

  # separate inputs and targets for training, validation and testing
  raw_inputs = [frame[feature_cols] for frame in frames]
  raw_targets = [frame['Price'] for frame in frames]

  # one-hot encode the inputs; the encoder only ever sees the training inputs
  encoder = OneHotEncoder(handle_unknown='ignore')
  encoder.fit(raw_inputs[0])
  encoded_inputs = [encoder.transform(inputs) for inputs in raw_inputs]

  # bring the targets into a smaller range
  scaled_targets = [np.log1p(target * 1e-3) for target in raw_targets]

  return tuple(zip(encoded_inputs, scaled_targets))

In [0]:
def build_model(l_rate=0.01, dout_rate=0.5, reg_rate=0.01, num_units=60, input_size=10):
  """ Build and compile a fully-connected neural network.

      The network has three ReLU Dense layers of `num_units` units, each
      with an L2 kernel regularizer, dropout after the first two of them,
      and a single-unit output layer. It is compiled with RMSprop, MSE loss
      and MAE as the reported metric.

      Args:
        l_rate: learning rate passed to RMSprop.
        dout_rate: dropout rate of both Dropout layers.
        reg_rate: L2 regularization factor of the hidden layers.
        num_units: number of units in each hidden layer.
        input_size: number of input features.

      Returns:
        The compiled Keras model.
  """
  inp = Input(shape=(input_size,))
  fc1 = Dense(num_units, activation='relu', kernel_regularizer=l2(reg_rate))(inp)
  do1 = Dropout(dout_rate)(fc1)
  fc2 = Dense(num_units, activation='relu', kernel_regularizer=l2(reg_rate))(do1)
  do2 = Dropout(dout_rate)(fc2)
  fc3 = Dense(num_units, activation='relu', kernel_regularizer=l2(reg_rate))(do2)
  out = Dense(1)(fc3)

  model = Model(inputs=inp, outputs=out)
  # summary() prints the table itself and returns None, so wrapping it in
  # print() would emit an extra "None" line after every summary
  model.summary()
  optim = RMSprop(lr=l_rate)
  model.compile(optimizer=optim, loss=mse, metrics=[mae])

  return model

In [0]:
def train(model, epochs=10, batch_size=10000, train_data=None, val_data=None):
  """ Train a compiled model and return its per-epoch MAE curves.

      Args:
        model: compiled model exposing a Keras-style `fit` method.
        epochs: number of training epochs.
        batch_size: mini-batch size.
        train_data: optional (inputs, targets) pair to fit on. When omitted,
            the notebook-level `train_X` / `train_y` are used.
        val_data: optional (inputs, targets) pair to validate on. When
            omitted, the notebook-level `val_X` / `val_y` are used.

      Returns:
        (train_mae, val_mae): lists with one entry per epoch.
  """
  # Prefer explicitly supplied data; fall back to the notebook globals only
  # when the caller passes nothing, so existing calls keep working.
  if train_data is None:
    train_data = (train_X, train_y)
  if val_data is None:
    val_data = (val_X, val_y)
  fit_X, fit_y = train_data

  history = model.fit(fit_X, fit_y, batch_size=batch_size, verbose=1,
                      epochs=epochs, validation_data=tuple(val_data))

  train_mae = history.history['mean_absolute_error']
  val_mae = history.history['val_mean_absolute_error']

  return train_mae, val_mae

In [0]:
def train_with_cv(train_df, test_df, epochs=50, batch_size=512, k=5):
  """ Train with k-fold cross-validation.

      `train_df` is cut into k contiguous folds of equal size (any
      remainder rows are never used for validation). For each fold a fresh
      model is built and fitted on the remaining rows, validating on the
      held-out fold.

      Args:
        train_df: DataFrame holding the training rows.
        test_df: test DataFrame; only forwarded to `prep`, its encoded
            output is discarded here.
        epochs: number of epochs per fold.
        batch_size: mini-batch size.
        k: number of folds.

      Returns:
        (train_errs, val_errs): two lists with one per-epoch MAE list per fold.
  """
  fold_size = np.shape(train_df)[0] // k
  train_errs = []
  val_errs = []

  for i in range(k):
    print('Processing fold {0}'.format(i))

    # prepare train and val data for this fold
    start, stop = i * fold_size, (i + 1) * fold_size
    val_data = train_df.iloc[start:stop]
    train_data = pd.concat([train_df.iloc[:start], train_df.iloc[stop:]])
    (fold_train_X, fold_train_y), (fold_val_X, fold_val_y), _ = prep(
        train_data, val_data, test_df)

    # build a new model; the feature count must go to input_size by keyword,
    # the first positional parameter of build_model is the learning rate
    model = build_model(input_size=fold_train_X.shape[1])

    # fit on this fold's own data rather than on notebook-level variables
    history = model.fit(fold_train_X, fold_train_y, batch_size=batch_size,
                        verbose=1, epochs=epochs,
                        validation_data=(fold_val_X, fold_val_y))

    # append train/val errs to error lists
    train_errs.append(history.history['mean_absolute_error'])
    val_errs.append(history.history['val_mean_absolute_error'])

  return train_errs, val_errs

# **Load and pre-process data**


In [12]:
# load the data: read the CSV from the mounted Google Drive, encode it with
# load_data, then split it by purchase date into train(+val) and test frames
url = '/content/drive/My Drive/pp-complete.csv'
df = load_data(url)
train_df, test_df = split_train_test(df)

Original Data shape:  (24852949, 5)
Train (and val) shape:  (317952, 4)
Test shape:  (144006, 4)


In [0]:
# prep() returns (train, val, test) pairs. test_df is passed in the validation
# slot as well, so the second pair unpacked here is the encoded test set and
# the third (identical) pair is discarded.
(train_X, train_y), (test_X, test_y), _ = prep(train_df, test_df, test_df)

# **Use GridSearch to find the best set of hyper-parameters**

In [14]:
# wrap build_model in the scikit-learn regressor interface so GridSearchCV can
# construct and fit a fresh network for every parameter combination
model = KerasRegressor(build_fn=build_model, epochs=10, batch_size=512, verbose=0)

# candidate values; the keys of param_grid below must match build_model's
# parameter names (2 * 2 * 3 * 3 = 36 combinations)
l_rates = [0.001, 0.01]
dout_rates = [0.1, 0.5]
reg_rates = [0.001, 0.005, 0.01]
num_units=[60, 80, 100]

# NOTE(review): KFold is created without shuffle, so the folds are presumably
# contiguous slices of train_X in its current row order -- confirm this is intended
kf = KFold(n_splits=3)

param_grid=dict(l_rate=l_rates,
                dout_rate=dout_rates, 
                reg_rate=reg_rates,
                num_units=num_units)

# scores are negated MSE, so values closer to zero are better
grid = GridSearchCV(estimator=model,
                    param_grid = param_grid,
                    cv=kf, scoring='neg_mean_squared_error')

grid.fit(train_X, train_y)

# summarize results: best combination first, then mean (std) score over the
# folds for every combination
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.









Best: -1.216376 using {'dout_rate': 0.5, 'l_rate': 0.01, 'num_units': 60, 'reg_rate': 0.01}
-1.690654 (0.573621) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 60, 'reg_rate': 0.001}
-1.628682 (0.643845) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 60, 'reg_rate': 0.005}
-1.519647 (0.599978) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 60, 'reg_rate': 0.01}
-1.624866 (0.675319) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 80, 'reg_rate': 0.001}
-1.575711 (0.602384) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 80, 'reg_rate': 0.005}
-1.568136 (0.615687) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 80, 'reg_rate': 0.01}
-1.669899 (0.624518) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 100, 'reg_rate': 0.001}
-1.387787 (0.457219) with: {'dout_rate': 0.1, 'l_rate': 0.001, 'num_units': 100, 'reg_ra

In [21]:
# Implement the best model found by GS
# build_model()'s defaults (l_rate=0.01, dout_rate=0.5, reg_rate=0.01,
# num_units=60) are the best parameters reported by the grid search output
model = build_model()
model.fit(train_X, train_y, epochs=10, batch_size=512, verbose=1)
# evaluate() returns the loss followed by the compiled metric (MAE); the
# targets were transformed with log1p(price * 1e-3) in prep, so test_mae is
# measured on that transformed scale, not directly in pounds
_, test_mae = model.evaluate(test_X, test_y)
print('Test MAE: ', test_mae)

Model: "model_114"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_114 (InputLayer)       (None, 10)                0         
_________________________________________________________________
dense_453 (Dense)            (None, 60)                660       
_________________________________________________________________
dropout_227 (Dropout)        (None, 60)                0         
_________________________________________________________________
dense_454 (Dense)            (None, 60)                3660      
_________________________________________________________________
dropout_228 (Dropout)        (None, 60)                0         
_________________________________________________________________
dense_455 (Dense)            (None, 60)                3660      
_________________________________________________________________
dense_456 (Dense)            (None, 1)                 61

We are still off by £1,691.