In [0]:
# !pip install nvidia-smi
# !pip install google-colab

In [0]:
# Resolve Google credentials for this session.
# NOTE(review): presumably this is google-auth's default-credentials lookup; the
# return value of auth.default() is discarded here — confirm whether the call is
# needed at all or whether the returned credentials should be kept.
from google import auth
auth.default()

# Google Cloud project identifier. Background on creating/managing projects:
# https://cloud.google.com/resource-manager/docs/creating-managing-projects
# NOTE(review): project_id is not read by any other visible cell — confirm it is still needed.
project_id = 'Team18-10605-Final-Project'

In [0]:
# Register the gcsfuse apt repository and its signing key, then install gcsfuse.
# sudo does not carry across a redirection or a pipe, so it is applied to the
# commands that actually need root (tee, apt-key) instead of to echo/curl.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list > /dev/null
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!sudo apt -qq update
!sudo apt -qq install gcsfuse

In [0]:
# Create the local mount point and mount the GCS bucket onto it with gcsfuse.
# -p keeps this cell re-runnable: plain mkdir errors out once the directory exists.
!mkdir -p bucket
!gcsfuse bucket-team18-10605-final bucket

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models, regularizers
from tensorflow.keras.layers import *
import numpy as np
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow_addons.metrics import RSquare

import os
import glob

Define the tf.data.Dataset input pipeline (parsing, normalization, one-hot encoding, outlier filtering)

In [0]:
def filter_outliers(features, label):
  """Decide whether a record is kept (True) or dropped as an outlier (False).

  A record is kept only when both the label and the de-normalized p_score
  (second-to-last feature) fall inside the closed interval [-7.0, 103.0].

  Args:
    features: indexable sequence of already-normalized feature values.
    label: the record's target value.

  Returns:
    False if either value is outside the interval, True otherwise.
  """
  lower, upper = -7.0, 103.0

  # Check the label first; nothing else is evaluated when it is out of range.
  if not (lower <= label and label <= upper):
    return False

  # features[-2] is the standardized p_score: undo the scaling (same constants
  # that normalize() applies) before comparing against the raw-value bounds.
  p_score = 5.390716682669022 + features[-2] * 13.664352804935564
  if not (lower <= p_score and p_score <= upper):
    return False

  return True

def normalize(vals):
    """Standardize the numeric columns of one feature record.

    Three positions (counted from the end) are shifted and scaled with fixed
    constants — presumably the dataset mean and standard deviation of each
    column (TODO confirm):

      * vals[-1]: num_words
      * vals[-2]: p_score
      * vals[-4]: p_time

    The input is NOT modified: the work is done on a copy, so callers may pass
    read-only buffers and can keep using the raw values afterwards.

    Args:
        vals: 1-D array-like of numeric feature values (at least 4 long).

    Returns:
        A new numpy array with the same dtype as ``np.array(vals)`` in which
        the three columns above have been standardized.
    """
    # Copy first: assigning into the caller's array would silently change its
    # data and fails outright on read-only arrays.
    out = np.array(vals, copy=True)
    # num_words
    out[-1] = (out[-1] - 17.13712707395722)/32.30891863874633
    # p_score
    out[-2] = (out[-2] - 5.390716682669022)/13.664352804935564
    # p_time
    out[-4] = (out[-4] - 9761.223722159446)/50008.29178605555
    return out

def one_hot(vals):
    """Replace the leading subreddit id with a 100-wide one-hot vector.

    Args:
        vals: 1-D sequence whose first element is the subreddit id; the id is
            truncated to an int and used as an index into the one-hot vector.

    Returns:
        1-D numpy array: the 100 one-hot slots followed by ``vals[1:]``.
    """
    n_slots = 100
    hot_index = int(vals[0])

    encoding = np.zeros((n_slots,))
    encoding[hot_index] = 1.0

    remaining = vals[1:]
    return np.concatenate((encoding, remaining))
  
def process(arr):
    """Run the per-record preprocessing: normalize first, then one-hot encode.

    Args:
        arr: raw feature values for one record, subreddit id first.

    Returns:
        Whatever ``one_hot`` returns for the normalized record.
    """
    normalized = normalize(arr)
    encoded = one_hot(normalized)
    return encoded

def get_dataset(filepath_pattern):
    """Build an unbatched tf.data pipeline over comma-separated text files.

    Each line is parsed as numbers; the first value is the label and the rest
    are features. Features go through ``process`` and records rejected by
    ``filter_outliers`` are dropped.

    Args:
        filepath_pattern: glob pattern handed to ``tf.data.Dataset.list_files``.

    Returns:
        A dataset of ``(features, label)`` pairs with static shapes
        ``[157]`` and ``[1]``.
    """
    def parse_line(line):
        # Split the CSV line and convert every field to a number.
        fields = tf.strings.to_number(tf.strings.split(line, ","))
        target = fields[0]
        raw_features = fields[1:]

        # process() is plain numpy code, so it is wrapped with numpy_function.
        encoded = tf.numpy_function(process, [raw_features], tf.double)

        # numpy_function loses static shape information; restore it explicitly.
        target = tf.reshape(target, [1])
        encoded = tf.reshape(encoded, [157])
        return (encoded, target)

    file_ds = tf.data.Dataset.list_files(filepath_pattern)
    records = tf.data.TextLineDataset(file_ds).map(parse_line)
    return records.filter(lambda features, label: filter_outliers(features, label))

Load Data

In [0]:
 train_ds = get_dataset("bucket/full_processed_train/part-*").batch(128).cache() 
val_ds = get_dataset("bucket/full_processed_val/part-*").batch(128).cache()  
test_ds = get_dataset("bucket/full_processed_test/part-*").batch(128).cache()

Model Configuration

In [0]:
# Hyperparameters read by the model-definition / training cell.
config = dict(
    n_layers=2,    # number of hidden Dense+ReLU blocks
    h_dim=128,     # units in each hidden Dense layer
    lr=0.0001,     # learning rate passed to Adam
    decay=1e-6,    # decay passed to Adam
    bs=32,         # NOTE(review): not read by any visible cell; datasets are batched with 128
    epochs=30,     # epochs passed to model.fit
)

Model Training with tf.Dataset

In [0]:
# Define the model: config['n_layers'] blocks of Dense(h_dim) -> ReLU, followed
# by a single linear output unit for regression.
model = models.Sequential()
for i in range(config['n_layers']):
    model.add(Dense(config['h_dim']))
    model.add(Activation('relu'))
model.add(Dense(1))
# 157 is the per-record feature width that get_dataset() reshapes to.
model.build((None, 157))
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

# Optimizer and loss
optim = keras.optimizers.Adam(learning_rate=config['lr'], decay=config['decay'])
loss_fn = tf.keras.losses.MeanSquaredError()

# Checkpointing: one full-model file per epoch, named with the epoch number
# and the monitored validation metric.
checkpoint_dir = "model"
monitor = "val_root_mean_squared_error"
# Create the target directory up front so saving never depends on the callback doing it.
os.makedirs(checkpoint_dir, exist_ok=True)
# Only the metric name is substituted here; the {epoch...}/{metric...} fields
# are left in place for ModelCheckpoint to fill in at save time.
checkpoint_path = os.path.join(checkpoint_dir,
                               "model.{epoch:03d}-{%s:.4f}.hdf5" % monitor)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 monitor=monitor,
                                                 save_weights_only=False,
                                                 save_best_only=False,
                                                 verbose=1)

callbacks = [cp_callback]

# Compile and train. Metric order (after the loss) is MAE, R^2, RMSE; the
# evaluation cell indexes the results in this order.
model.compile(optimizer=optim, loss=loss_fn, metrics=['mae', RSquare(), RootMeanSquaredError()])
model.fit(train_ds, epochs=config['epochs'], validation_data=val_ds, callbacks=callbacks)

Model Evaluation

In [0]:
# Evaluate on the held-out test split. `result` is [loss, mae, r2, rmse],
# following the metric order given to model.compile().
result = model.evaluate(test_ds)
report_template = 'Test MAE: {}\n Test r2: {}\n Test RMSE: {}'
print(report_template.format(*result[1:4]))

In [0]:
# Save the model's weights (save_weights, not a full model save) under the
# local `bucket` directory, which an earlier cell mounts with gcsfuse.
model.save_weights("bucket/model.h5")