In [1]:
import os
import datetime
import tensorflow as tf
import numpy as np
import pandas as pd
import io
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
file_path = '/home/visionlab/Jon/Deep Learning Class/Eluvio/'

In [2]:
# Build the sequences from the df input
def build_sequences(df, pre=True, show_word_indices=False):
    # Get the sentences from the dataframe
    sentences = df['title']
 
    # Tokenize the data
    tokenizer = Tokenizer(num_words=1000, lower= 1, oov_token="<OOV>")
    tokenizer.fit_on_texts(sentences)
 
    # Show the word indices for debugging
    if show_word_indices:
        word_index = tokenizer.word_index
        print(word_index)
 
    # Convert the sentences to seqences
    sequences = tokenizer.texts_to_sequences(sentences)
 
    # Pad the sequences so they are all the same length
    if pre:
        padded_data = pad_sequences(sequences, padding='pre')
    else:
        padded_data = pad_sequences(sequences, padding='post')
 
    # Get useful information about each sequence
    num_sentences = len(sentences)
    sentence_len = len(padded_data[0])
 
    # Reshape for input
    padded_data = np.array([padded_data]).reshape(num_sentences, sentence_len, 1)
    return padded_data, sentence_len
 

In [3]:
# Build the output directories for tensorboard
def build_directories():
    # Get the current directory and time
    cwd = os.getcwd()
    model_current_time = datetime.datetime.now().strftime("%b%d%H-%M-%S")
 
    # Create the log directory if it doesn't exist
    log_dir = os.path.join(cwd, 'logs')
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
 
    # Create the individual log directory if it doesn't exist
    model_count = 0
    original_path = log_dir
    log_dir = os.path.join(original_path, f'model_{model_count}')
 
    while os.path.exists(log_dir):
        model_count += 1
        log_dir = os.path.join(original_path, f'model_{model_count}')
 
    return log_dir

In [4]:
# Build the dataset and labels
def build_data(in_df):
    data, sentence_len = build_sequences(in_df)
    labels = np.array(in_df['up_votes'].astype('uint8'))
    return data, labels, sentence_len

In [5]:
# Create the model
def build_model(sentence_len):
    act = 'relu'
    model = keras.models.Sequential()
    model.add(layers.LSTM(500, activation=act, return_sequences=True, input_shape=((sentence_len,1))))
    model.add(layers.Dense(100, activation=act))
    model.add(layers.Dense(50, activation=act))
    model.add(layers.Dense(1, activation='linear'))
    model.summary()
    return model

In [6]:
def run_model(path):
 
    with open(path) as f:
        df = pd.read_csv(f)
        df.head()
        padded_data, labels, sentence_len = build_data(df)
 
        model = build_model(sentence_len)
 
        # Model Parameters
        num_epochs = 50
        verbose = 1
        batch_size = 320
        
        # Tensorboard callback
        log_dir = build_directories()
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
        
        # Best model callback
        filepath = os.join(file_path, 'my_best_model.epoch{epoch:02d}-loss{mean_squared_error:.2f}.hdf5')
        checkpoint = ModelCheckpoint(filepath=filepath, monitor='mean_squared_error', verbose=1, save_best_only=True, mode=min) 
        
        # Define callbacks
        callbacks = [tensorboard_callback, checkpoint]
 
        # Compile the model using a mean squared error loss
        model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy', 'mean_squared_error', 'mean_absolute_error'])
        
        # Fit the model
        model.fit(padded_data, labels, batch_size=batch_size, epochs=num_epochs, verbose=verbose, validation_split=0.2, callbacks=callbacks)
 

In [None]:
filename = "Eluvio_DS_Challenge.csv"
run_model(os.join(file_path,filename))

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 63, 500)           1004000   
_________________________________________________________________
dense (Dense)                (None, 63, 100)           50100     
_________________________________________________________________
dense_1 (Dense)              (None, 63, 50)            5050      
_________________________________________________________________
dense_2 (Dense)              (None, 63, 1)             51        
Total params: 1,059,201
Trainable params: 1,059,201
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 407388 samples, validate on 101848 samples
Instructions for updating:
Use tf.cast instead.


In [80]:
#Predict on data set
with open(os.join(file_path,filename)) as f:
        df = pd.read_csv(f)
        padded_data, labels, sentence_len = build_data(df)

filepath = os.join(file_path, 'my_best_model.epoch13-loss1978.35.hdf5')

# Load the model
model = keras.models.load_model(filepath, compile = True)

predictions = model.predict(padded_data)
print(predictions)

[[[14.259061 ]
  [14.566287 ]
  [14.982493 ]
  ...
  [13.454431 ]
  [13.235612 ]
  [12.4011545]]

 [[14.259061 ]
  [14.566287 ]
  [14.982493 ]
  ...
  [14.240964 ]
  [10.051866 ]
  [14.211129 ]]

 [[14.259061 ]
  [14.566287 ]
  [14.982493 ]
  ...
  [14.352952 ]
  [12.748651 ]
  [12.126031 ]]

 ...

 [[14.259061 ]
  [14.566287 ]
  [14.982493 ]
  ...
  [15.425714 ]
  [13.963937 ]
  [12.157189 ]]

 [[14.259061 ]
  [14.566287 ]
  [14.982493 ]
  ...
  [14.864923 ]
  [14.934037 ]
  [14.657594 ]]

 [[14.259061 ]
  [14.566287 ]
  [14.982493 ]
  ...
  [14.437989 ]
  [14.583459 ]
  [14.124749 ]]]


In [3]:
for j in range(len(predictions)):
    print(max(predictions[j]))

NameError: name 'predictions' is not defined

In [None]:
#average upvotes = 112.2363
#max upvotes
#random guessing
#std deviation