<a href="https://colab.research.google.com/github/henouji/hnj/blob/master/HNJ_Tweets_AI_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports 

In [0]:
import tweepy as tp
import tensorflow as tf 
import numpy as np
import os 

# Switch TensorFlow to eager mode so tensors can be inspected with .numpy()
# in the cells below.
# NOTE(review): this is the TF 1.x spelling; presumably the notebook targets
# TensorFlow 1.x — confirm before upgrading the runtime.
tf.enable_eager_execution()

# File Upload

In [0]:
from google.colab import files
# Prompt for a file upload; the result is indexed by file name in the next
# cell, which expects an entry called 'tweets.txt'.
uploaded = files.upload()

# Data preparation

In [0]:
# Decode the uploaded contents of 'tweets.txt' as UTF-8 to get the training
# corpus as a single string. A KeyError here means no file with exactly that
# name was uploaded.
text = uploaded['tweets.txt'].decode('utf-8')

In [0]:
# The raw corpus doubles as the source of the character vocabulary.
vocabfile = text

# Every distinct character in the corpus, in sorted order.
vocab = sorted(set(vocabfile))

# Lookup tables in both directions: index -> character and character -> index.
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Vocabulary size (only displayed when it is the last expression of a cell).
len(vocab)

# Encode the whole corpus as an array of vocabulary indices.
text_to_int = np.array([char2idx[ch] for ch in text])

# Parse to train per sequence
sequence_length = 30
# Each training example consumes sequence_length + 1 characters (the input
# plus the one-step-shifted target), so that is the divisor for the number of
# examples in one pass over the corpus.
examples_per_epoch = len(text) // (sequence_length + 1)

# Slice dataset as a tensor: one element per encoded character
char_dataset = tf.data.Dataset.from_tensor_slices(text_to_int)

# Cut into chunks of sequence_length + 1 characters, dropping the short tail
sequences = char_dataset.batch(sequence_length + 1, drop_remainder=True)
# Preview the first five chunks decoded back to text
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

# Function to split dataset {FOR TRAINING}: input and next target character
def split_input_target(chunk):
    """Turn one chunk into an (input, target) training pair.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so each target element is the one that
    follows the corresponding input element in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Example Training
# Show the first pair as text, then walk its first four positions to show
# which character is expected as output for each input character.
for iExample, tExample in dataset.take(1):
    print('Input Data: ', repr(''.join(idx2char[iExample.numpy()])))
    print('Target Data: ', repr(''.join(idx2char[tExample.numpy()])))
    for i, (input_idx, target_idx) in enumerate(zip(iExample[:4], tExample[:4])):
        print("Step {:4d}".format(i))
        print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
        print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))


# Model Building 

In [0]:
# Number of sequences per training batch; build_model also fixes this value
# into the model's input shape.
batch_size = 8
# Batches per epoch, derived from the example count computed during data preparation.
steps_per_epoch = examples_per_epoch//batch_size

# Size of the shuffle buffer
buffer_size = 1000
# NOTE(review): this rebinds `dataset`, so re-running this cell shuffles and
# batches the already-batched dataset again; re-run the data preparation cell first.
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

# Number of distinct characters; sizes the embedding input and the final Dense layer.
vocab_size = len(vocab)
# Width of the embedding vectors (first layer of the model)
embedding_dim = 256 
# Number of units in the recurrent layer
rnn_units = 1024

# Pick the recurrent layer factory: the plain GRU with a sigmoid recurrent
# activation when no GPU is visible, otherwise the cuDNN-backed GRU layer.
if not tf.test.is_gpu_available():
    import functools
    rnn = functools.partial(tf.keras.layers.GRU,
                            recurrent_activation='sigmoid')
else:
    rnn = tf.keras.layers.CuDNNGRU

# Build model for initializing
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the uncompiled character model: Embedding -> RNN -> Dense.

    Args:
        vocab_size: number of distinct characters; input dimension of the
            embedding and output width of the final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the recurrent layer.
        batch_size: fixed batch dimension of the input (the recurrent layer
            is created with stateful=True).

    Returns:
        A tf.keras.Sequential model that emits one vocab_size-wide vector
        per input position.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # `rnn` is the layer factory selected at module level (GPU or CPU variant).
    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True)
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the training model from the hyperparameters configured above.
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
)

# Model Summary 

In [0]:

# Run one batch through the not-yet-trained model to check the output shape
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
# Layer-by-layer summary of the model
model.summary()

# Sample one character index per position from the first sequence's predictions.
# NOTE(review): sampled_indices is not used by any later cell shown here.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

# Model Training

In [0]:
# Calculates Loss
def loss(labels, logits):
    """Sparse categorical cross-entropy of `logits` against `labels`.

    The model output is passed as raw logits (from_logits=True).
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Sanity-check the loss on the example batch taken in the model-summary cell.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction Shape: ", example_batch_predictions.shape,"# (batch_size, sequence_length, vocab_size)")
# Average the loss values to report a single number.
print("Scalar Loss: ", example_batch_loss.numpy().mean())

# Compile model with the Adam optimizer and the loss function defined above
model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the {epoch} placeholder is left for the callback to fill in
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the full model) under checkpoint_prefix during training
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Model Training
# The dataset is repeated indefinitely, so steps_per_epoch decides where each
# epoch ends. The checkpoint callback writes the weights that are reloaded below.
history = model.fit(dataset.repeat(), epochs=50, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])

# Rebuild the same architecture with batch size 1 for text generation and
# restore the most recent checkpoint found in checkpoint_dir.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1,None]))



# Use Model to Predict Tweet

In [0]:
def generate_text(model, start_string, num_generate=100, temperature=1.0):
    """Generate text one character at a time from a trained, stateful model.

    The seed is encoded with the module-level `char2idx` table and fed to the
    model; after that, each sampled character is fed back as the next input
    while the model's recurrent state carries the context forward.

    Args:
        model: character model built with batch size 1 (the batch axis of its
            output is squeezed away) that exposes `reset_states()`.
        start_string: non-empty seed text; every character must be a key of
            `char2idx`.
        num_generate: number of characters to sample after the seed.
        temperature: divisor applied to the logits before sampling; values
            below 1 make sampling more conservative, values above 1 more
            random. Must be positive.

    Returns:
        `start_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not positive.
        KeyError: if `start_string` contains a character missing from `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    unknown = sorted(set(start_string) - set(char2idx))
    if unknown:
        raise KeyError(
            "start_string contains characters outside the training "
            "vocabulary: {!r}".format(unknown))

    # Encode the seed and add the batch axis expected by the model.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # Drop the batch axis and rescale the logits by the temperature.
        predictions = tf.squeeze(predictions, 0) / temperature

        # Sample from the distribution at the last position only.
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in on the next step.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [0]:
# Generate a sample seeded with "Dude"; every seed character is looked up in
# char2idx, so it must occur in the training text.
print(generate_text(model, start_string=u"Dude"))

# Saving Model to download 

In [0]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the next cell builds its save
# path below this mount point.
drive.mount('/content/gdrive')

In [0]:
# Save the generation model as an HDF5 file on the mounted Google Drive so it
# outlives the Colab session and can be downloaded from Drive.
model_save_name = 'twitterRnn.h5'
path = F"/content/gdrive/My Drive/{model_save_name}"
# Write to the Drive path built above rather than the Colab working directory.
model.save(path)

# Loading Model

Upload the saved model architecture (`twitterRNN.json`) and weights (`twitterWeights.h5`).

In [0]:
from google.colab import files
# Prompt for the saved model files. This rebinds `uploaded`, replacing the
# result of the earlier tweets upload.
uploaded = files.upload()

In [0]:
from keras.models import model_from_json
# The upload result holds each file's raw bytes, so decode the JSON
# architecture to text before parsing it.
model = model_from_json(uploaded['twitterRNN.json'].decode('utf-8'))
# load_weights() takes a path to an HDF5 file, not the file's contents.
# files.upload() also writes each upload into the current working directory,
# so the weights file can be opened by name.
model.load_weights('twitterWeights.h5')