In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from src.modelling.model import gen_vocab, generate_text_sequences
from src.datapipeline.datapipeline import Datapipeline
from sklearn.metrics import accuracy_score

%load_ext autoreload
%autoreload 2

In [3]:
def initialize(data_path, model_path):
    """Prepare the data splits, the saved model and the vocabulary objects.

    Args:
        data_path: Location handed to ``Datapipeline``.
        model_path: Location of the saved Keras model handed to
            ``load_model``.

    Returns:
        A 5-tuple ``(train, val, model, vocab, vectorizer)``: the two
        objects produced by ``Datapipeline.split_data()``, the loaded
        model, and the pair returned by ``gen_vocab`` for
        ``train.values``.
    """
    # The pipeline must be transformed before it can be split.
    pipeline = Datapipeline(data_path)
    pipeline.transform()
    train_split, val_split = pipeline.split_data()

    trained_model = load_model(model_path)

    # Vocabulary and vectorizer are derived from the training split only.
    vocabulary, text_vectorizer = gen_vocab(train_split.values)

    return train_split, val_split, trained_model, vocabulary, text_vectorizer

def generate_next_word(phrase, vectorizer, model, vocab, seq_len=5):
    """Predict the single most likely word to follow ``phrase``.

    Args:
        phrase: The context, either one string or a list/tuple of word
            tokens (tokens are joined with single spaces before being
            vectorised).
        vectorizer: Callable layer mapping a batch of strings to rows of
            token ids (presumably the vectorizer returned by
            ``gen_vocab`` -- confirm).
        model: Object whose ``predict`` method scores the vectorised
            context.
        vocab: Sequence indexed by the arg-max of the model output.
        seq_len: Number of leading token ids of the vectorised phrase that
            are passed to the model. Defaults to 5, the value that used to
            be hard-coded.

    Returns:
        ``vocab[i]`` where ``i`` is the arg-max of ``model.predict(...)``.

    Raises:
        TypeError: If ``phrase`` is not a string, list or tuple. Previously
            this case surfaced as an ``UnboundLocalError``.
    """
    if isinstance(phrase, (list, tuple)):
        text = ' '.join(phrase)
    elif isinstance(phrase, str):
        text = phrase
    else:
        raise TypeError(
            'phrase must be a str or a list of tokens, got {}'.format(
                type(phrase).__name__
            )
        )

    # Call the layer directly: `Layer.apply` is deprecated and TensorFlow
    # warns "Please use `layer.__call__` method instead.".
    token_ids = vectorizer([text])

    # Keep only the first `seq_len` token ids of the (single) row.
    model_input = tf.gather(token_ids, list(range(seq_len)), axis=1)

    scores = model.predict(model_input)
    # np.argmax without an axis flattens; the batch holds exactly one row.
    return vocab[np.argmax(scores)]

# Example: generate_next_word('the trade deficit rose to', vectorizer, model, vocab)

def generate_tweet(phrase, max_char=140, vectorizer=None, model=None, vocab=None):
    """Extend a seed phrase one predicted word at a time into a tweet.

    Generation stops when ``generate_next_word`` returns ``''``, when the
    next word would push the text past ``max_char`` characters, or when a
    randomly drawn word limit (an integer from 15 to 24) is reached.
    A predicted ``'a'`` directly after the tokens ``'u', 's'`` is merged
    with them into the single token ``'usa'``.

    Args:
        phrase: Seed text, either a list of word tokens or a string (split
            on whitespace). The caller's object is never modified.
        max_char: Upper bound on the length of the generated text. A seed
            that is already longer is returned unchanged.
        vectorizer, model, vocab: Objects forwarded to
            ``generate_next_word``. Any left as ``None`` is looked up
            among the notebook-level globals of the same name (the ones
            assigned from ``initialize``), which is what this function
            always used before these parameters existed.

    Returns:
        The seed and generated words joined by single spaces; ``''`` for
        an empty seed.

    Raises:
        NameError: If one of ``vectorizer``/``model``/``vocab`` is neither
            passed in nor defined at notebook level.
    """
    # Work on a private list so the caller's seed is left untouched.
    words = phrase.split() if isinstance(phrase, str) else list(phrase)

    # Nothing to extend: empty seed, or a seed already ending in the ''
    # token that also terminates generation.
    if not words or words[-1] == '':
        return ' '.join(words)

    notebook_scope = globals()
    try:
        if vectorizer is None:
            vectorizer = notebook_scope['vectorizer']
        if model is None:
            model = notebook_scope['model']
        if vocab is None:
            vocab = notebook_scope['vocab']
    except KeyError as missing:
        raise NameError(
            '{} was not passed to generate_tweet and is not defined at '
            'notebook level; run initialize() first'.format(missing)
        ) from None

    # Random cap on the number of words in the tweet.
    max_words = np.random.randint(15, 25)

    while len(words) < max_words:
        # Only the last five tokens are given to the model as context.
        next_word = generate_next_word(words[-5:], vectorizer, model, vocab)
        if next_word == '':
            break

        if next_word == 'a' and words[-2:] == ['u', 's']:
            # Collapse the tokens 'u', 's', 'a' into one word.
            candidate = words[:-2] + ['usa']
        else:
            candidate = words + [next_word]

        # Check the limit before committing the word so the result never
        # overshoots max_char.
        if len(' '.join(candidate)) > max_char:
            break
        words = candidate

    return ' '.join(words)

In [4]:
# Bidir -- evaluates the model saved as 'trump_bot_bidirstack.h5'
# (presumably the stacked bidirectional variant, going by the file name).
train, val, model, vocab, vectorizer = initialize('./data/realdonaldtrump.csv', 'trump_bot_bidirstack.h5')

# Build evaluation pairs from the first 100 validation entries; the 5 is
# presumably the context length in tokens -- confirm in src.modelling.model.
X_val_line, y_val_line = generate_text_sequences(val.values[:100], 5, vocab)

# Join each context into a one-string row, vectorise, and keep the first
# five token ids of every row. The layer is called directly because
# `vectorizer.apply(...)` triggers TensorFlow's deprecation notice
# ("Please use `layer.__call__` method instead.").
val_emb = tf.gather(
    vectorizer([[' '.join(line)] for line in X_val_line]),
    [0, 1, 2, 3, 4],
    axis=1,
)

# Predicted class id per row = arg-max of the model output.
result = np.argmax(model.predict(val_emb), axis=1)

# Reference id per row = first token id produced for each target,
# flattened to 1-D.
true = np.array(tf.gather(vectorizer(y_val_line), [0], axis=1)).reshape(-1)

# Kept as the last expression so the notebook displays the accuracy.
accuracy_score(true, result)

Instructions for updating:
Please use `layer.__call__` method instead.


0.1523076923076923

In [5]:
# Bidir -- evaluates the model saved as 'trump_bot_singlebidir.h5'
# (presumably the single-layer bidirectional variant, going by the file
# name; the previous cell loads 'trump_bot_bidirstack.h5').
train, val, model, vocab, vectorizer = initialize('./data/realdonaldtrump.csv', 'trump_bot_singlebidir.h5')

# Build evaluation pairs from the first 100 validation entries; the 5 is
# presumably the context length in tokens -- confirm in src.modelling.model.
X_val_line, y_val_line = generate_text_sequences(val.values[:100], 5, vocab)

# Join each context into a one-string row, vectorise, and keep the first
# five token ids of every row. The layer is called directly because
# `vectorizer.apply(...)` triggers TensorFlow's deprecation notice
# ("Please use `layer.__call__` method instead.").
val_emb = tf.gather(
    vectorizer([[' '.join(line)] for line in X_val_line]),
    [0, 1, 2, 3, 4],
    axis=1,
)

# Predicted class id per row = arg-max of the model output.
result = np.argmax(model.predict(val_emb), axis=1)

# Reference id per row = first token id produced for each target,
# flattened to 1-D.
true = np.array(tf.gather(vectorizer(y_val_line), [0], axis=1)).reshape(-1)

# Kept as the last expression so the notebook displays the accuracy.
accuracy_score(true, result)

0.14717948717948717

In [7]:
# Single LSTM -- evaluates the model saved as 'trump_bot.h5'.
train, val, model, vocab, vectorizer = initialize('./data/realdonaldtrump.csv', 'trump_bot.h5')

# Build evaluation pairs from the first 100 validation entries; the 5 is
# presumably the context length in tokens -- confirm in src.modelling.model.
X_val_line, y_val_line = generate_text_sequences(val.values[:100], 5, vocab)

# Join each context into a one-string row, vectorise, and keep the first
# five token ids of every row. The layer is called directly because
# `vectorizer.apply(...)` triggers TensorFlow's deprecation notice
# ("Please use `layer.__call__` method instead.").
val_emb = tf.gather(
    vectorizer([[' '.join(line)] for line in X_val_line]),
    [0, 1, 2, 3, 4],
    axis=1,
)

# Predicted class id per row = arg-max of the model output.
result = np.argmax(model.predict(val_emb), axis=1)

# Reference id per row = first token id produced for each target,
# flattened to 1-D.
true = np.array(tf.gather(vectorizer(y_val_line), [0], axis=1)).reshape(-1)

# Kept as the last expression so the notebook displays the accuracy.
accuracy_score(true, result)

0.14256410256410257