In [18]:
from tensorflow.keras.models import load_model
from src.modelling.model import twitter_model, gen_vocab, generate_text_sequences
from src.datapipeline.datapipeline import Datapipeline
import numpy as np
import pandas as pd


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the raw tweet CSV through the project's Datapipeline, run its transform
# step, and split the result into training and validation sets.
dpl = Datapipeline(r'./data/raw/realdonaldtrump.csv')
dpl.transform()
train, val = dpl.split_data()

In [3]:
# Create the model wrapper from the GloVe embeddings file and build the network
# from the training values. Running this cell prints the embedding hit/miss
# counts and the Keras model summary shown below.
trump_model = twitter_model(r'./data/raw/glove.6B.100d.txt')
trump_model.build_model(train.values)

Converted 13125 words (1273 misses)
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         1440200   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
dense (Dense)                (None, 14400)             2894400   
Total params: 4,495,400
Trainable params: 3,055,200
Non-trainable params: 1,440,200
_________________________________________________________________


In [4]:
# Prepare the training data inside the wrapper from the raw training values
# (presumably what get_data() hands back afterwards — confirm in src.modelling.model).
trump_model.get_train_data(train.values)

Instructions for updating:
Please use `layer.__call__` method instead.


In [5]:
# Fetch the two arrays held by the wrapper (assumed to be model inputs and
# next-word targets — TODO confirm against twitter_model.get_data).
x, y = trump_model.get_data()

In [18]:
# Retrieve the vocabulary and a second object bound to `vec`
# (presumably the text vectorizer — verify against twitter_model.get_vocab).
vocab, vec = trump_model.get_vocab()

In [6]:
# Load the previously saved Keras model from the current working directory.
loaded_model = load_model("trump_bot.h5")

In [10]:
# Take the first five rows of x as a small smoke-test batch.
x_sample = x[:5]

In [11]:
# Run the loaded model on the smoke-test batch.
preds = loaded_model.predict(x_sample)

In [16]:
# Index of the highest-scoring output column for each sample row
# (presumably a vocabulary index — confirm against the model's output layer).
np.argmax(preds, axis = 1)

array([ 4, 24, 51, 10, 12], dtype=int64)

Jin: Segment 2 - Stacked Bidirectional Model

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from src.modelling.model import gen_vocab, generate_text_sequences
from src.datapipeline.datapipeline import Datapipeline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the tweet CSV through the project's Datapipeline, run its transform
# step, and split the result into training and validation sets.
# NOTE(review): the first section of this notebook reads
# './data/raw/realdonaldtrump.csv' — confirm which location is current.
dpl = Datapipeline(r'./data/realdonaldtrump.csv')
dpl.transform()
train, val = dpl.split_data()

In [5]:
# Load the saved model file for this section from the working directory.
model = load_model('trump_bot_bidirstack.h5')

In [5]:
# Build the vocabulary and its vectorizer from the training values only.
vocab, vectorizer = gen_vocab(train.values)

In [8]:
# Build validation sequences with a window argument of 5; x_val holds 5-token
# phrases (see the x_val output further down). y_val is presumably the word
# that follows each phrase — confirm in generate_text_sequences.
x_val, y_val = generate_text_sequences(val.values, 5, vocab)

In [14]:
# Re-join every token window into a space-separated string, vectorize the whole
# batch at once, and keep only the first five columns of the result.
x_input_val = tf.gather(
    vectorizer.apply([' '.join(tokens) for tokens in x_val]),
    [0,1,2,3,4],
    axis=1,
)

In [15]:
# Inspect the vectorized validation windows (five token ids per row).
x_input_val

<tf.Tensor: shape=(74079, 5), dtype=int64, numpy=
array([[8022,  549,   68,   16,   83],
       [ 549,   68,   16,   83,    9],
       [  68,   16,   83,    9, 5574],
       ...,
       [5574,    4,   13,   16,  280],
       [   4,   13,   16,  280,    6],
       [  13,   16,  280,    6,  620]], dtype=int64)>

In [17]:
# Score every validation window in a single batched predict call.
y_pred_val = model.predict(x_input_val)

In [24]:
# Highest-scoring output column for each validation window, mapped back to the
# corresponding vocabulary entry.
index_label = np.argmax(y_pred_val, axis = 1)

# Comprehension instead of an append loop: same list, and no loop variable is
# left behind in the notebook namespace.
output_word = [vocab[idx] for idx in index_label]

In [57]:
# Shape check: one row per validation window, one column per model output unit.
y_pred_val.shape

(74079, 14400)

In [47]:
# Inspect the token windows produced by generate_text_sequences.
# NOTE(review): this displays the whole list; a slice such as x_val[:20] would
# keep the notebook output small.
x_val

[['defunding', 'police', 'would', 'be', 'good'],
 ['police', 'would', 'be', 'good', 'for'],
 ['would', 'be', 'good', 'for', '<Unknown>'],
 ['the', 'trade', 'deficit', 'rose', 'to'],
 ['trade', 'deficit', 'rose', 'to', 'a'],
 ['deficit', 'rose', 'to', 'a', 'yr'],
 ['rose', 'to', 'a', 'yr', 'high'],
 ['to', 'a', 'yr', 'high', 'thanks'],
 ['a', 'yr', 'high', 'thanks', 'to'],
 ['yr', 'high', 'thanks', 'to', 'horrible'],
 ['high', 'thanks', 'to', 'horrible', 'trade'],
 ['thanks', 'to', 'horrible', 'trade', 'policies'],
 ['to', 'horrible', 'trade', 'policies', 'clinton'],
 ['horrible', 'trade', 'policies', 'clinton', 'supports'],
 ['trade', 'policies', 'clinton', 'supports', 'i'],
 ['policies', 'clinton', 'supports', 'i', 'will'],
 ['clinton', 'supports', 'i', 'will', 'fix'],
 ['supports', 'i', 'will', 'fix', 'it'],
 ['i', 'will', 'fix', 'it', 'fast'],
 ['you', 'must', 'be', 'doing', 'something'],
 ['must', 'be', 'doing', 'something', 'right'],
 ['be', 'doing', 'something', 'right', 'cause']

In [42]:
# Show the single-element list of text that is fed to the vectorizer for the
# first validation window.
[' '.join(x_val[0])]

['defunding police would be good']

# Jin Consolidation

In [5]:
def initialize(data_path, model_path):
    """Load everything needed for text generation in one call.

    Runs the Datapipeline on ``data_path`` (transform, then split), loads the
    saved Keras model at ``model_path``, and builds the vocabulary and its
    vectorizer from the training split's values.

    Returns:
        tuple: ``(train, val, model, vocab, vectorizer)``.
    """
    pipeline = Datapipeline(data_path)
    pipeline.transform()
    train_split, val_split = pipeline.split_data()

    loaded = load_model(model_path)
    vocabulary, text_vectorizer = gen_vocab(train_split.values)

    return train_split, val_split, loaded, vocabulary, text_vectorizer

In [6]:
def tokenize(val, vocab, vectorizer, seq_len=5):
    """Turn the validation split into integer-encoded token windows.

    Args:
        val: validation split; its ``.values`` are handed to
            ``generate_text_sequences``.
        vocab: vocabulary passed through to ``generate_text_sequences``.
        vectorizer: object whose ``apply`` maps a list of strings to a 2-D
            tensor of token ids.
        seq_len: window length. It drives both the sequence generation and the
            number of leading columns kept, so the two can no longer drift
            apart. The default of 5 reproduces the previous hard-coded values.

    Returns:
        The first ``seq_len`` columns of the vectorized windows.
    """
    # Only the input windows are needed here; the targets are discarded.
    windows, _ = generate_text_sequences(val.values, seq_len, vocab)
    window_ids = vectorizer.apply([' '.join(tokens) for tokens in windows])
    return tf.gather(window_ids, list(range(seq_len)), axis=1)

In [7]:
def generate_next_word(phrase, vectorizer, model, vocab, seq_len=5):
    """Predict the word that follows ``phrase``.

    Args:
        phrase: either a list/tuple of tokens (joined with single spaces) or an
            already space-separated string.
        vectorizer: object whose ``apply`` maps a list of strings to a 2-D
            tensor of token ids.
        model: object whose ``predict`` scores the encoded phrase.
        vocab: sequence indexed by the position of the highest score.
        seq_len: number of leading token-id columns fed to the model. The
            default of 5 reproduces the previous hard-coded ``[0,1,2,3,4]``.

    Returns:
        The ``vocab`` entry at the arg-max of the model's prediction.

    Raises:
        TypeError: if ``phrase`` is neither a list/tuple nor a string.
            Previously this surfaced as an UnboundLocalError further down.
    """
    if isinstance(phrase, (list, tuple)):
        text = ' '.join(phrase)
    elif isinstance(phrase, str):
        text = phrase
    else:
        raise TypeError(
            "phrase must be a list of tokens or a string, got "
            + type(phrase).__name__
        )

    x_input_val = vectorizer.apply([text])
    x_input_val = tf.gather(x_input_val, list(range(seq_len)), axis=1)
    prob_ = model.predict(x_input_val)
    # A single phrase is scored, so the arg-max over the whole prediction array
    # is taken as the vocabulary index (assumes predict returns one row).
    idx = np.argmax(prob_)
    return vocab[idx]

# Example: generate_next_word('the trade deficit rose to', vectorizer, model, vocab)

In [8]:
def generate_tweet(phrase, max_char=140):
    """Extend a seed phrase word by word into a tweet-length string.

    Relies on the notebook-level ``vectorizer``, ``model`` and ``vocab``
    globals, which are passed to ``generate_next_word`` together with the last
    five words generated so far.

    Args:
        phrase: non-empty list/tuple of seed tokens, or a whitespace-separated
            string. The caller's object is never modified.
        max_char: maximum length, in characters, of the returned string. A seed
            that is already longer than this is returned unchanged.

    Returns:
        str: the seed tokens followed by the generated words, joined by spaces.

    Raises:
        ValueError: if ``phrase`` contains no tokens.
    """
    # Work on a private copy so repeated calls with the same seed list start
    # from the same state.
    words = phrase.split() if isinstance(phrase, str) else list(phrase)
    if not words:
        raise ValueError("phrase must contain at least one token")

    # Random per-tweet word budget (seed words included): 15 to 24 words.
    tweet_range = np.random.randint(15,25)

    # An empty-string token (in the seed or predicted) ends the tweet.
    while words[-1] != '' and len(words) < tweet_range:
        word = generate_next_word(words[-5:], vectorizer, model, vocab)
        if word == '':
            break

        kept = words
        # Collapse the letter sequence 'u', 's', 'a' into the single word 'usa'
        # (presumably the cleaning step splits "u.s.a" into letters — confirm).
        if word == 'a' and words[-2:] == ['u', 's']:
            word = 'usa'
            kept = words[:-2]

        candidate = kept + [word]
        # Enforce max_char as a hard limit: stop before the word that would
        # push the tweet past it, instead of after.
        if len(' '.join(candidate)) > max_char:
            break
        words = candidate

    return ' '.join(words)

In [138]:
# train, val, model, vocab, vectorizer = initialize('./data/realdonaldtrump.csv', 'trump_bot_bidirstack.h5')
# x_input_val = tokenize(val, vocab, vectorizer)

# phrase = ['the', 'white', 'house', 'our', 'country']

In [139]:
# generate_tweet(phrase)

'the white house our country is doing great for the people of our country is now the be the to to usa is a great country and has my complete and'

Calvin - Try out functions

In [14]:
# Re-initialize from the raw data path and the 'trump_bot.h5' model, then
# generate one tweet from a five-word seed.
# NOTE(review): x_input_val is computed here but generate_tweet does not take it.
train, val, model, vocab, vectorizer = initialize('./data/raw/realdonaldtrump.csv', 'trump_bot.h5')
x_input_val = tokenize(val, vocab, vectorizer)
phrase = ['the', 'white', 'house', 'our', 'country']
generate_tweet(phrase)

Instructions for updating:
Please use `layer.__call__` method instead.


'the white house our country is doing great for the people of our country is now the be the to to'

In [30]:
def generatenewval(row):
    """Generate a tweet seeded with the first five words of ``row``.

    Rows with fewer than five whitespace-separated words yield an empty string
    instead of a generated tweet.
    """
    seed = row.split()[:5]
    return generate_tweet(seed) if len(seed) >= 5 else ""

In [33]:
# val['gen_tweets'] = val['clean_text'].apply(generatenewval)

In [35]:
# gen_list = []
# for i in range(len(val)):
#     if i % 500 == 0:
#         print (i)
#     phrase = val.iloc[i].item().split()[:5]
#     if len(phrase) < 5:
#         gen_list.append("")
#     else:
#         gen_list.append(generate_tweet(phrase))

In [37]:
# Keep validation tweets longer than 50 characters, draw a reproducible sample
# of 100 of them, and generate a tweet from the first five words of each.
val_50 = val[val['clean_text'].str.len() > 50]
# Fixed random_state so a re-run evaluates the same 100 tweets.
val_50_100 = val_50.sample(100, random_state=42)
# assign() returns a new frame, so no column is written onto a frame derived
# from a filtered slice (avoids pandas' SettingWithCopyWarning).
val_50_100 = val_50_100.assign(
    gen_tweets=val_50_100['clean_text'].apply(generatenewval)
)

In [39]:
# Write only the generated tweets, without the index column, to generate.csv
# in the working directory.
val_50_100['gen_tweets'].to_csv('generate.csv', index = False)

Jin Evaluation

In [9]:
# Rebind train/val/model/vocab/vectorizer for the evaluation section, using the
# 'trump_bot_bidirstack.h5' model file.
train, val, model, vocab, vectorizer = initialize('./data/realdonaldtrump.csv', 'trump_bot_bidirstack.h5')

In [13]:
# Validation sequences with a window argument of 5; the two results become the
# 'precede_phrase' and 'true_next' columns in the next cell.
X_val_line, y_val_line = generate_text_sequences(val.values, 5, vocab)

In [24]:
# One row per validation window: the preceding phrase and its true next word.
# NOTE(review): pd is imported only in the first import cell of this notebook,
# not in the "Segment 2" import cell — a fresh kernel must run the first one.
df = pd.DataFrame({'precede_phrase': X_val_line, 'true_next': y_val_line})

In [None]:
# Predict the next word for every validation window with ONE batched
# model.predict call instead of one call per row (tens of thousands of
# single-row predicts are dominated by per-call overhead).
eval_texts = [' '.join(tokens) for tokens in df['precede_phrase']]
eval_ids = tf.gather(vectorizer.apply(eval_texts), [0, 1, 2, 3, 4], axis=1)
eval_pred_idx = np.argmax(model.predict(eval_ids), axis=1)
# df was built from plain lists, so a list in row order lines up with its index.
df['pred_next'] = [vocab[i] for i in eval_pred_idx]