In [111]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

tf.random.set_seed(42)

In [112]:
# Load the labelled tweets; the first CSV column becomes the row index.
data = pd.read_csv(filepath_or_buffer='data/file.csv', index_col=0)
# Preview the first five rows.
data.head(n=5)


In [114]:
# Pair each tweet with its label so the dataset yields (text, label) examples.
dataset = tf.data.Dataset.from_tensor_slices((data["tweets"], data["labels"]))

In [115]:
# Print the first two raw (tweet, label) pairs as a sanity check of the input.
for X_batch, y_batch in dataset.batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print(review, label, sep="\n")

b'ChatGPT: Optimizing Language Models for Dialogue https://t.co/K9rKRygYyn @OpenAI'
b'neutral'
b'Try talking with ChatGPT, our new AI system which is optimized for dialogue. Your feedback will help us improve it. https://t.co/sHDm57g3Kr'
b'good'


In [116]:
def preprocess(X_batch, y_batch):
    """Clean a batch of raw tweets and convert string labels to class ids.

    Args:
        X_batch: string tensor holding one tweet per element (a batch as
            produced by ``dataset.batch(...)``).
        y_batch: string tensor holding the label of each tweet.

    Returns:
        A tuple ``(tokens, labels)``. ``tokens`` is a dense string tensor of
        whitespace-separated words, padded on the right with ``b"<pad>"`` up
        to the longest tweet of the batch. ``labels`` is an integer tensor
        where "good" maps to 2, "neutral" maps to 1 and any other label
        maps to 0.
    """
    # Remove links first, while they are still intact: truncating before this
    # step could cut a URL in half and leave fragments such as "https" behind.
    X_batch = tf.strings.regex_replace(X_batch, r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", b" ")
    # Replace the literal two-character sequence backslash + "n" with a space,
    # otherwise the next step would leave a stray "n" token behind.
    X_batch = tf.strings.regex_replace(X_batch, r"\\n", b" ")
    # Keep only ASCII letters and apostrophes; everything else becomes a space.
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    # Limit each tweet to 300 bytes. tf.strings.substr counts bytes by default;
    # the text is pure ASCII at this point, so no character can be split.
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.split(X_batch)

    # Encode the labels: good -> 2, neutral -> 1, anything else -> 0.
    y_batch = tf.where(y_batch == "good", 2, tf.where(y_batch == "neutral", 1, 0))

    # split() returns a ragged tensor; densify it with an explicit pad token.
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [117]:
from collections import Counter

# Count how often every token (including the pad token) occurs in the
# preprocessed dataset.
token_batches = dataset.batch(32).map(preprocess)
vocabulary = Counter()
for X_batch, y_batch in token_batches:
    for review in X_batch:
        # Counter.update accepts any iterable, so the array is fed in directly.
        vocabulary.update(review.numpy())


In [118]:
# The 25 most frequent tokens with their counts.
vocabulary.most_common(25)

[(b'<pad>', 11585),
 (b'ChatGPT', 382),
 (b'a', 218),
 (b'to', 201),
 (b'is', 172),
 (b'the', 171),
 (b'and', 156),
 (b'it', 146),
 (b'OpenAI', 141),
 (b'for', 116),
 (b'I', 113),
 (b'of', 94),
 (b'with', 69),
 (b'in', 67),
 (b'GPT', 60),
 (b'AI', 59),
 (b'about', 56),
 (b'new', 55),
 (b'by', 48),
 (b'that', 45),
 (b'from', 44),
 (b'this', 43),
 (b'on', 42),
 (b'you', 41),
 (b'me', 40)]

In [119]:


# Maximum number of known tokens (the pad token included).
vocab_size = 10000
pad_token = b"<pad>"
# Keep the most frequent tokens, but pin the pad token to index 0 explicitly:
# the Embedding layer is built with mask_zero=True, so id 0 must be padding
# regardless of how often <pad> happens to occur in the data.
truncated_vocabulary = [pad_token] + [
    word for word, _ in vocabulary.most_common() if word != pad_token
][:vocab_size - 1]



In [120]:


# Map every kept token to its position in the truncated vocabulary.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
# Spot-check a few words; tokens missing from the vocabulary print vocab_size.
# The fallback is passed as the dict.get default instead of using `or`,
# because id 0 is a valid id but falsy and would be reported as missing.
for word in b"OpenAI my about be can".split():
    print(word_to_id.get(word, vocab_size))



8
29
16
26
30


In [121]:


# Number of extra hash buckets used for tokens outside the vocabulary.
num_oov_buckets = 1000

# Keys are the vocabulary tokens, values their consecutive int64 ids.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)

# Static table: known tokens map to their id, unknown ones to an OOV bucket.
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)



In [122]:


def encode_words(X_batch, y_batch):
    """Replace every token of the batch by its id from the lookup table.

    The labels are passed through unchanged.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

# Full input pipeline: batch -> clean/tokenize -> encode as ids -> prefetch.
train_set = (
    dataset.batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)



In [123]:


# Inspect the first encoded batch: token ids followed by the class ids.
for X_batch, y_batch in train_set.take(1):
    print(X_batch, y_batch, sep="\n")



tf.Tensor(
[[  1  33  53 ...   0   0   0]
 [117 105  12 ...   0   0   0]
 [  1  33  53 ...   0   0   0]
 ...
 [ 81  13   1 ...   0   0   0]
 [  1   4  31 ...   0   0   0]
 [117 105  12 ...   0   0   0]], shape=(32, 45), dtype=int64)
tf.Tensor([1 2 1 2 0 2 0 2 2 1 0 1 1 1 1 1 1 0 1 1 1 2 1 1 2 1 2 1 1 1 2 2], shape=(32,), dtype=int32)


In [124]:


# Dimensionality of the learned word vectors.
embed_size = 128
model = keras.models.Sequential([
    # The input dimension covers the vocabulary ids plus the OOV bucket ids.
    # mask_zero=True makes the following layers skip id 0, so this relies on
    # the pad token having id 0.
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True,
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    # Three mutually exclusive classes (ids 0, 1, 2): softmax yields a proper
    # probability distribution, which sparse_categorical_crossentropy expects.
    # Independent sigmoid outputs do not sum to one.
    keras.layers.Dense(3, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
