In [1]:
!pip3 install tokenizers
import tensorflow as tf
from tensorflow.keras import layers, Sequential,Model
from tokenizers import Tokenizer
from sklearn.preprocessing import OneHotEncoder
import pickle
import numpy as np
import pandas as pd

# Mount Google Drive at /content/drive; the file paths used later in this notebook all live under that mount point.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Collecting tokenizers
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.8/3.8 MB 33.9 MB/s eta 0:00:00
Collecting huggingface_hub<0.17,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 25.7 MB/s eta 0:00:00
Installing collected packages: huggingface_hub, tokenizers
Successfully installed huggingface_hub-0.16.4 tokenizers-0.14.0
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
class TokenAndPositionalEmbedding(layers.Layer):
    """Adds a learned token embedding to a learned absolute-position embedding.

    Args:
        seq_len: Number of rows in the position table, i.e. the largest
            sequence length this layer can embed.
        vocab_size: Number of rows in the token table.
        embed_dim: Width of both embedding tables.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
            Accepting them is what lets ``from_config`` rebuild the layer.
    """

    def __init__(self, seq_len, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionalEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.positional_embedding = layers.Embedding(input_dim=seq_len, output_dim=embed_dim)

    def call(self, input):
        """Return ``token_embedding(input) + positional_embedding(0..L-1)``.

        ``L`` is the size of the last axis of ``input``; the position vectors
        are broadcast over the leading axes by the addition.
        """
        # Named seq_length rather than `len` so the builtin is not shadowed.
        seq_length = tf.shape(input)[-1]
        positions = tf.range(start=0, limit=seq_length, delta=1)
        position_vectors = self.positional_embedding(positions)
        token_vectors = self.token_embedding(input)
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized.

        The config must contain plain, JSON-serializable values that
        ``__init__`` accepts; returning the sub-layer objects themselves makes
        saving fail and cannot be fed back to the constructor.
        """
        config = super(TokenAndPositionalEmbedding, self).get_config()
        config.update(
            {
                "seq_len": self.seq_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [3]:
class TransformerBlock(layers.Layer):
    """Single post-norm Transformer encoder block.

    Self-attention -> dropout -> residual + LayerNorm, followed by a two-layer
    feed-forward network -> dropout -> residual + LayerNorm.

    Args:
        embed_dim: Width of the incoming features; also used as the attention
            ``key_dim`` and as the output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
            Accepting them is what lets ``from_config`` rebuild the layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.attn_layer = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [layers.Dense(ff_dim, activation='relu'), layers.Dense(embed_dim)]
        )
        self.l_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.l_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs`` and return a tensor of the same shape.

        Args:
            inputs: Feature tensor used as both query and value for
                self-attention.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the current execution context.
        """
        attn_output = self.attn_layer(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.l_norm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        # Dropout must be applied to the feed-forward output. Applying it to
        # out1 threw the feed-forward result away and left the block
        # returning l_norm2(out1 + dropout(out1)).
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.l_norm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized.

        The config must contain plain, JSON-serializable values that
        ``__init__`` accepts, not the sub-layer objects.
        """
        config = super(TransformerBlock, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [4]:
# Load the previously fitted label encoder from Drive.
# Its transform() output is later converted with .toarray(), which suggests a sparse one-hot encoder — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code contained in the file; only load pickles from a trusted source.
onehotencoder_path = "/content/drive/MyDrive/SAIL Exam Datasets/label-encoder.pickle"
with open(onehotencoder_path, 'rb') as handle:
    label_encoder = pickle.load(handle)

In [5]:
# Load the serialized tokenizer definition from Drive.
# The file name suggests a BPE tokenizer trained on tweets and reviews — not verified here.
tokenizer_path = "/content/drive/MyDrive/SAIL Exam Datasets/tweets_reviews/bpe-tokenizer-tweets-reviews.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

In [6]:
# Central place for the tunable values of this experiment.
hyperparameters = {
    # Regularization / output head
    "DROPOUT": 0.1,
    "LABEL_COUNT": 3,
    # Optimization
    "LEARNING_RATE": 0.00005,
    "BATCH_SIZE": 64,
    # Embedding / transformer dimensions
    # NOTE(review): EMBED_DIM, NUM_HEADS and FF_DIM of 1 give a minimal-width
    # model and SEQ_LEN of 10000 is very long — confirm these are intentional.
    "VOCAB_SIZE": tokenizer.get_vocab_size(),
    "SEQ_LEN": 10000,
    "EMBED_DIM": 1,
    "NUM_HEADS": 1,
    "FF_DIM": 1,
}

In [7]:
# Assemble the classifier: token+position embedding -> one transformer block
# -> mean pooling over the sequence axis -> two dense layers -> softmax.
# Every tunable value is read from `hyperparameters` so the dict stays the
# single source of truth (vocabulary size and transformer dropout were
# previously taken from elsewhere / left at the class default).
inputs = tf.keras.layers.Input(shape=(hyperparameters["SEQ_LEN"],))

embedding_layer = TokenAndPositionalEmbedding(
    hyperparameters["SEQ_LEN"],
    hyperparameters["VOCAB_SIZE"],
    hyperparameters["EMBED_DIM"],
)
x = embedding_layer(inputs)

transformer_block = TransformerBlock(
    hyperparameters["EMBED_DIM"],
    hyperparameters["NUM_HEADS"],
    hyperparameters["FF_DIM"],
    rate=hyperparameters["DROPOUT"],
)
x = transformer_block(x)

# Collapse the sequence axis so the dense head sees one vector per example.
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(hyperparameters["DROPOUT"])(x)
x = tf.keras.layers.Dense(1024, activation="relu")(x)
x = tf.keras.layers.Dropout(hyperparameters["DROPOUT"])(x)
x = tf.keras.layers.Dense(512, activation="relu")(x)
outputs = tf.keras.layers.Dense(hyperparameters["LABEL_COUNT"], activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [8]:
# Adam with the AMSGrad variant enabled; step size comes from the hyperparameter dict.
opt = tf.keras.optimizers.Adam(
    learning_rate=hyperparameters["LEARNING_RATE"],
    amsgrad=True,
)

In [9]:
# Categorical cross-entropy and categorical accuracy both expect one-hot
# encoded targets, matching the softmax output of the model.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [12]:
def create_dataset(path, batch_size, shuffle=True):
    """Build a batched ``tf.data.Dataset`` of (token ids, encoded label) pairs.

    Reads a ``;``-separated CSV with ``text`` and ``expected_sentiment``
    columns, drops rows whose text is missing, tokenizes the text with the
    notebook-level ``tokenizer`` and encodes the labels with the
    notebook-level ``label_encoder``.

    Args:
        path: Location of the CSV file.
        batch_size: Number of examples per batch.
        shuffle: When True (default) the examples are shuffled before
            batching. Pass False for evaluation data where a stable order is
            preferable.

    Returns:
        A ``tf.data.Dataset`` yielding ``(token_ids, target)`` batches.

    Raises:
        ValueError: Propagated from ``tf.convert_to_tensor`` when the
            tokenized rows do not all have the same length.
    """
    df = pd.read_csv(path, sep=';')
    df = df[df.text.notna()]

    # All rows must come out of the tokenizer with the same length for the
    # conversion to a dense tensor to succeed — assumes the tokenizer is
    # configured with padding/truncation; TODO confirm.
    tokenized = df.text.apply(lambda text: tokenizer.encode(text).ids).tolist()
    tokenized_tf = tf.convert_to_tensor(tokenized)

    labels = np.array(df.expected_sentiment).reshape(-1, 1)
    target = label_encoder.transform(labels).toarray()
    target_tf = tf.convert_to_tensor(target)

    dataset = tf.data.Dataset.from_tensor_slices((tokenized_tf, target_tf))
    if shuffle:
        # A buffer as large as the dataset gives a uniform shuffle. A small
        # fixed buffer only mixes neighbouring rows, so a CSV sorted by label
        # would yield almost single-class batches. This costs one extra copy
        # of the data in memory. max(..., 1) keeps shuffle() valid for an
        # empty file.
        dataset = dataset.shuffle(buffer_size=max(len(df), 1))
    dataset = dataset.batch(batch_size)

    return dataset


In [13]:
# Build the training and validation input pipelines from the tweets CSV files.
# NOTE(review): the batch size is hard-coded to 128 here while hyperparameters["BATCH_SIZE"] is 64 — confirm which value is intended and use a single source.
train_dataset = create_dataset("/content/drive/MyDrive/SAIL Exam Datasets/tweets/train.csv", 128)
val_dataset = create_dataset("/content/drive/MyDrive/SAIL Exam Datasets/tweets/val.csv", 128)

In [1]:
# Train for one epoch, passing the validation dataset to fit().
# NOTE(review): the saved run of this cell has execution count 1 and ended in a NameError, which suggests it was executed on a fresh kernel before the cells above — re-run the notebook top to bottom.
model.fit(train_dataset, epochs=1, validation_data=val_dataset)

NameError: ignored