# Text Classification with Transformer Model


In [29]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

In [30]:
import json
 
f = open('primate_dataset.json')
 
data = json.load(f)
X = [d['post_text'] for d in data if 'post_text' in d]
data_pre = {}
data_pre['text'] = X
y_pre = [d['annotations'] for d in data if 'annotations' in d]


for i in range(9):
    temp =[]
    for j in range(len(y_pre)):
        temp.append(y_pre[j][i][1])
    switcher = {
        0: 'feeling-bad',
        1: 'depressed',
        2: 'tired',
        3: 'little_interest',
        4: 'unusual_behaviour',
        5: 'unusual_appetite',
        6: 'want_to_harm',
        7: 'cannot_concentrate',
        8: 'cannot_fall_asleep'
    }
    data_pre[switcher.get(i)] = np.array(temp)

df = pd.DataFrame(data_pre)
print(df.head())

                                                text feeling-bad depressed  \
0  When I was in high school a few years back, I ...         yes        no   
1  Nine years ago I was diagnosed with depression...         yes       yes   
2  Some background information: My GF of almost 3...         yes       yes   
3  My girlfriend ,of about 3 months now ,has been...         yes       yes   
4  I'm alway feeling like this. It doesn't even m...         yes       yes   

  tired little_interest unusual_behaviour unusual_appetite want_to_harm  \
0   yes             yes                no               no           no   
1    no              no               yes               no           no   
2   yes             yes                no               no          yes   
3    no              no                no               no          yes   
4    no              no                no               no           no   

  cannot_concentrate cannot_fall_asleep  
0                 no                 n

In [31]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [32]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [33]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

vocab_size = 20000  # Only consider the top 20k words
maxlen = 300  # Only consider the first 200 words of each movie review

X = df['text'].values
y = df['feeling-bad'].values
mapping_dict = {'yes': 1, 'no': 0}
y = [mapping_dict[i] for i in y]
y=np.array(y)


# Tokenize the entire dataset
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to a fixed length
X = pad_sequences(sequences, maxlen=maxlen, padding="post")

print(X)
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[[ 133    2 2510 ... 1099  116   60]
 [2815   14    6 ...   62 1392   10]
 [  25   10   14 ...  162  227  347]
 ...
 [ 432   69   32 ...    0    0    0]
 [ 456   40    3 ...    0    0    0]
 [  89 3903    4 ...    0    0    0]]
[1 1 1 ... 1 1 0]


In [34]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer


inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)


In [36]:
X_train = np.array(X_train)
X_test = np.array(X_test)

print(model.summary())

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train, y_train, batch_size=32, epochs=100, validation_data=(X_test, y_test)
)


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 300)]             0         
                                                                 
 token_and_position_embeddin  (None, 300, 32)          649600    
 g_4 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_4 (Transf  (None, 300, 32)          10656     
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d_4   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_18 (Dropout)        (None, 32)                0   

: 