In [3]:
!pip install kaggle

from google.colab import files
files.upload()

# Download kaggle dataset
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
    # also need to get kaggle api key and put it in ~/.kaggle/kaggle.json
!kaggle datasets download -d mrmorj/hate-speech-and-offensive-language-dataset



Saving kaggle.json to kaggle.json
Downloading hate-speech-and-offensive-language-dataset.zip to /content
  0% 0.00/1.01M [00:00<?, ?B/s]
100% 1.01M/1.01M [00:00<00:00, 172MB/s]


In [4]:
# Unzip the data
!unzip hate-speech-and-offensive-language-dataset.zip

Archive:  hate-speech-and-offensive-language-dataset.zip
  inflating: labeled_data.csv        


In [5]:
# Load the labeled tweets
import pandas as pd
df = pd.read_csv('labeled_data.csv')
df.head()

# Collapse the three original labels into two in a single pass:
# classes 0 and 1 both become 0, class 2 becomes 1
df['class'] = df['class'].replace({1: 0, 2: 1})
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,1,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,0,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,0,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,0,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,0,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
# Create df of just tweet and the class label.
# .copy() makes this an independent frame rather than a slice of the original,
# so assigning to its columns later does not raise SettingWithCopyWarning.
df = df[['tweet', 'class']].copy()
df.head()

# See the breakdown of class labels
df['class'].value_counts()

0    20620
1     4163
Name: class, dtype: int64

In [7]:
# Text-cleaning dependencies: regex and punctuation handling, plus NLTK for
# stemming and the English stopword list
import re
import string
import nltk
stemmer = nltk.SnowballStemmer("english")  # shared stemmer used by clean_text
from nltk.corpus import stopwords
nltk.download('stopwords')  # fetch the stopword corpus before it is read below
import string
stopword=set(stopwords.words('english'))  # set of English stopwords looked up by clean_text

def clean_text(text):
    '''
    Normalize a tweet before tokenization.

    Steps, in order: lowercase; remove [bracketed] spans, URLs, <tags>,
    punctuation, newlines and any word containing a digit; drop English
    stopwords; stem the remaining words.

    Parameters:
    text: value to clean; it is converted with str() first

    Returns:
    str, the cleaned words joined with spaces
    '''
    text = str(text).lower()

    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Filter stopwords and stem in one pass; splitting on a single space keeps
    # empty tokens, so runs of spaces in the input survive in the output.
    words = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Clean every tweet in place, one element at a time
df['tweet'] = df['tweet'].map(clean_text)
df.head()

Unnamed: 0,tweet,class
0,rt mayasolov woman shouldnt complain clean ho...,1
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,0
2,rt urkindofbrand dawg rt ever fuck bitch sta...,0
3,rt cganderson vivabas look like tranni,0
4,rt shenikarobert shit hear might true might f...,0


In [9]:
# Write data to a csv
# index=False leaves the row index out of the file, so only the dataframe's columns are written
df.to_csv('removed_stop_words_data.csv', index=False)

# Transformer Code

In [10]:
# Create a transformer from scratch

# Import libraries
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Layer
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import the dataset.
# keep_default_na=False stops pandas from turning empty tweets (or tweets that
# were cleaned down to a word such as "nan", "null" or "na") into float NaN,
# so every value in the tweet column stays a string.
df = pd.read_csv('removed_stop_words_data.csv', keep_default_na=False)
df.head()

Unnamed: 0,tweet,class
0,rt mayasolov woman shouldnt complain clean ho...,1
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,0
2,rt urkindofbrand dawg rt ever fuck bitch sta...,0
3,rt cganderson vivabas look like tranni,0
4,rt shenikarobert shit hear might true might f...,0


In [11]:
class FeedForward(Layer):
  '''
  FeedForward class
  Feedforward network made of two linear transformations with a ReLU
  activation in between.

  Parameters:
  d_model: int, dimension of the embedding (width of the output)
  dff: int, dimension of the hidden layer of the feedforward network
  kwargs: forwarded unchanged to keras.layers.Layer, matching the other
          custom layers in this notebook
  '''
  def __init__(self, d_model, dff, **kwargs):
    super(FeedForward, self).__init__(**kwargs)
    self.fc1 = keras.layers.Dense(dff)
    self.fc2 = keras.layers.Dense(d_model)
    self.activation = keras.layers.ReLU()

  def call(self, x):
    '''
    Parameters:
    x: input tensor to be passed through the fully connected network

    Returns:
    tensor produced by fc2(ReLU(fc1(x)))
    '''
    hidden = self.activation(self.fc1(x))
    return self.fc2(hidden)

In [12]:
class AddNormalization(Layer):
  '''
  AddNormalization class
  Residual "add and normalize" step: sums two tensors and applies layer
  normalization to the sum.

  Parameters:
  kwargs: forwarded unchanged to keras.layers.Layer
  '''
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.layer_norm = keras.layers.LayerNormalization()

  def call(self, x, sublayer):
    '''
    Parameters:
    x: tensor that entered the sublayer (the residual branch)
    sublayer: tensor the sublayer produced

    Returns:
    layer-normalized sum of the two tensors
    '''
    residual_sum = x + sublayer
    return self.layer_norm(residual_sum)

In [13]:
class DotProductAttention(Layer):
  '''
  DotProductAttention class
  Computes scaled dot-product attention with an optional additive mask.

  Parameters:
  kwargs: forwarded unchanged to keras.layers.Layer
  '''
  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def call(self, q, k, v, dim_k, mask=None):
    '''
    Parameters:
    q, k, v: query, key and value tensors
    dim_k: number whose square root divides the raw scores
    mask: optional tensor; positions holding 1 receive a -1e9 penalty before
          the softmax, which drives their weight towards zero

    Returns:
    tensor of value vectors weighted by the attention weights
    '''
    # Similarity of each query with each key, scaled by sqrt(dim_k)
    scores = matmul(q, k, transpose_b=True) / math.sqrt(float(dim_k))

    # Push masked positions towards -inf so softmax ignores them
    if mask is not None:
      scores = scores + (mask * -1e9)

    # Normalize over the key axis, then take the weighted sum of the values
    attention_weights = tf.nn.softmax(scores, axis=-1)
    return matmul(attention_weights, v)

In [14]:
class MultiHeadAttention(Layer):
  '''
  MultiHeadAttention class
  Multi-head attention layer: projects queries, keys and values, splits each
  projection across `heads`, runs dot-product attention on every head in
  parallel, merges the heads and applies a final linear projection.

  Parameters:
  heads: int, number of attention heads
  dim_k: int, width of the query and key projections (shared by all heads)
  dim_v: int, width of the value projection (shared by all heads)
  d_model: int, width of the output projection
  kwargs: forwarded unchanged to keras.layers.Layer
  '''
  def __init__(self, heads, dim_k, dim_v, d_model, **kwargs):
    super(MultiHeadAttention, self).__init__(**kwargs)
    self.attention = DotProductAttention()
    self.heads = heads
    self.dim_k = dim_k
    self.dim_v = dim_v
    self.d_model = d_model

    # Set key, query, value vectors to be linear
    self.W_q = keras.layers.Dense(dim_k)
    self.W_k = keras.layers.Dense(dim_k)
    self.W_v = keras.layers.Dense(dim_v)

    # Set the output to be linear
    self.W_o = keras.layers.Dense(d_model)

  def reshape_tensor(self, x, heads, flag):
    '''
    Parameters:
    x: tensor to rearrange
    heads: int, number of attention heads
    flag: True to split the last axis across heads, False to merge the heads
          back into one axis

    Returns:
    (batch_size, heads, seq_length, -1) when flag is True, otherwise
    (batch_size, seq_length, dim_v)
    '''
    if flag:
      # Set tensor shape to (batch_size, heads, seq_length, -1)
      x = tf.reshape(x, shape=(shape(x)[0], shape(x)[1], heads, -1))
      x = tf.transpose(x, perm=[0, 2, 1, 3])

    else:
      # Set tensor shape to (batch_size, seq_length, heads, -1)
      x = tf.transpose(x, perm=[0, 2, 1, 3])
      # The merged tensor comes from the value projection, so its width is
      # dim_v; using dim_k here was only correct when the two were equal.
      x = tf.reshape(x, shape=(shape(x)[0], shape(x)[1], self.dim_v))

    return x

  def call(self, q, k, v, mask=None):
    '''
    Parameters:
    q, k, v: query, key and value tensors
    mask: optional mask handed to the attention layer

    Returns:
    tensor whose last axis has width d_model
    '''
    # Rearrange queries, keys, values to compute heads in parallel
    q_heads = self.reshape_tensor(self.W_q(q), self.heads, True)
    k_heads = self.reshape_tensor(self.W_k(k), self.heads, True)
    v_heads = self.reshape_tensor(self.W_v(v), self.heads, True)

    # Compute attention
    # NOTE(review): scores are scaled by sqrt(dim_k), the full projection
    # width, although each head only sees dim_k / heads features - confirm
    # this is the intended scaling.
    attention = self.attention(q_heads, k_heads, v_heads, self.dim_k, mask)

    # Concat and dense the attention outputs
    concat_output = self.reshape_tensor(attention, self.heads, False)

    # Apply final linear projection and return
    return self.W_o(concat_output)

In [15]:
class PositionalEncoding(Layer):
  '''
  PositionalEncoding class
  Embeds token ids and adds a learned embedding of each token's position in
  the sequence.

  Parameters:
  seq_length: int, number of rows in the position embedding table
  vocab_size: int, number of rows in the word embedding table
  output_dim: int, width of both embeddings
  kwargs: forwarded unchanged to keras.layers.Layer
  '''
  def __init__(self, seq_length, vocab_size, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.word_embedding = keras.layers.Embedding(vocab_size, output_dim)
    self.position_embedding = keras.layers.Embedding(seq_length, output_dim)

  def call(self, inputs):
    '''
    Parameters:
    inputs: tensor of token ids; its second axis is the sequence axis

    Returns:
    sum of the word embeddings and the position embeddings
    '''
    # One position index per step along the sequence axis: 0, 1, 2, ...
    positions = tf.range(start=0, limit=shape(inputs)[1], delta=1)

    # The position embeddings broadcast over the batch axis when added
    return self.word_embedding(inputs) + self.position_embedding(positions)

In [16]:
class EncoderLayer(Layer):
  '''
  EncoderLayer class
  One encoder block: multi-head self-attention followed by a feed-forward
  network, each followed by dropout and an add-and-normalize step.

  Parameters:
  heads, dim_k, dim_v, d_model: passed to MultiHeadAttention
  d_ff: int, hidden width of the feed-forward network
  rate: dropout rate used by both dropout layers
  kwargs: forwarded unchanged to keras.layers.Layer
  '''
  def __init__(self, heads, dim_k, dim_v, d_model, d_ff, rate, **kwargs):
    super().__init__(**kwargs)
    # Self-attention sub-block
    self.multiheadAttention = MultiHeadAttention(heads, dim_k, dim_v, d_model)
    self.dropout1 = keras.layers.Dropout(rate)
    self.addNorm1 = AddNormalization()
    # Feed-forward sub-block
    self.feedForward = FeedForward(d_model, d_ff)
    self.dropout2 = keras.layers.Dropout(rate)
    self.addNorm2 = AddNormalization()

  def call(self, x, padding_mask, training):
    '''
    Parameters:
    x: input tensor; used as query, key and value of the attention
    padding_mask: mask handed to the attention layer, or None
    training: forwarded to the dropout layers

    Returns:
    output tensor of the block
    '''
    # Self-attention, dropout, then residual connection with the block input
    attended = self.dropout1(self.multiheadAttention(x, x, x, padding_mask), training=training)
    attended = self.addNorm1(x, attended)

    # Feed-forward, dropout, then residual connection with the attention result
    transformed = self.dropout2(self.feedForward(attended), training=training)
    return self.addNorm2(attended, transformed)

In [17]:
class Encoder(Layer):
  '''
  Encoder class
  Creates an encoder for the transformer: embeddings with positional
  information, a stack of n encoder layers, and a 2-unit softmax layer applied
  to the output of the stack.

  Parameters:
  vocab_size: int, size of the word embedding table
  seq_length: int, size of the position embedding table
  heads, dim_k, dim_v, d_model, d_ff, rate: passed to every EncoderLayer
  n: int, number of encoder layers in the stack
  kwargs: forwarded unchanged to keras.layers.Layer
  '''
  def __init__(self, vocab_size, seq_length, heads, dim_k, dim_v, d_model, d_ff, n, rate, **kwargs):
    super(Encoder, self).__init__(**kwargs)
    self.posEncoding = PositionalEncoding(seq_length, vocab_size, d_model)
    self.dropout = keras.layers.Dropout(rate)
    self.encoderLayer = [EncoderLayer(heads, dim_k, dim_v, d_model, d_ff, rate) for _ in range(n)]
    self.dense = keras.layers.Dense(2, activation='softmax')

  def call(self, input_sentence, padding_mask, training):
    '''
    Parameters:
    input_sentence: tensor of token ids
    padding_mask: mask handed to every encoder layer, or None
    training: forwarded to the dropout layers

    Returns:
    tensor of class probabilities with a last axis of size 2
    '''
    # Create positional encoding
    pos_encoding = self.posEncoding(input_sentence)

    # Add in a dropout layer
    x = self.dropout(pos_encoding, training=training)

    # Chain the layers: each one consumes the output of the previous one.
    # Feeding the same embedded input to every layer would discard the work of
    # all layers but the last. Starting from x also covers n == 0.
    for layer in self.encoderLayer:
      x = layer(x, padding_mask, training)

    # Apply softmax layer
    return self.dense(x)

## Testing

In [18]:
from numpy import random

# Encoder hyperparameters
heads = 8
dim_k = 64
dim_v = 64
d_model = 512
d_ff = 2048
n = 6
rate = 0.1

# Smoke test on a toy batch: 64 sequences of 5 token ids drawn from a 5-word
# vocabulary. The ids must be integers - floats in [0, 1) would all be cast to
# token 0 by the embedding layer, so only one embedding row would be exercised.
input_seq = random.randint(0, 5, size=(64, 5))

encoder = Encoder(vocab_size=5, seq_length=5, heads=heads, dim_k=dim_k, dim_v=dim_v, d_model=d_model, d_ff=d_ff, n=n, rate=rate)

output = encoder(input_seq, padding_mask=None, training=False)

print(output.shape)

(64, 5, 2)


In [19]:
# Number of rows (tweets) in the dataframe
len(df)

24783

In [None]:
# Train the encoder on df, a dataframe with a tweet column and a class column,
# and display the training and validation accuracy.

class _TweetClassifier(keras.Model):
    '''
    Private helper for train_model: wraps an encoder layer in a keras.Model so
    it can be compiled and fitted, and reduces the encoder's per-token class
    probabilities to one prediction per tweet.
    '''
    def __init__(self, encoder, **kwargs):
        super(_TweetClassifier, self).__init__(**kwargs)
        self.encoder = encoder

    def call(self, inputs, training=None):
        # Token id 0 is the value pad_sequences fills with, so it marks padding
        is_padding = tf.cast(tf.math.equal(inputs, 0), tf.float32)

        # Shape (batch, 1, 1, seq) broadcasts against the
        # (batch, heads, seq, seq) attention scores, masking padded keys
        padding_mask = is_padding[:, tf.newaxis, tf.newaxis, :]
        token_probs = self.encoder(inputs, padding_mask=padding_mask, training=training)

        # Average over the real tokens only; a tweet with no tokens at all
        # falls back to averaging every position to avoid dividing by zero
        is_token = 1.0 - is_padding
        has_tokens = tf.reduce_sum(is_token, axis=1, keepdims=True) > 0
        weights = tf.where(has_tokens, is_token, tf.ones_like(is_token))
        weights = weights / tf.reduce_sum(weights, axis=1, keepdims=True)
        return tf.reduce_sum(token_probs * weights[:, :, tf.newaxis], axis=1)


def train_model(df, encoder, epochs=10, batch_size=32):
    '''
    Tokenize the tweets, hold out 20% for validation, train the encoder and
    plot the accuracy curves.

    Parameters:
    df: dataframe with a 'tweet' column (text) and a 'class' column (0 or 1)
    encoder: Encoder layer whose embedding tables cover the tweet vocabulary
             and the padded sequence length; its weights are updated in place
    epochs: int, number of training epochs
    batch_size: int, number of tweets per gradient step

    Returns:
    the keras History object returned by fit

    Raises:
    ValueError: if the encoder's embedding tables are too small for the data
    '''
    # Missing tweets (e.g. NaN after a CSV round-trip) become empty strings so
    # the tokenizer only ever sees text
    tweets = df['tweet'].fillna('').astype(str)

    # Create a tokenizer
    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(tweets)
    vocab_size = len(tokenizer.word_index) + 1

    # Create a sequence of the tweets
    X = tokenizer.texts_to_sequences(tweets)
    X = keras.preprocessing.sequence.pad_sequences(X, padding='post')

    # Create the target
    y = df['class'].to_numpy()

    # Fail early with a clear message instead of an out-of-range embedding lookup
    embeddings = getattr(encoder, 'posEncoding', None)
    if embeddings is not None:
        if (embeddings.word_embedding.input_dim < vocab_size
                or embeddings.position_embedding.input_dim < X.shape[1]):
            raise ValueError(
                'encoder is too small for this data: build it with '
                'vocab_size >= %d and seq_length >= %d' % (vocab_size, X.shape[1]))

    # Split the data into training and testing (80/20, fixed seed) with numpy only
    order = np.random.RandomState(42).permutation(len(X))
    n_test = int(np.ceil(0.2 * len(X)))
    test_idx, train_idx = order[:n_test], order[n_test:]
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # A Layer has no compile/fit, so wrap the encoder in a Model; the wrapper
    # also builds the padding mask from the token ids
    model = _TweetClassifier(encoder)

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

    # Plot the training history
    plt.plot(history.history['accuracy'], label='accuracy')
    plt.plot(history.history['val_accuracy'], label = 'val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.legend(loc='lower right')
    plt.show()

    return history

# Train the model
# The encoder from the smoke test was built with vocab_size=5 and seq_length=5,
# so build a new one whose embedding tables cover every token id and position
# found in the tweets.
tweet_texts = df['tweet'].fillna('').astype(str)
sizing_tokenizer = keras.preprocessing.text.Tokenizer()
sizing_tokenizer.fit_on_texts(tweet_texts)
tweet_lengths = [len(seq) for seq in sizing_tokenizer.texts_to_sequences(tweet_texts)]

encoder = Encoder(vocab_size=len(sizing_tokenizer.word_index) + 1,
                  seq_length=max([1] + tweet_lengths),
                  heads=heads, dim_k=dim_k, dim_v=dim_v, d_model=d_model, d_ff=d_ff, n=n, rate=rate)

history = train_model(df, encoder, epochs=10, batch_size=32)

# Take a tweet and the encoder model and return the predicted class probabilities

def predict_class(tweet, encoder):
    '''
    Predict the class probabilities of a single tweet.

    The tokenizer is re-fitted on the global df so token ids match the ones
    used for training. The tweet is not truncated, so it must not be longer
    than the encoder's position embedding table.

    Parameters:
    tweet: str, the text to classify
    encoder: Encoder layer (a Layer has no predict(), so it is called directly)

    Returns:
    numpy array of shape (1, 2): the per-token class probabilities averaged
    over the tweet
    '''
    # Create a tokenizer
    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df['tweet'].fillna('').astype(str))

    # Create a sequence of the tweet; keep at least one position so a tweet
    # made only of unknown words does not yield an empty sequence
    X = tokenizer.texts_to_sequences([tweet])
    X = keras.preprocessing.sequence.pad_sequences(X, padding='post', maxlen=max(1, len(X[0])))

    # Make a prediction in inference mode (dropout disabled)
    token_probs = encoder(X, padding_mask=None, training=False)
    prediction = tf.reduce_mean(token_probs, axis=1).numpy()

    return prediction

# Make a prediction
tweet = "I hate you"  # example input
prediction = predict_class(tweet, encoder)
print(prediction)