<a href="https://colab.research.google.com/github/joelali5/Hate-Speech-classification-with-machine-learning/blob/main/BERT_word_embeddings_bert_BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
!pip install tweet-preprocessor
import pandas as pd
import numpy as np
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,Bidirectional
from keras.losses import BinaryCrossentropy, SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
import seaborn as sns
import re
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn import metrics
from numpy import array
import preprocessor as p

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
from collections import namedtuple
# Report the versions of the two core libraries used by this run.
for label, module in (("TensorFlow Version:", tf), ("Hub version: ", hub)):
    print(label, module.__version__)

TensorFlow Version: 2.8.2
Hub version:  0.12.0


In [4]:
# TF-Hub handle of the pretrained encoder. trainable=True is passed so the
# encoder weights can be updated together with the classification head.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=True)

In [5]:
# Fixed length (in tokens) of every sequence fed to the model.
MAX_SEQ_LEN = 240

# Three int32 inputs of length MAX_SEQ_LEN: token ids, attention mask and
# segment ids. They are created in this order, each named after its variable.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=layer_name)
    for layer_name in ("input_word_ids", "input_mask", "segment_ids")
)

In [6]:
def get_masks(tokens, max_seq_length):
    """Build the attention mask for one tokenized sequence.

    Args:
        tokens: Sequence of tokens; only its length is used.
        max_seq_length: Length of the returned mask.

    Returns:
        A list of ``max_seq_length`` ints: 1 for every token position,
        followed by 0 for every padding position.

    Raises:
        ValueError: If ``tokens`` is longer than ``max_seq_length``. Without
            this check the result would silently be longer than requested
            and produce ragged rows when stacked into an array.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise ValueError(
            "Token length ({}) is more than max_seq_length ({})".format(
                n_tokens, max_seq_length))
    return [1] * n_tokens + [0] * (max_seq_length - n_tokens)

def get_segments(tokens, max_seq_length):
    """Build the segment (token-type) ids for one tokenized sequence.

    Tokens up to and including the first "[SEP]" get segment 0; every token
    after it gets segment 1. Padding positions are filled with 0.

    Args:
        tokens: Sequence of token strings.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints (0 or 1).

    Raises:
        ValueError: If ``tokens`` is longer than ``max_seq_length``. Without
            this check the result would silently be longer than requested.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise ValueError(
            "Token length ({}) is more than max_seq_length ({})".format(
                n_tokens, max_seq_length))

    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        # The separator itself still belongs to the segment it closes.
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - n_tokens)

In [7]:
# Feed the three input tensors through the BERT layer. It returns two
# tensors; only sequence_output is used further down in this notebook.
# NOTE(review): presumably pooled_output is the per-sequence vector and
# sequence_output the per-token vectors -- confirm against the TF-Hub model page.
bert_inputs = [input_word_ids, input_mask, segment_ids]
pooled_output, sequence_output = bert_layer(bert_inputs)

In [8]:
# Build the tokenizer from the vocabulary file and lower-casing flag that
# ship with the loaded BERT layer, so tokenization matches the checkpoint.
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids and right-pad with 0.

    Args:
        tokens: Sequence of token strings.
        tokenizer: Object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints: the token ids followed by 0 for
        every padding position.

    Raises:
        ValueError: If the tokenizer yields more than ``max_seq_length`` ids.
            Without this check the result would silently be longer than
            requested.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    n_ids = len(token_ids)
    if n_ids > max_seq_length:
        raise ValueError(
            "Token length ({}) is more than max_seq_length ({})".format(
                n_ids, max_seq_length))
    return token_ids + [0] * (max_seq_length - n_ids)

In [9]:
# Raw strings are used so the backslashes reach the regex engine untouched;
# the former plain-string literals relied on invalid escape sequences such as
# "\." and "\s", which Python flags with a warning.

# Punctuation and symbol characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\ð)|(\$)|(\>)|(\<)|(\{)|(\})")
# Spans that are replaced by a single space. Note that the last alternative,
# "(:).", also consumes the character that follows the colon.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

def clean_tweets(df):
  """Clean every tweet in an iterable of tweet strings.

  Each tweet is run through ``p.clean`` and lower-cased. Characters matched
  by REPLACE_NO_SPACE are then removed, and matches of REPLACE_WITH_SPACE
  are replaced with a single space.

  Args:
    df: Iterable yielding one tweet string per element.

  Returns:
    A list with the cleaned tweets, in input order.
  """
  def _scrub(raw_tweet):
    # tweet-preprocessor pass first, then the two punctuation passes.
    preprocessed = p.clean(raw_tweet)
    without_punct = REPLACE_NO_SPACE.sub("", preprocessed.lower())
    return REPLACE_WITH_SPACE.sub(" ", without_punct)

  return [_scrub(tweet) for tweet in df]

In [10]:
from google.colab import drive
# Mount Google Drive; the dataset is read from below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [11]:
from sklearn.model_selection import train_test_split

# Load the labelled tweets from the mounted Drive.
df = pd.read_csv("/content/drive/MyDrive/labeled_data.csv")
# Optional, for quick experiments: df = df.sample(frac=0.5)

# Rows with class == 0 (named "hate" here); its size is the reference that
# the other two classes are down-sampled to.
is_hate = df['class'] == 0
df_hate = df[is_hate]
df_hate.shape

(1430, 7)

In [12]:
# Down-sample the class-1 rows to the size of the class-0 subset.
# random_state is fixed (same seed as the train/test split) so the balanced
# dataset is identical on every Restart & Run All.
df_off = df[df['class']==1].sample(n=df_hate.shape[0], random_state=42)
df_off.shape

(1430, 7)

In [13]:
# Down-sample the class-2 rows to the size of the class-0 subset.
# random_state is fixed (same seed as the train/test split) so the balanced
# dataset is identical on every Restart & Run All.
df_neither = df[df['class']==2].sample(n=df_hate.shape[0], random_state=42)
df_neither.shape

(1430, 7)

In [14]:
# Stack the three equally sized class subsets into one balanced frame
# (original row indices are kept).
balanced_parts = [df_hate, df_off, df_neither]
data = pd.concat(balanced_parts)

In [15]:
# Features: cleaned tweet texts (a Python list). Labels: class column as a NumPy array.
tweet_column = data["tweet"]
label_column = data['class']
X = clean_tweets(tweet_column)
y = array(label_column)

In [16]:
# Hold out 20% of the balanced data for testing; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Distinct label values present in the training split.
unique_classes = list(set(y_train))
# Display names indexed by label value (position 0, 1, 2).
target_classes = ["hate_speech", "offensive_language", "neither"]

In [17]:
def create_single_input(sentence, MAX_LEN):
  """Encode one sentence into BERT ids, mask and segment ids.

  The sentence is tokenized with the module-level ``tokenizer``, truncated,
  wrapped in "[CLS]" ... "[SEP]" and padded to ``MAX_SEQ_LEN``.

  Args:
    sentence: Text to encode.
    MAX_LEN: Maximum number of sentence tokens to keep (excluding the two
      special tokens). Values larger than ``MAX_SEQ_LEN - 2`` are clamped,
      because the outputs are always padded to ``MAX_SEQ_LEN`` and would
      otherwise come out longer than the model's input length.

  Returns:
    Tuple ``(ids, masks, segments)``, each a list of ``MAX_SEQ_LEN`` ints.
  """
  # Leave room for [CLS] and [SEP]; never go below zero tokens.
  token_budget = max(0, min(MAX_LEN, MAX_SEQ_LEN - 2))

  stokens = tokenizer.tokenize(sentence)[:token_budget]
  stokens = ["[CLS]"] + stokens + ["[SEP]"]

  ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
  masks = get_masks(stokens, MAX_SEQ_LEN)
  segments = get_segments(stokens, MAX_SEQ_LEN)

  return ids,masks,segments

def create_input_array(sentences):
  """Encode many sentences into the three arrays the model takes as input.

  Every sentence is encoded with ``create_single_input`` using a budget of
  ``MAX_SEQ_LEN-2`` sentence tokens; a tqdm progress bar is shown meanwhile.

  Args:
    sentences: Iterable of text strings.

  Returns:
    A list ``[ids, masks, segments]`` of three int32 NumPy arrays, each
    holding one row per sentence.
  """
  encoded_rows = [
      create_single_input(sentence, MAX_SEQ_LEN-2)
      for sentence in tqdm(sentences, position=0, leave=True)
  ]

  # Column 0 = ids, 1 = masks, 2 = segments; stack each column separately.
  return [
      np.asarray([row[column] for row in encoded_rows], dtype=np.int32)
      for column in range(3)
  ]

In [None]:
import gc
from tensorflow.keras.optimizers import SGD
from keras.losses import sparse_categorical_crossentropy, binary_crossentropy, categorical_crossentropy

# Classification head on top of the BERT sequence output:
# bidirectional LSTM -> dropout -> dense layer with one unit per class.
x = tf.keras.layers.Bidirectional(LSTM(240, recurrent_dropout=0.2))(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
# Each tweet has exactly one label and the loss is sparse categorical
# cross-entropy, which expects a probability distribution over the classes.
# softmax provides that; independent sigmoid units do not.
out = tf.keras.layers.Dense(len(target_classes), activation="softmax", name="dense_output")(x)


model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
opt = SGD(learning_rate=0.0001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model.summary()

In [36]:
# Encode the training texts, then train for two epochs; a 0.2 fraction of the
# training data is held out for validation via validation_split.
inputs = create_input_array(X_train)
history = model.fit(
    inputs,
    y_train,
    epochs=2,
    batch_size=32,
    validation_split=0.2,
    shuffle=True,
)
# Free unreferenced objects; the returned count is shown as the cell output.
gc.collect()

100%|██████████| 3432/3432 [00:00<00:00, 4109.69it/s]


Epoch 1/2
Epoch 2/2


22933

In [37]:
# Encode the held-out test texts the same way as the training texts.
test_inputs = create_input_array(X_test)

100%|██████████| 858/858 [00:00<00:00, 4537.68it/s]


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-class scores for every test example; the predicted label is the
# index of the highest score in each row.
test_preds = model.predict(test_inputs)
test_pred_labels = np.argmax(test_preds, axis=1)

test_accuracy = accuracy_score(y_test, test_pred_labels)
print("Test  Accuracy : {}".format(test_accuracy))
print("\nClassification Report : ")
report = classification_report(
    y_test,
    test_pred_labels,
    target_names=target_classes,
    zero_division=1,
)
print(report)

In [None]:
!pip install scikit-plot
from sklearn.metrics import confusion_matrix
import scikitplot as skplt
import matplotlib.pyplot as plt

# Map integer labels to their display names so the axes are readable.
true_names = [target_classes[i] for i in y_test]
pred_names = [target_classes[i] for i in np.argmax(test_preds, axis=1)]

# Row-normalised confusion matrix of true vs. predicted class names.
skplt.metrics.plot_confusion_matrix(
    true_names,
    pred_names,
    normalize=True,
    title="Confusion Matrix",
    cmap="Greens",
    hide_zeros=False,
    figsize=(5,5),
);
plt.xticks(rotation=90);

In [None]:
# Summarize the training history: one figure for accuracy, one for loss.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    # The 'val_*' curve comes from the validation_split hold-out used during
    # fit, not from the separate test set, so it is labelled accordingly.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()