# Detecting Slang Using BERT

### Installing Hugging Face library

In [None]:
# Install the Hugging Face `transformers` package into the running environment.
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import backend as K
import transformers
from transformers import TFAutoModel, AutoTokenizer
import matplotlib.pyplot as plt

# Downloading data


In [None]:
# Download the jigsawdata.zip archive into the current working directory.
!wget https://raw.githubusercontent.com/abcom-mltutorials/detecting-slang/master/jigsawdata.zip

In [None]:
# Extract the downloaded archive.
!unzip '/content/jigsawdata.zip'

In [None]:
# Load the training and test CSVs from the mounted Drive.
# NOTE(review): the archive is unzipped under /content, yet these reads point at
# /content/drive/MyDrive — confirm the CSVs really live on Drive.
train = pd.read_csv("/content/drive/MyDrive/train.csv")
test = pd.read_csv('/content/drive/MyDrive/test.csv')

# Examining data

In [None]:
# Display the training frame. Swap in the filter below to inspect only the rows
# whose `toxic` flag is set.
# train[train['toxic']==1]
train

In [None]:
# Character length of every training comment, kept both as a plain list
# (`comments_size`) and as a NumPy array (`comments_size_np`).
# numpy is already imported in the notebook's import cell, so it is not re-imported here.
comments_size = [len(comment) for comment in train["comment_text"]]
comments_size_np = np.asarray(comments_size)


In [None]:
# For each toxicity flag, count the rows labelled 0 and the rows labelled 1,
# then draw the two counts side by side as a grouped bar chart.
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_counts = [train[name].value_counts() for name in columns]
zeros = [counts[0] for counts in label_counts]
ones = [counts[1] for counts in label_counts]

df = pd.DataFrame({'zero': zeros, 'one': ones}, index=columns)
df.plot.bar(rot=0)

In [None]:
# Collapse the six toxicity flags into one binary target: 1 when at least one
# flag is set on a row, 0 otherwise. The row-wise sum is vectorized, so it works
# for any number of rows instead of assuming exactly 159571.
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
labels = train[columns].sum(axis=1).gt(0).astype(int).tolist()

train["labels"] = labels


In [None]:
# Count how many rows carry each value of the combined `labels` target.
train["labels"].value_counts()


### Target distribution

# Build the model
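Calling `from_pretrained` on `TFAutoModel`, `AutoConfig`, or `AutoTokenizer` directly creates an object of the class that matches the chosen checkpoint's architecture. This notebook uses the BERT-specific classes `TFBertModel` and `BertTokenizer` instead.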

In [None]:
def build_model(transformer, loss, max_len=128):
    """Attach a binary classification head to a transformer encoder and compile it.

    Args:
        transformer: Callable encoder layer/model. Its first output is indexed
            as ``[:, 0, :]``, i.e. it is treated as a per-token sequence output.
        loss: Loss passed unchanged to ``model.compile``.
        max_len: Number of token ids per example expected by the input layer.

    Returns:
        A compiled Keras ``Model`` that maps int32 token ids of shape
        ``(max_len,)`` to a single sigmoid output.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Only the vector at the first sequence position feeds the classifier head.
    cls_token = sequence_output[:, 0, :]
    x = tf.keras.layers.Dropout(0.35)(cls_token)
    out = Dense(100, activation='relu')(x)
    out = Dense(10, activation='relu')(out)
    out = Dense(5, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and rejected by newer Keras optimizers.
    model.compile(Adam(learning_rate=3e-5), loss=loss, metrics=[])
    return model


# The focal loss function

In [None]:
def focal_loss(gamma=2., alpha=.2):
    """Build a binary focal loss for sigmoid outputs.

    Args:
        gamma: Exponent applied to the error term of each class.
        alpha: Weight of the positive-class term; the negative-class term is
            weighted by ``1 - alpha``.

    Returns:
        A function ``(y_true, y_pred) -> scalar`` usable as a Keras loss.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
        # otherwise reach log(0) and turn the loss into inf/NaN.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        # Compare labels in the prediction dtype so integer targets work as well.
        y_true = tf.cast(y_true, y_pred.dtype)
        # Rows of the other class are replaced by 1 (resp. 0) so that their
        # term evaluates to exactly zero.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

# Instantiating model

In [None]:

# Load the pretrained bert-base-uncased encoder, wrap it with the classification
# head, and compile it with focal loss (gamma=1.5) for 128-token inputs.
transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
model = build_model(transformer_layer, loss=focal_loss(gamma=1.5), max_len=128)
model.summary()

# Data preprocessing

### Tokenizing

In [None]:
# Load the pretrained tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Save its files locally so the vocabulary can be handed to the `tokenizers` library.
# NOTE: the directory name says "distilbert" but it holds the bert-base-uncased files.
save_path = 'distilbert_base_uncased/'
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

# Reload the saved vocabulary with the `tokenizers` library implementation,
# deriving the file location from `save_path` instead of repeating the literal.
fast_tokenizer = BertWordPieceTokenizer(os.path.join(save_path, 'vocab.txt'), lowercase=True)
fast_tokenizer



### Encoding

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=128):
    """Encode texts into fixed-length token-id rows, one chunk at a time.

    Args:
        texts: Sliceable collection whose slices expose ``.tolist()``.
        tokenizer: Tokenizer offering ``enable_truncation``, ``enable_padding``
            and ``encode_batch``.
        chunk_size: Number of texts passed to ``encode_batch`` per call.
        maxlen: Length every encoded row is truncated/padded to.

    Returns:
        ``np.ndarray`` built from the ``ids`` of every encoding.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    encoded_rows = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        encoded_rows += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_rows)

In [None]:
# Encode the train and test comments (cast to str first) into token-id arrays;
# the binary `labels` column is the training target.
x = fast_encode(train.comment_text.astype(str), fast_tokenizer)
x_test = fast_encode(test.comment_text.astype(str), fast_tokenizer)
y = train.labels.values

# Preparing datasets



In [None]:
# Number of examples per batch for both training and prediction.
BATCH_SIZE=64

# Training pipeline: (token ids, label) pairs, repeated indefinitely, shuffled
# through a 2048-element buffer, then batched. Because of `.repeat()`, the
# epoch length is set by `steps_per_epoch` at fit time.
train_dataset = (
    tf.data.Dataset 
      .from_tensor_slices((x, y))
      .repeat()
      .shuffle(2048)
      .batch(BATCH_SIZE)
    # AUTOTUNE lets the runtime prepare upcoming batches
    # while the current one is being processed
    .prefetch(tf.data.experimental.AUTOTUNE) 
)

# Prediction pipeline: test token ids only, batched in their original order.
test_data = (
    tf.data.Dataset# create dataset
    .from_tensor_slices(x_test) 
    .batch(BATCH_SIZE)
)

# Training

In [None]:
# Weight positive (label 1) examples seven times as heavily as negatives.
class_weight = {0: 1.,
                1: 7.
                }



# Fit for 10 epochs of 10 batches each.
# NOTE(review): at BATCH_SIZE=64 that is only 6,400 examples in total — confirm
# this small budget is intentional (e.g. a quick demo run).
train_history = model.fit(
                              train_dataset,
                              steps_per_epoch=10,
                              class_weight=class_weight,
                              epochs=10,
                             )

# Predicting on test set

In [None]:
# Save the trained model (without optimizer state) under ./<dataset_name>_bert.
dataset_name = 'slang'

# Any '/' in the dataset name is replaced so the result is a single directory name.
saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

model.save(saved_model_path, include_optimizer=False)



In [None]:
# Display the test frame.
test

In [None]:
# Reload the model from the directory it was just saved to, instead of a
# hard-coded absolute path that only matches when the working directory is /content.
# NOTE(review): later cells predict with `model`, not `reloaded_model` — confirm intent.
reloaded_model = tf.saved_model.load(saved_model_path)

In [None]:
# Score every test comment. The single-unit output comes back with one column
# per row, so flatten it to 1-D before storing it as a DataFrame column.
test['toxic'] = model.predict(test_data, verbose=1).ravel()

Save the predictions to a CSV file and load them back to check the result.

In [None]:
# Write the scored test frame to disk and read the same file back, using one
# path for both steps so the check cannot pick up a different file.
predictions_csv = 'test.csv'
test.to_csv(predictions_csv, index=False)
data = pd.read_csv(predictions_csv)
data.head()

In [None]:
def replace(toxic):
  """Threshold a toxicity score: return 1 when it is at least 0.5, otherwise 0."""
  return 1 if toxic >= 0.5 else 0

# Turn each toxicity score into a hard 0/1 prediction.
test['prediction'] = test['toxic'].apply(replace)

In [None]:
# Display the test frame, which now includes the `toxic` score and `prediction` columns.
test

In [None]:
# Bar chart of how many test comments received each predicted class.
ax = test.prediction.value_counts().plot(kind='bar')
ax.set_xlabel('toxic or non-toxic')
ax.set_ylabel('count')

In [None]:
# Pick the test comment with index label 186 as the first sample to classify.
text1=test.comment_text[186]
text1

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode an iterable of strings into fixed-length token-id rows.

    This definition replaces the earlier `fast_encode` for the rest of the
    notebook. It accepts any iterable of strings (list, Series, ...) and
    honours `chunk_size`, which was previously accepted but ignored.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer offering ``enable_truncation``, ``enable_padding``
            and ``encode_batch``.
        chunk_size: Maximum number of texts passed to ``encode_batch`` per call.
        maxlen: Length every encoded row is truncated/padded to.

    Returns:
        ``np.ndarray`` built from the ``ids`` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    texts = list(texts)
    all_ids = []
    for start in range(0, len(texts), chunk_size):
        encs = tokenizer.encode_batch(texts[start:start + chunk_size])
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
# Classify the first sample: encode it, score it, then threshold the score.
# NOTE(review): the text is encoded to 512 tokens while the model was built
# with max_len=128 — confirm the model accepts this input length.
encoded_1 = fast_encode([text1], fast_tokenizer, maxlen=512)
p1 = model.predict(encoded_1)
print("Okay contents" if replace(p1) == 0 else "Contents not permitted")

In [None]:
# Pick the test comment with index label 0 as the second sample to classify.
text2=test.comment_text[0]
text2

In [None]:
# Classify the second sample: encode it, score it, then threshold the score.
# NOTE(review): the text is encoded to 512 tokens while the model was built
# with max_len=128 — confirm the model accepts this input length.
encoded_2 = fast_encode([text2], fast_tokenizer, maxlen=512)
p2 = model.predict(encoded_2)
print("Okay contents" if replace(p2) == 0 else "Contents not permitted")

In [None]:
# Hand-written sample passage, wrapped in a one-element list so it can be
# passed to the encoder as a batch.
text3 =["Every once in a while, I get the urge. You know what I'm talking about, don't you? The urge for destruction. The urge to hurt, maim, kill. It's quite a thing to experience that urge, to let it wash over you, to give in to it. It's addictive. It's all-consuming. You lose yourself to it. It's quite, quite wonderful. I can feel it, even as I speak, tapping around the edges of my mind, trying to prise me open, slip its fingers in. And it would be so easy to let it happen. But we're all like that, aren't we? We're all barbarians at our core. We're all savage, murderous beasts. I know I am. I'm sure you are. The only difference between us, Mr. Prave, is how loudly we roar. I know I roar very loudly indeed. How about you. Do you think you can match me"]
text3

In [None]:
# Classify the hand-written passage: encode it, score it, then threshold the score.
# The decision uses p3 (this passage's score), not p2 from the previous sample.
# NOTE(review): the text is encoded to 512 tokens while the model was built
# with max_len=128 — confirm the model accepts this input length.
p3=fast_encode(text3, fast_tokenizer, maxlen=512)
p3=model.predict(p3)
if (replace(p3) == 0):
  print ("Okay contents")
else:
  print ("Contents not permitted")

In [None]:
# Bare expressions: only the last one is displayed as the cell output.
# `model.save` is referenced here but not called, so nothing is written to disk.
fast_tokenizer
model.save

In [None]:
import pickle

In [None]:
# Serialize the trained model with pickle; the `with` block guarantees the file
# handle is flushed and closed.
# NOTE(review): pickling a Keras model may not be supported by every TF version —
# confirm this round-trips, or rely on `model.save` instead.
with open('toxic1.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Serialize the tokenizer with pickle; the `with` block guarantees the file
# handle is flushed and closed.
with open('toxicvec1.pkl', 'wb') as tokenizer_file:
    pickle.dump(fast_tokenizer, tokenizer_file)

In [None]:

# Read the pickled object back, closing the file handle once loading finishes.
# Only unpickle files this notebook produced itself: loading a pickle can
# execute arbitrary code.
with open("/content/toxic1.pkl", 'rb') as dbfile:
    db = pickle.load(dbfile)

In [None]:
# Summarize the restored object to check the round trip
# (assumes `db` is the Keras model pickled earlier — TODO confirm).
db.summary()