In [1]:
#libraries - basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as plt
import tensorflow as tf
import os
import nltk
import gc 

In [2]:
from tensorflow.keras.layers import TextVectorization
from tensorboard.plugins import projector #in case want to visualize vectorization

In [6]:
df = pd.read_pickle('df_stemmed.pkl')
df = df[['txt_clean', 'label']]
df

Unnamed: 0,txt_clean,label
5,bot seriously driving crazy image exists kill,1.0
6,unbloc,0.0
7,unblockyour reason,0.0
8,sock puppet good dont confused thanks genious ...,0.0
9,glen love fag,1.0
...,...,...
379175,sensual pleasures minditalic text endless crea...,0.0
379176,sensual pleasures minditalic text endless crea...,0.0
379177,“benpobjie “there secret element we’ve decided...,0.0
379178,shame kat amp andre mkr,0.0


In [33]:
print(df['txt_tokenized'].apply(len).max(), 
df['txt_tokenized'].apply(len).mean())

2500 31.021787297873296


In [35]:
print(df['txt_clean'].apply(len).max(), 
df['txt_clean'].apply(len).mean())

13978 228.9318017963297


In [38]:
df['source'] = [x for x in df[['aggression_attack', 'kaggle', 'toxicity',
       'twitter', 'twitter_racism', 'twitter_sexism', 'youtube',
       'train_kaggle2']].to_numpy()]
df

Unnamed: 0,txt,cnt,label,aggression_attack,kaggle,toxicity,twitter,twitter_racism,twitter_sexism,youtube,train_kaggle2,txt_clean,txt_tokenized,stemmed,source
5,...,2,1.0,1,0,1,0,0,0,0,0,bot seriously driving crazy image exists kill,"[bot, seriously, driving, crazy, image, exists...","[bot, serious, drive, crazi, imag, exist, kill]","[1, 0, 1, 0, 0, 0, 0, 0]"
6,...,1,0.0,0,0,1,0,0,0,0,0,unbloc,[unbloc],[unbloc],"[0, 0, 1, 0, 0, 0, 0, 0]"
7,...,1,0.0,1,0,0,0,0,0,0,0,unblockyour reason,"[unblockyour, reason]","[unblockyour, reason]","[1, 0, 0, 0, 0, 0, 0, 0]"
8,...,2,0.0,1,0,1,0,0,0,0,0,sock puppet good dont confused thanks genious ...,"[sock, puppet, good, dont, confused, thanks, g...","[sock, puppet, good, dont, confus, thank, geni...","[1, 0, 1, 0, 0, 0, 0, 0]"
9,...,2,1.0,1,0,1,0,0,0,0,0,glen love fag,"[glen, love, fag]","[glen, love, fag]","[1, 0, 1, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379175,Sensual Pleasures of the MindItalic text\n\nTh...,1,0.0,0,0,0,0,0,0,0,1,sensual pleasures minditalic text endless crea...,"[sensual, pleasures, minditalic, text, endless...","[sensual, pleasur, mindital, text, endless, cr...","[0, 0, 0, 0, 0, 0, 0, 1]"
379176,Sensual Pleasures of the MindItalic text The...,1,0.0,0,0,1,0,0,0,0,0,sensual pleasures minditalic text endless crea...,"[sensual, pleasures, minditalic, text, endless...","[sensual, pleasur, mindital, text, endless, cr...","[0, 0, 1, 0, 0, 0, 0, 0]"
379177,😂 “@benpobjie: “There is a secret element that...,3,0.0,0,0,0,1,1,1,0,0,“benpobjie “there secret element we’ve decided...,"[“, benpobjie, “, there, secret, element, we, ...","[“, benpobji, “, there, secret, element, we, ’...","[0, 0, 0, 1, 1, 1, 0, 0]"
379178,😂😂😂 shame @ kat &amp; andre #MKR,3,0.0,0,0,0,1,1,1,0,0,shame kat amp andre mkr,"[shame, kat, amp, andre, mkr]","[shame, kat, amp, andr, mkr]","[0, 0, 0, 1, 1, 1, 0, 0]"


In [124]:
gc.collect()

34192

In [87]:
#ds preprocessing for nn
X = df['txt_clean'].values
y = df['label'].values

In [90]:
#params
vocab_size = 200000
max_length = 1800

#additional params if plain tokenizer is used:
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
# Output dimensions of the Embedding layer
embedding_dim = 32


In [17]:
vectorizer = TextVectorization(max_tokens=vocab_size,
                               output_sequence_length=max_length,
                               output_mode='int')

In [19]:
vectorizer.adapt(X)
vectorized_text = vectorizer(X)

In [44]:
#workflow - map, chache, shuffle, batch, prefetch 
#from_tensor_slices, list_file -- to resolve memory errors
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [45]:
#train-val-test split (plain, after randomization .7 - .2 - .1 respectively)
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

### Model creation

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [47]:
#mixed from ML with Nick and coursera!
model = Sequential()
# Create the embedding layer (+1 is for oov)
model.add(Embedding(vocab_size+1, 32))
# Bidirectional LSTM Layer (for better context understanding)
#tanh is required for GPU
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer - I have only 1 node for y/n toxicity
#tf.keras.layers.Dense(dense_dim, activation='relu'), 
model.add(Dense(1, activation='sigmoid'))

In [52]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [53]:
# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                        

In [54]:
history = model.fit(train, epochs=1, validation_data=val)



In [57]:
#save model
model.save('seq_LSTM')



INFO:tensorflow:Assets written to: seq_LSTM\assets


INFO:tensorflow:Assets written to: seq_LSTM\assets


In [56]:
#Test and EVALUATION
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [58]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

















In [59]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8941068053245544, Recall:0.7029440402984619, Accuracy:0.46618765592575073


In [60]:
f1 = 2*(pre.result().numpy() * re.result().numpy() / (pre.result().numpy() + re.result().numpy()))
f1

0.7870845794677734

In [None]:
#testing results

In [81]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')
f1 = 2*(pre.result().numpy() * re.result().numpy() / (pre.result().numpy() + re.result().numpy()))
print(f1)















Precision: 0.8977482914924622, Recall:0.6951277852058411, Accuracy:0.46407437324523926
0.7835509777069092


In [None]:
#trying another model 

In [88]:
#split to train-test and save to txt file
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=True)

In [84]:
#the simplest tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [91]:
# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary
tokenizer.fit_on_texts(x_train_ann)
word_index = tokenizer.word_index

# Generate and pad the training sequences
training_sequences = tokenizer.texts_to_sequences(x_train_ann)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Generate and pad the testing sequences
testing_sequences = tokenizer.texts_to_sequences(x_test_ann)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert the labels lists into numpy arrays
training_labels = np.array(y_train_ann)
testing_labels = np.array(y_test_ann)

In [92]:
wordindex_dict = tokenizer.word_index

In [None]:
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Print the model summary
model.summary()

In [None]:
# Compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
num_epochs = 30

# Train the model
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
  
# Plot the accuracy and loss
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [107]:
#trying the prebuilt Facebook AI FastText model
import fasttext
from fasttext import train_supervised

In [89]:
#prepare the text for fasttext model 
#- add '__label__'
ds_ft = '__label__' + df['label'].astype(int).astype(str) + ' ' + df['txt_clean']

In [32]:
from sklearn.model_selection import train_test_split

In [98]:
#split to train-test and save to txt file
train_ft, test_ft = train_test_split(ds_ft, test_size=0.3, random_state=1)

In [None]:
train_ft.to_csv('train_ft.txt', index = False, header = None)
test_ft.to_csv('test_ft.txt', index = False, header = None)

In [110]:
train_ft=pd.read_csv('train_ft.txt')
test_ft=pd.read_csv('test_ft.txt')

In [None]:
####trying to fit the model again just for tensorboard visualization

In [111]:
# Training the fastText classifier
model_ft2 = fasttext.train_supervised('train_ft.txt', wordNgrams = 3)

In [112]:
#test using the model
n, pre, re = model_ft2.test("test_ft.txt") #(n, precision, recall)
f1 = 2*(pre * re / (pre + re))
f1

0.9571999507033575

In [102]:
# Training the fastText classifier
model_ft1 = fasttext.train_supervised('train_ft.txt', wordNgrams = 2)

In [105]:
#test using the model
n, pre, re = model_ft1.test("test_ft.txt") #(n, precision, recall)
f1 = 2*(pre * re / (pre + re))
f1

0.9569274377415294

In [107]:
#trying autofinetuning opt
#spliting test to test and val:
test_ft, val_ft = train_test_split(test_ft, test_size=0.5, random_state=1)

In [109]:
#to txt
val_ft.to_csv('val_ft.txt', index = False, header = None)
test_ft.to_csv('test_ft.txt', index = False, header = None)

In [110]:
#autotune for 10 min on val, f1 score is used by default
model_ft2 = fasttext.train_supervised(input='train_ft.txt', autotuneValidationFile='val_ft.txt', autotuneDuration=600)

In [111]:
n, pre, re = model_ft2.test("test_ft.txt") #(n, precision, recall)
f1 = 2*(pre * re / (pre + re))
f1

0.9564605010651596

#### Trying distilBert model with extra layers for my ds training (embedding the dB to my model)

Using the model described by Swatimeena, Bombai IIT (https://gist.github.com/swati210994/963e084e8b76e8b5065a360d6d0741a0)

In [28]:
from transformers import AutoTokenizer

dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [9]:
ds = tf.data.Dataset.from_tensor_slices(dict(df))

In [13]:
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

In [36]:
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
from tensorflow.keras import regularizers

In [14]:
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading: 100%|██████████████████████████████████████████████████████████████████| 363M/363M [00:55<00:00, 6.52MB/s]
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBert

In [15]:
def create_model():
    inps = Input(shape = (max_len,), dtype='int64')
    masks= Input(shape = (max_len,), dtype='int64')
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:,0,:]
    dense = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout= Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps,masks], outputs=pred)
    print(model.summary())
    return model   

In [25]:
max_len=32
sentences=df['txt_clean']
labels=df['label']
num_classes=len(df.label.unique())
len(sentences),len(labels), num_classes

(378661, 378661, 2)

In [26]:
model=create_model()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_5[0][0]',                
 BertModel)                     ast_hidden_state=(N               'input_6[0][0]']                
                                one, 32, 768),                                                    
                                 hidden_states=None                                           

In [29]:
input_ids=[]
attention_masks=[]

for sent in sentences:
    dbert_inps=dbert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =max_len,pad_to_max_length = True,return_attention_mask = True,truncation=True)
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)



In [33]:
#Train-Test split and setting up the loss function, accuracy and optimizer for the model.
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2)

In [34]:
print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape,val_inp.shape,train_label.shape,val_label.shape,train_mask.shape,val_mask.shape))

Train inp shape (302928, 32) Val input shape (75733, 32)
Train label shape (302928,) Val label shape (75733,)
Train attention mask shape (302928, 32) Val attention mask shape (75733, 32)


In [37]:
log_dir='dbert_model'
model_save_path='./dbert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,monitor='val_loss',mode='min',save_best_only=True),keras.callbacks.TensorBoard(log_dir=log_dir)]

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

In [38]:
model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

In [39]:
#Training
history=model.fit([train_inp,train_mask],train_label,batch_size=16,epochs=5,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)

Epoch 1/5


  output, from_logits = _get_logits(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [40]:
#save model
model.save('dbert_embeded_seq_model')





INFO:tensorflow:Assets written to: dbert_embeded_seq_model\assets


INFO:tensorflow:Assets written to: dbert_embeded_seq_model\assets


In [41]:
%load_ext tensorboard

In [42]:
#visualizing loss and accuracy
%tensorboard --logdir {log_dir}

In [43]:
trained_model = create_model()
trained_model.compile(loss=loss,optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_7[0][0]',                
 BertModel)                     ast_hidden_state=(N               'input_8[0][0]']                
                                one, 32, 768),                                                    
                                 hidden_states=None                                         

In [45]:
from sklearn.metrics import confusion_matrix,f1_score,classification_report

In [None]:
preds = trained_model.predict([val_inp,val_mask],batch_size=16)
pred_labels = preds.argmax(axis=1)

In [46]:
f1 = f1_score(val_label,pred_labels)
f1

0.8265245707519242

In [47]:
#printing the report
target_names=['non-toxic','toxic']
print('F1 score',f1)
print('Classification Report')
print(classification_report(val_label,pred_labels,target_names=target_names))

F1 score 0.8265245707519242
Classification Report
              precision    recall  f1-score   support

   non-toxic       0.98      0.98      0.98     67112
       toxic       0.84      0.81      0.83      8621

    accuracy                           0.96     75733
   macro avg       0.91      0.90      0.90     75733
weighted avg       0.96      0.96      0.96     75733



In [None]:
#embeddings and loss visualization with tensorboard

In [2]:
%load_ext tensorboard

In [48]:
from tensorboard.plugins import projector

In [54]:
#os.path.abspath('/logs/fit/metadata.tsv')

'C:\\logs\\fit\\metadata.tsv'

In [100]:
# create logging directory
log_dir='/logs/fit/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# write wordindex dictionary
with open(os.path.join(log_dir, 'metadata1.tsv'), "w", encoding="utf-8") as f:
  for w in wordindex_dict:
    f.write("{}\n".format(w))

In [65]:
#open model
model= tf.keras.models.load_model('seq_LSTM')

# Show the model architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                        

In [101]:
# weights from the embedding layer 
weights = tf.Variable(model.get_layer('embedding').get_weights()[0])
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

'/logs/fit/embedding.ckpt-1'

In [102]:
weights

<tf.Variable 'Variable:0' shape=(200001, 32) dtype=float32, numpy=
array([[-1.45969748e-01, -1.84923023e-01, -8.35564956e-02, ...,
         1.21140964e-01, -2.00795919e-01, -1.09345019e-01],
       [-1.04358867e-01, -1.18014142e-01, -3.59479375e-02, ...,
        -2.73233629e-04,  3.64161246e-02, -2.90092025e-02],
       [-2.76752621e-01, -1.84305340e-01,  1.86819315e-01, ...,
        -5.75037599e-02,  6.71313331e-02, -1.67743087e-01],
       ...,
       [-5.72445802e-02,  3.24377194e-02, -1.68376733e-02, ...,
         4.11397740e-02, -4.31021713e-02, -4.28487584e-02],
       [-6.64273873e-02, -2.96788141e-02, -1.31438347e-02, ...,
        -3.97349373e-02,  2.32337322e-03,  1.35388854e-03],
       [ 1.44541003e-02, -3.39642987e-02, -1.44375786e-02, ...,
         4.97754477e-02, -3.06885131e-02, -2.75944229e-02]], dtype=float32)>

In [103]:
# configuration set-up
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata1.tsv'
projector.visualize_embeddings(log_dir, config)

In [106]:
config.embeddings

[tensor_name: "embedding/.ATTRIBUTES/VARIABLE_VALUE"
metadata_path: "metadata1.tsv"
]

In [123]:
%tensorboard --logdir /log/

In [113]:
word2vec = model_ft2

In [122]:
embedding = np.empty((len(word2vec.words), word2vec.dim), dtype=np.float32)
for i, word in enumerate(word2vec.words):
    embedding[i] = word2vec[word]

# create logging directory
log_dir='/log/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)


# write labels
with open(os.path.join(log_dir, 'metadata.tsv'), 'w', encoding="utf-8") as f:
    for word in word2vec.words:
        f.write(word + '\n')

config = projector.ProjectorConfig()
embedding_conf = config.embeddings.add()
embedding_conf.tensor_name = 'embedding:0'
embedding_conf.metadata_path = os.path.join('log', 'metadata.tsv')
projector.visualize_embeddings(log_dir, config)
