<a href="https://colab.research.google.com/github/huanyanwei/ai-projects/blob/main/Yan_Wei_ANLP_TF2_Toxic_Words_Text_Classification_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab line magic requesting the TensorFlow 2.x runtime; it must run before
# `import tensorflow`. NOTE(review): this magic is Colab-specific — confirm it
# is still needed/available in the target environment.
%tensorflow_version 2.x

In [None]:
!wget -qq https://www.dropbox.com/s/fz2d61pwgigtra7/toxic_words.zip
!unzip toxic_words.zip

!ls

Archive:  toxic_words.zip
  inflating: test_labels.csv         
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
sample_data	       test.csv		toxic_words.zip
sample_submission.csv  test_labels.csv	train.csv


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

from tensorflow.keras.optimizers import Adam

In [None]:
# train.csv holds the labelled comments the model is fitted on;
# test.csv holds the comments whose classification has to be filled in.
# Missing values in either file are replaced with the placeholder "blank".
train_df = pd.read_csv("./train.csv")
train_df = train_df.fillna("blank")

test_df = pd.read_csv("./test.csv")
test_df = test_df.fillna("blank")

In [None]:
# Preview the first rows of the labelled data: id, comment text and the six label columns.
train_df.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [None]:
# (rows, columns) of the labelled training frame.
train_df.shape

(159571, 8)

In [None]:
# Model input: the raw comment strings.
X_train = train_df["comment_text"].to_numpy()

# Model targets: one column per toxicity category, so every comment carries
# six independent labels (multi-label, not multi-class).
y_train = train_df[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].to_numpy()

In [None]:
# Label vector of the first training comment (one entry per toxicity category).
y_train[0]

array([0, 0, 0, 0, 0, 0])

In [None]:
# Raw text of the first training comment, before any tokenisation.
X_train[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

## **My attempt**
1. Create list of all the words
2. Tokenise the words
3. Pad each line to the same length
4. Build and train model


In [None]:
# Raw comment strings to classify. No labels are loaded for the test set in
# this notebook, so there is no y_test to evaluate against.
X_test = test_df["comment_text"].to_numpy()

In [None]:
# Split the labelled training data into
#     X_train2, y_train2 --> training data (75%) and
#     X_validn, y_validn --> validation data (25%)
# random_state pins the shuffle so the same rows land in each split on every
# Restart & Run All; without it the validation metrics are not comparable
# between runs.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

from sklearn.model_selection import train_test_split

X_train2, X_validn, y_train2, y_validn = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# **Now I have 3 types of data:**


1.   train2 (X_train2, y_train2)
2.   valdn (X_validn, y_validn)
3.   test (X_test)



**Tokenizer**

In [None]:
# Gather every comment from all three datasets (train2, validation and test)
# into one corpus so the vocabulary statistics cover all of them.
# NOTE(review): this lets validation/test text influence the vocabulary —
# confirm that is intended.
all_comments = [*X_train2, *X_validn, *X_test]
print("There are a total of", len(all_comments), "comments in all of the data")

# First pass: an uncapped tokenizer, used only to count the distinct words.
t = Tokenizer()
t.fit_on_texts(all_comments)

total_num_of_words = len(t.word_counts)
print("There are a total of", total_num_of_words, "distinct words in all of the data")



There are a total of 312735 comments in all of the data
There are a total of 394787 distinct words in all of the data


In [None]:
# Second pass: cap the vocabulary at 75% of the distinct words counted above.
# num_words is also reused later as the Embedding layer's input dimension.
num_words = int(total_num_of_words * 0.75)

t2 = Tokenizer(num_words=num_words)
t2.fit_on_texts(all_comments)

In [None]:
# Turn every comment into its sequence of integer word indices, using the
# capped tokenizer for both the training and the validation split.
encoded_train, encoded_valdn = (
    t2.texts_to_sequences(texts) for texts in (X_train2, X_validn)
)


**Padding Sequence**

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network; shorter comments are padded at
# the front ('pre') so that every row has exactly max_length entries.
max_length = 50

padded_train, padded_valdn = (
    pad_sequences(encoded, maxlen=max_length, padding='pre')
    for encoded in (encoded_train, encoded_valdn)
)


In [None]:
# Inspect one padded validation sequence (row index 1) as a sanity check.
padded_valdn[1]

array([13691,     7,    43,    48,    88,     2,    16,    15,     1,
       21805,  2270,   174,     5,    19,   173,     1,   770,    50,
          63,    74,     7,    90,    11,     9,     7,   220,    56,
         460,    88,     2,    92,   481,  1238,   197,    51,    59,
         364,     2,   314,    73,     5,     7,   229,    79,     1,
         370,   936,    55,  1123,    95], dtype=int32)

In [None]:
# Sanity-check array shapes before training: padded validation inputs,
# training labels, then (as the cell's displayed value) validation labels.
print(padded_valdn.shape, y_train2.shape, sep="\n")
y_validn.shape


(39893, 50)
(119678, 6)


(39893, 6)

**Create Model**

Model based on Colab shared by lecturers.

In [None]:
# LSTM + Embedding text-classification network (functional API).

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop

# Input: one padded sequence of max_length word indices per comment.
Inp = Input(shape=[max_length], name='inputs')

# Each word index is mapped to a 50-dimensional learned vector.
x = Embedding(input_dim=num_words, output_dim=50, input_length=max_length)(Inp)
x = LSTM(units=64, name='LSTM_01')(x)
x = Dense(units=256, activation='relu', name='Dense_01')(x)
x = Dropout(rate=0.5, name='Dropout')(x)

# Six sigmoid units, one per label: each category is predicted independently.
out = Dense(units=6, activation='sigmoid', name='output')(x)

In [None]:
# Assemble the graph into a model. Binary cross-entropy matches the six
# independent sigmoid outputs of this multi-label problem.
model = Model(inputs=Inp, outputs=out)
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 50)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            14804500  
_________________________________________________________________
LSTM_01 (LSTM)               (None, 64)                29440     
_________________________________________________________________
Dense_01 (Dense)             (None, 256)               16640     
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
output (Dense)               (None, 6)                 1542      
Total params: 14,852,122
Trainable params: 14,852,122
Non-trainable params: 0
_______________________________________________

**Start Training**

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training as soon as validation loss fails to improve by at least
# min_delta. patience is left at its default, so the first non-improving
# epoch ends training — by then the weights are those of the worse epoch.
# restore_best_weights=True rolls the model back to the epoch with the best
# val_loss so later predictions do not use the degraded weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    restore_best_weights=True,
)

In [None]:
# Train on the 75% split and evaluate on the held-out 25% after every epoch;
# the early-stopping callback may end training before all 10 epochs run.
model.fit(
    x=padded_train,
    y=y_train2,
    batch_size=128,
    epochs=10,
    validation_data=(padded_valdn, y_validn),
    callbacks=[early_stop],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<tensorflow.python.keras.callbacks.History at 0x7f3b0c3ee7b8>

Encode and Pad the Test Data!

In [None]:
# The test comments go through the same tokenizer and the same padding
# settings (length and side) as the training data.
encoded_test = t2.texts_to_sequences(X_test)
padded_test = pad_sequences(
    sequences=encoded_test,
    maxlen=max_length,
    padding='pre',
)
print(padded_test[0])

[  1487 251803    578   5873      5     94      6      2   3771     30
    339      6    742  34978     37   4884    724      8     35   4220
     10   1205    653    400    476  17275      9    227     15    154
      5  20164      8    247  23359     48   4329     52     24      4
   2108    155   2432    578   2429     94    218    143    490     85]


Send test data into model to predict...

In [None]:
# Score every padded test comment; the output layer has six sigmoid units,
# so each row holds one value per toxicity category.
prediction = model.predict(padded_test)
# Display the scores for the first test comment.
prediction[0]

array([0.99459165, 0.34296796, 0.95605326, 0.04550057, 0.8812876 ,
       0.23515476], dtype=float32)

In [None]:
# Round each sigmoid score to the nearest of 0 / 1 to get hard labels.
round_predictions = prediction.round()

# Attach the predicted labels to the test comments, column-wise.
predicted_labels_df = pd.DataFrame(
    round_predictions,
    columns=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
results_df = pd.concat([test_df, predicted_labels_df], axis=1)

results_df.head(50)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,1.0,0.0,1.0,0.0,1.0,0.0
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.0,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,":If you have a look back at the source, the in...",0.0,0.0,0.0,0.0,0.0,0.0
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.0,0.0,0.0,0.0,0.0,0.0
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0.0,0.0,0.0,0.0,0.0,0.0
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,0.0,0.0,0.0,0.0,0.0,0.0
7,000247e83dcc1211,:Dear god this site is horrible.,0.0,0.0,0.0,0.0,0.0,0.0
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ...",0.0,0.0,0.0,0.0,0.0,0.0
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,0.0,0.0,0.0,0.0,0.0,0.0


----------------------------------- **End of Challenge** ----------------------------------------

**Addition**

It was not actually necessary to split the training data manually into train2 and validation sets: `model.fit` can hold out part of the training data for validation automatically via its `validation_split` argument.

Below I try again without splitting the data manually, and with a different model...

In [None]:
# Second experiment: replace the LSTM + Dense stack with global max pooling
# over the embedded sequence.

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, GlobalMaxPooling1D
from tensorflow.keras.optimizers import RMSprop

# Input: one padded sequence of max_length word indices per comment.
Inp = Input(shape=[max_length], name='inputs')

# Embed each word index into 50 dimensions, then keep the per-dimension
# maximum across the sequence.
x = Embedding(input_dim=num_words, output_dim=50, input_length=max_length)(Inp)
x = GlobalMaxPooling1D()(x)
x = Dropout(rate=0.5, name='Dropout')(x)

# Six sigmoid units, one per label: each category is predicted independently.
out = Dense(units=6, activation='sigmoid', name='output')(x)

In [None]:
# Assemble the second model; this experiment swaps the optimiser to Adam
# with a learning rate of 0.01.
model2 = Model(inputs=Inp, outputs=out)
model2.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model2.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 50)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 50, 50)            14804500  
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
Dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
output (Dense)               (None, 6)                 306       
Total params: 14,804,806
Trainable params: 14,804,806
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Encode and pad the FULL labelled training set (no manual split this time),
# trying "post" padding instead of "pre".
# The padded array gets its own name so the earlier padded_train (75% split,
# 'pre' padding) is not silently overwritten if cells are re-run.
encoded_full_train = t2.texts_to_sequences(X_train)
padded_full_train = pad_sequences(encoded_full_train, maxlen=max_length, padding='post')

# The padded rows and the labels must line up one-to-one.
assert padded_full_train.shape[0] == y_train.shape[0]

# change early_stop 0.0001 --> 0.001
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.001)

# Reduce the number of epochs to 3 as the previous training was completed within 4.
# The model must be fed the padded integer sequences, not the raw comment
# strings in X_train: its input layer expects max_length word indices per row.
# validation_split lets Keras hold out 20% of the data for validation itself.
model2.fit(
    padded_full_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/3


In [None]:
# Score the padded test comments with the second model. This rebinds
# `prediction`, replacing the first model's scores.
prediction = model2.predict(padded_test)
# Display the scores for the first test comment.
prediction[0]

In [None]:
# Round each sigmoid score to the nearest of 0 / 1 to get hard labels.
round_predictions = prediction.round()

# Attach the predicted labels to the test comments, column-wise.
predicted_labels_df = pd.DataFrame(
    round_predictions,
    columns=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
results_df = pd.concat([test_df, predicted_labels_df], axis=1)

results_df.head(50)