## This notebook contains experiment of Wikipedia talk pages comments. These comments are annotated by 10 persons on crowdflower platform.

### Number of Experiments in this notebook:
1. LSTM with embeddings from scratch (With and without drop out)
2. LSTM with fasttext embeddings (With dropout) (Not in the notebook yet due to infra limitations)
3. LSTM with CNN 

Help with models: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
Help with pre-trained embedding: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [31]:
import pandas as pd
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import confusion_matrix
from tensorflow import keras

from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

##Preprocessing
import nltk
import string
import re    
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

In [2]:
comments_train = pd.read_csv('train.csv') 

In [3]:
comments_train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [4]:
comments_train = comments_train[['comment_text','toxic']]

In [7]:
## Put data processing code here later.
stoplist = set(stopwords.words("english"))

class Remove_Noise(object):
    
    def __init__(self,stop_word = stoplist):
        self.stop_word = stoplist
    
    def noise_rm(self,doc):
        doc = re.sub('[#$%^&\',:()*+/<=>@[\\]^_``{|}~]',' ',doc)
        doc = re.sub('[0-9]+',' ',doc)
        doc = re.sub('\n','',doc)
        doc = re.sub(' +',' ',doc)
        doc = doc.lower()
        return doc
    
    def lemmatize(self,token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        lemmatizer = WordNetLemmatizer()
        return lemmatizer.lemmatize(token, tag)
    
    def tokenize(self,document): 
        #document = unicode(document,'utf-8')
        lemmy = []
        for sent in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                if token in self.stop_word:
                    continue
                lemma = self.lemmatize(token, tag)
                lemmy.append(lemma)
        return lemmy
    
def join_tokens(data):
    ans = ' '.join(data)
    return ans

In [8]:
remover = Remove_Noise()

In [9]:
comments_train['comment_text'].fillna(' ', inplace=True)

In [10]:
comments_train['comment_text'] = comments_train['comment_text'].apply(remover.noise_rm)

In [11]:
## this will take time so after cleaning the data, save it!
comments_train['comment_text'] = comments_train['comment_text'].apply(remover.tokenize)

In [12]:
comments_train['comment_text'] = comments_train['comment_text'].apply(join_tokens)

In [13]:
comments_train.to_csv('processed_toxic_data.csv')

In [14]:
t = Tokenizer()
t.fit_on_texts(comments_train['comment_text'])
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(comments_train['comment_text'])
#print(comments_train)

In [15]:
vocab_size = len(t.word_index) + 1
print(vocab_size)

180295


In [16]:
print(encoded_docs[:1])

[[77731, 74, 7, 545, 3479, 8812, 616, 75, 132, 4983, 1980, 472, 61, 1007, 9580, 2225, 8, 36, 193, 4, 2, 81, 2411]]


In [17]:
max_length = 80
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [18]:
y = comments_train['toxic']
X = padded_docs
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42, stratify=y)

In [35]:
class_weight = {0: 1.,
                1: 8.}

In [20]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

In [21]:
# create the model
embedding_vecor_length = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
print(model.summary())
model.fit(X_train, y_train, epochs=1, batch_size=128,class_weight=class_weight)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 100)           18029500  
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 18,110,001
Trainable params: 18,110,001
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f89d06d2c10>

In [22]:
y_pred = model.predict_classes(X_test)

In [24]:
#y_pred_bool = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96     47612
           1       0.82      0.35      0.49      5047

    accuracy                           0.93     52659
   macro avg       0.88      0.67      0.73     52659
weighted avg       0.92      0.93      0.92     52659



In [23]:
confusion_matrix(y_test, y_pred,labels=[0,1])

array([[47228,   384],
       [ 3271,  1776]])

## Results 1: LSTM with embeddings from scratch (without drop out)
### Class weights 1:10
Confusion Matrix
[13766, 33846]
[  627,  4420]

### Class weights 1:5 + Data preprocessing
Confusion Matrix
[47228,   384]
[ 3271,  1776]

In [27]:
# create the model
embedding_vecor_length = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
print(model.summary())
model.fit(X_train, y_train, epochs=1, batch_size=128,class_weight=class_weight)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 80, 100)           18029500  
_________________________________________________________________
dropout_1 (Dropout)          (None, 80, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 18,110,001
Trainable params: 18,110,001
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f89bd14aa90>

### Results 2: LSTM with embeddings from scratch (with drop out)

In [28]:
y_pred = model.predict_classes(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     47612
           1       0.86      0.25      0.38      5047

    accuracy                           0.92     52659
   macro avg       0.89      0.62      0.67     52659
weighted avg       0.92      0.92      0.90     52659



In [29]:
confusion_matrix(y_test, y_pred,labels=[0,1])

array([[47407,   205],
       [ 3807,  1240]])

In [None]:
# create the model - Reccurent dropout to be experimented with later
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)

In [36]:
## LSTM with CNN
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)
print(model.summary())
model.fit(X_train, y_train, epochs=1, batch_size=128,class_weight=class_weight)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 80, 32)            5769440   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 80, 32)            3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 40, 32)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 5,825,845
Trainable params: 5,825,845
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f89d1070a90>

### Results 3: LSTM with CNN.
FN have dropped by a great number almost by a factor of 6 as compared to last model.

In [33]:
y_pred = model.predict_classes(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96     47612
           1       0.62      0.86      0.72      5047

    accuracy                           0.94     52659
   macro avg       0.80      0.90      0.84     52659
weighted avg       0.95      0.94      0.94     52659



In [34]:
confusion_matrix(y_test, y_pred,labels=[0,1])

array([[44920,  2692],
       [  684,  4363]])

### Results 3.1 Class weight 1:8
#### Model's ability to learn toxicity is improving but it's ability to learn non-toxic is decreasing. 

In [37]:
y_pred = model.predict_classes(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95     47612
           1       0.55      0.90      0.68      5047

    accuracy                           0.92     52659
   macro avg       0.77      0.91      0.82     52659
weighted avg       0.95      0.92      0.93     52659



In [38]:
confusion_matrix(y_test, y_pred,labels=[0,1])

array([[43867,  3745],
       [  498,  4549]])