In this notebook, we will test a pooled RCNN model.

# 1. Preparation
We first need to import the required libraries, download the data, and load it into memory.

## 1.1 Import

In [1]:
print('Importing required packages...')

from IPython.display import clear_output
import re
import csv
import pandas as pd
import numpy as np
# Seed NumPy's global random generator with a fixed value. Calling seed() without
# an argument re-seeds from fresh entropy, so NumPy-based randomness in this
# notebook (e.g. the random initialisation of the embedding matrix) would differ
# on every "Restart & Run All".
# NOTE(review): this does not by itself make the Keras training deterministic.
np.random.seed(42)
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer 
nltk.download('wordnet')
from keras.preprocessing import sequence
from keras.preprocessing import text as ktxt
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, GRU, SpatialDropout1D, Bidirectional
from keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import class_weight


def hint(message):
    """
    Show a single status line in the current cell output.

    The existing output of the cell is cleared first and `message` is printed
    afterwards, so only the most recent status remains visible.

    :param message: object to print (passed unchanged to ``print``)
    """
    clear_output()
    print(message)

  

Importing required packages...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using TensorFlow backend.


## 1.2 Loading the Data

In [2]:
hint('loading data...')
# Keep the complete labelled table under its own name; `train` and `valid`
# below are the two disjoint parts produced by the split.
labelled = pd.read_csv('data/train.csv')
# A fixed random_state makes the 80/20 split identical on every run, so the
# numbers reported further down can be reproduced.
train, valid = train_test_split(labelled, test_size=0.2, random_state=42)

# Target columns; their order defines the column order of Ytr / Yva.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate'
]

Ytr = train[labels].values
Yva = valid[labels].values

hint('Label distribution between training and validation set:')
# Mean of each label column, i.e. the share of positive rows per label.
print(pd.DataFrame({
    'label': labels,
    'train': train[labels].mean().values,
    'validation': valid[labels].mean().values,
}))

Label distribution between training and validation set:
           label     train  validation
0          toxic  0.096298    0.094031
1   severe_toxic  0.009941    0.010215
2        obscene  0.052790    0.053580
3         threat  0.002992    0.003008
4         insult  0.049351    0.049413
5  identity_hate  0.008844    0.008648


# 2. Pre-processing the Input
There are many ways to pre-process the raw strings into valid input for the model. Here we do it by building a dictionary from all the comments in the training set, mapping each word to its index in that dictionary, and padding/cropping the resulting sequences so that they all have the same length.

## 2.1 Cleaning Input

In [3]:
# Tokenizer shared by preprocess(); preserve_case=False requests lower-cased tokens.
tkzr = TweetTokenizer(preserve_case=False)
# Words dropped by preprocess(). Stored as a frozenset so that the per-token
# membership test is a hash lookup instead of a scan over the whole collection;
# the collection stays immutable.
eng_stopwords = frozenset((
    'what', 'which', 'who', 'whom',
    'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'a', 'an', 'the',
    'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    'then', 'once', 'here',
    'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very',
    'can', 'will', 'just', 'don', 'should', 'now'
))
# WordNet lemmatizer shared by preprocess(), which calls it with part-of-speech 'v'.
lmtzr = WordNetLemmatizer()
# Contraction -> expansion table used by preprocess().
# Every key must appear exactly once: in a dict literal a repeated key silently
# overrides the earlier value.
appos = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "'d" is ambiguous (would / had); expanded as "would" like every other 'd entry
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
}

def preprocess(comment):
    """
    Normalise one raw comment into a space-separated string of tokens.

    Steps, in order:
      1. mask IP addresses and wiki user links with placeholder words,
      2. replace newlines and tabs with a space,
      3. tokenize with `tkzr`,
      4. expand contractions listed in `appos` into separate lower-case words,
      5. drop tokens contained in `eng_stopwords`,
      6. lemmatize the remaining tokens as verbs with `lmtzr`.

    :param comment: raw comment text (str)
    :return: the cleaned tokens joined by single spaces (str)
    """
    # credit to the author of this post:
    # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

    # mask IP addresses (raw strings keep `\d`, `\.` and `\[` as regex escapes
    # instead of invalid Python string escapes)
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' specipaddress ', comment)

    # mask user links. The match stops at the first closing bracket (or, for
    # links that are never closed, at the first pipe) on the same line, so text
    # following the link is kept; a greedy `.*` would swallow everything up to
    # the last `]` / `|` of the line.
    comment = re.sub(r"\[\[User[^\]\n]*\]{1,2}", ' specusername ', comment)
    comment = re.sub(r"\[\[User[^|\n]*\|", ' specusername ', comment)

    # remove special format: every newline / tab becomes a space. A character
    # class is required here; the pattern '\n\t' would only match a newline
    # immediately followed by a tab. Replacing with a space (not '') keeps the
    # words on both sides of a line break separate.
    comment = re.sub(r'[\n\t]+', ' ', comment)

    # tokenization
    tokens = tkzr.tokenize(comment)

    # apostrophe replacement: each expansion is split into its individual,
    # lower-cased words so that the stop-word filter and the lemmatizer below
    # see them one by one instead of as a single multi-word token
    expanded = []
    for token in tokens:
        if token in appos:
            expanded.extend(appos[token].lower().split())
        else:
            expanded.append(token)

    # remove stopwords
    kept = [token for token in expanded if token not in eng_stopwords]

    # lemmatization (tokens are treated as verbs)
    lemmas = [lmtzr.lemmatize(token, 'v') for token in kept]

    return " ".join(lemmas)
  

# Clean both splits with preprocess(); the function is passed directly because a
# lambda wrapper adds nothing.
hint('Cleaning train set...')
Xtr = train['comment_text'].apply(preprocess)
# The second split is the validation set (`valid`), not a test set.
hint('Cleaning validation set...')
Xva = valid['comment_text'].apply(preprocess)
hint('Done')

Done


## 2.2 Transforming Comments to Sequences

In [4]:
# Settings of this stage: vocabulary size handed to the Keras tokenizer and the
# fixed length every comment is padded/cropped to.
vocab_max = 100000
max_comment_length = 200  # padded/cropped comment length

hint('Fitting the tokenizer...')
# The word index is built from the training comments only.
tokenizer = ktxt.Tokenizer(num_words=vocab_max)
tokenizer.fit_on_texts(Xtr)

hint('Tokenizing...')
# Both splits are converted with the same fitted tokenizer, training set first.
Xtr, Xva = (tokenizer.texts_to_sequences(texts) for texts in (Xtr, Xva))

hint('padding the sequences...')
Xtr, Xva = (
    sequence.pad_sequences(seqs, maxlen=max_comment_length)
    for seqs in (Xtr, Xva)
)

hint('Done')

Done


# 3. Training Model

In [5]:
hint("Loading pre-embedding file...")
# The file is read as a space-separated table without header: column 0 (the
# token) becomes the index, the remaining columns are the vector components.
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
emb = pd.read_table(
    'preembedding/glove.6B.300d.txt',
    sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE
)

hint("Preparing embedding matrix...")
embedding_dimension = 300
# Start from random rows drawn per dimension from a normal distribution with the
# mean / standard deviation of the pre-trained vectors; rows of words without a
# pre-trained vector keep this initialisation.
embedding_matrix = np.random.normal(
    emb.mean(axis=0),
    emb.std(axis=0),
    (vocab_max, embedding_dimension)
)
hint("Constructing embedding matrix")
for word, i in tokenizer.word_index.items():
    if i < vocab_max and word in emb.index:
        # `.values` replaces Series.as_matrix(), which was deprecated and later
        # removed from pandas; `.values` works on old and new versions alike.
        embedding_matrix[i] = emb.loc[word].values

hint("Done")
# optional: free memory:
emb = None

Done


In [6]:
# Build and compile the network: embedding -> 1D convolution -> bidirectional GRU
# -> concatenated global average/max pooling -> 6 sigmoid outputs.

# Reset the names that are rebuilt below.
model = None
x = None
# One integer sequence of length max_comment_length per comment.
sequence_input = Input(shape=(max_comment_length, ))
# Embedding layer whose initial weights are taken from embedding_matrix.
x = Embedding(vocab_max, embedding_dimension, weights=[embedding_matrix])(sequence_input)
x = SpatialDropout1D(0.5)(x)
# Convolution over windows of 3 positions, followed by pooling with pool_size=2.
x = Conv1D(filters=256, kernel_size=3, padding='same', activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = SpatialDropout1D(0.2)(x)
# return_sequences=True keeps one GRU output per step, which the two global
# pooling layers below reduce over.
x = Bidirectional(GRU(64, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# 6 sigmoid units — presumably one per entry of `labels`; the targets Ytr/Yva have 6 columns.
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

Now we train the model.

In [7]:
# Training schedule: both values are forwarded unchanged to model.fit() below.
epochs = 4
batch_size = 64

def get_class_weight(x, k=100):
    """
    Return a weight that shrinks as `x` grows: ``3.32 * ln(k / x + 1)``.

    :param x: positive number the weight is derived from. NOTE(review): the
        values passed in this notebook look like label frequencies per
        thousand samples — confirm.
    :param k: scale of the ratio ``k / x``; defaults to 100, the value that
        used to be hard-coded inside the function.
    :return: the weight as a float
    """
    # NOTE(review): 3.32 is approximately log2(10); together with the natural
    # logarithm used here this may have been meant as a base-2 logarithm — confirm.
    return 3.32*np.log(k/x + 1)
    

# Arguments handed to get_class_weight, one per output index 0..5.
# NOTE(review): Keras describes class_weight as a mapping from class index to a
# weight; confirm that it has the intended per-label effect with a 6-column
# multi-label target.
weight_inputs = (98, 10, 53, 2, 49, 8)

history = model.fit(
    Xtr, Ytr,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(Xva, Yva),
    class_weight={
        index: get_class_weight(value)
        for index, value in enumerate(weight_inputs)
    },
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Next, we make predictions on the validation set.

In [8]:
hint("Making prediction...")
# Yva_ holds the model outputs for the validation inputs; the analysis cells
# below compare it against thresholds.
Yva_ = model.predict(Xva)
hint("Done")

Done


# 4. Result Analysis
## 4.1 Global Accuracy

In [9]:
# Try the thresholds 0.1 ... 0.9 and keep the one with the highest element-wise
# accuracy over all (sample, label) pairs of the validation set.
total_sample = Xva.shape[0]
print("validation set sample count: %d\n" % total_sample)
prediction_total = total_sample*Yva.shape[1]

best_t, best_accuracy = None, 0
for step in range(1, 10):
    t = step*0.1
    matches = np.sum(Yva == (Yva_ >= t))
    accuracy = matches/prediction_total
    print("accuracy for threshold %.1f: %.2f%%" % (t, accuracy*100))
    # strict comparison: on ties the smaller threshold is kept
    if accuracy > best_accuracy:
        best_t, best_accuracy = t, accuracy

# Binarised predictions at the selected threshold and their element-wise
# agreement with the labels; both are used by the per-class analysis.
Yva_T = Yva_ >= best_t
correct = Yva == Yva_T
print("\nbest threshold: %.1f" % best_t)
print("best accuracy: %.2f%%" % (best_accuracy*100))

validation set sample count: 31915

accuracy for threshold 0.1: 96.92%
accuracy for threshold 0.2: 97.64%
accuracy for threshold 0.3: 98.01%
accuracy for threshold 0.4: 98.21%
accuracy for threshold 0.5: 98.34%
accuracy for threshold 0.6: 98.41%
accuracy for threshold 0.7: 98.39%
accuracy for threshold 0.8: 98.30%
accuracy for threshold 0.9: 98.07%

best threshold: 0.6
best accuracy: 98.41%


## 4.2 Accuracy by Classes

In [10]:
# Summary table with one row per metric; analyze_class() adds one column per label.
overview = pd.DataFrame(index=[
    'label‰ of all',
    'total wrong', 
    'P->N', 
    'N->P', 
    'P->N %', 
    'N->P %',
    'avg len',
])

def _share(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    return 100*part/whole if whole else 0.0


def _report_group(rows, message, total_class_error, max_samples=5):
    """
    Print the size of one error group and display some of its comments.

    :param rows: DataFrame with the misclassified rows of the group
    :param message: %-format string taking the row count and its share of
        `total_class_error`
    :param total_class_error: number of all errors of the analysed class
    :param max_samples: upper bound for the number of random comments shown
    :return: number of rows in `rows`
    """
    # explicit import so the helper does not depend on `display` being injected
    # into the builtins by the running IPython version
    from IPython.display import display

    count = len(rows)
    print(message % (count, _share(count, total_class_error)))
    if count > 0:
        print("Samples:")
        # never ask for more rows than exist; sample() raises otherwise
        for sample in rows.sample(min(max_samples, count))['comment_text']:
            display(sample)
    return count


def analyze_class(i):
    """
    Print an error report for label column `i` and record it in `overview`.

    Reads the notebook globals `valid`, `Xva`, `Yva`, `Yva_T`, `correct`,
    `total_sample` and `labels`. Prints the number of wrong predictions, the
    average number of non-zero entries of the wrongly predicted sequences, and
    the positive->negative / negative->positive counts together with up to five
    random comments of each group. Finally stores the figures as column
    ``labels[i]`` of the global `overview` table.

    A class without any wrong prediction is reported with 0 for the
    percentages and the average length instead of raising.

    :param i: column index into the label matrices (position in `labels`)
    :return: None
    """
    actual = Yva[:, i]
    predicted = Yva_T[:, i]
    wrong_mask = correct[:, i] != 1

    total_class_error = int(np.count_nonzero(wrong_mask))
    print("%d predicted incorrectly (%.2f%% of all samples)" % (
        total_class_error,
        _share(total_class_error, total_sample)
    ))

    # length of a sequence = number of its non-zero entries
    lens = (Xva[wrong_mask] != 0).sum(axis=1)
    # the mean of an empty selection is NaN, which '%d' cannot format
    avg_len = np.mean(lens) if total_class_error else 0
    print("Falsely predicted sequences have an average length of %d" % avg_len)

    PpN_count = _report_group(
        valid[(actual == 1) & (predicted == 0)],
        "\n%d (%.2f%%) positive label were predicted to be negative",
        total_class_error
    )
    NpP_count = _report_group(
        valid[(actual == 0) & (predicted == 1)],
        "\n%d (%.2f%%) negative label were predicted to be positive",
        total_class_error
    )

    overview[labels[i]] = [
        np.mean(actual*1000),
        total_class_error,
        PpN_count,
        NpP_count,
        _share(PpN_count, total_class_error),
        _share(NpP_count, total_class_error),
        avg_len
    ]

    print('\n')
  

### 4.2.1 Toxic

In [11]:
# Index 0 in `labels` is 'toxic'.
analyze_class(0)

1085 predicted incorrectly (3.40% of all samples)
Falsely predicted sequences have an average length of 33

635 (58.53%) positive label were predicted to be negative
Samples:


"SERIOUSLY \n\nAS SOON ASI'M UNBANENED I WLL WREACFK EVERYTHING"

'"\n Your claims of ""weasel wording"" are lies and attempts to poison the well. -  ✉ "'

'YOUR PAGE \n\nI own this page, if you dont comply I will sue you for 1 billion dollars'



'Halt! You appear to have joined Wikipedia to promote your Racist Beliefs. This is not the place for old men with high school educations to vent their rage at minorities, friend.   \n\nWikipedia is not a soapbox to promote your racist beliefs, bigot. You know about as much about History as the Pope knows about intercourse. Please take your tired, uniformed bigotry somewhere else. Why not take a visit to the local chapter of the Ku Klux Klan? Your beliefs will fit in much better there. Thanks.'


450 (41.47%) negative label were predicted to be positive
Samples:


'Hey loser get a life.  Time to reset IP woot.'

'AGAIN YOU SLANDER HIS NAME! \n\nI get strange message that I cannot attack user and must attack content! I am attacking the content, the content of a wrong that was unjustly delivered upon one of my Brotherhood members. LEITMOTIV.'

"You reverted my edition when I added Arouca but when SLBedit added it you didn't! Wtf?"

'Fun \n\nWikipedia can be edited by anyone and your ruining how a random person can make up rubbish on the pages which idiots might just believe. Also how do you know what Frankie Boyle has done, or what Arbroath is like?'

'"\n\n Who is Dreadstar and How is it that they are able to censor me? \n\nSo I log into Wikipedia one evening and find a few things worth commenting on.  Suddenly all my edits are blocked by ""Dreadstar.""  Who is this censor?  Why does he have the right to determine that what my opinions about particular edits on archeological topics are vandalism?\n\nWTF and who the heck is Dreadstar?"'





### 4.2.2 Severe Toxic

In [12]:
# Index 1 in `labels` is 'severe_toxic'.
analyze_class(1)

318 predicted incorrectly (1.00% of all samples)
Falsely predicted sequences have an average length of 31

228 (71.70%) positive label were predicted to be negative
Samples:


'fuck you shut the fuck up'



"Go fuck yourself this ain't any of your business and i fucking know what tor is. Asshole."

'Your a dick head!! \n\nHow dare you go around deleting peoples articles. What kind of retard spends all their time on the computer... you, you nerd!!'

'GET A LIFE shit Nerd!'


90 (28.30%) negative label were predicted to be positive
Samples:


'scum who look up to Che deserve the worst kind of death.==\n\nyou faggot little bitches suck balls.\n\n=='

"YOU FUCKING NIGGER, HOW DARE YOU REVERT MY EDITS???? I'LL HUNT YOU DOWN AND RAPE YOU UNTIL YOU SHIT YOUR INTERNAL ORGANS OUT"

'Fuck you niggerkite. y you deleting so much shit. fuckn loser. dont delete anymore shit. its none of your fucking business'

'fuck u jimmy wales fuck fuck fuck'

') \n\nFuck you. Eat shit and die.'





### 4.2.3 Obscene

In [13]:
# Index 2 in `labels` is 'obscene'.
analyze_class(2)

563 predicted incorrectly (1.76% of all samples)
Falsely predicted sequences have an average length of 37

264 (46.89%) positive label were predicted to be negative
Samples:


'One pint of feces. \n\nON YOUR FAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACE'

'B-5? \n\nExcuse me but what do you mean by contributing in a profeshionall level of bullshit?\nUser 8th sinn'

'How do you know he is dead.  Its just his plane that crashed.  Jeezz, quit busting his nuts, folks.'

'stupid\nYes he did JBKramer called me stupid, here  in an edit summary.'

'I wanted to let you know that Tymon.r. is a stupid moron.'


299 (53.11%) negative label were predicted to be positive
Samples:




"Giant oarfish edits \n\nHi, instead of completely reverting my edit like a lazy prick, perhaps you should just remove my commentary, instead of being a lazy prick. Did I mention that you're a lazy prick? Oh, and you're a lazy prick."

'FAG FAG FAG FAG FAG FAG FAG... GET A REAL JOB PANSY!'



'YOU WHATS THE DEAL U PICE OF COW CRAP \n\nu CAN EAT MY BALLZ'





### 4.2.4 Threat

In [14]:
# Index 3 in `labels` is 'threat'.
analyze_class(3)

93 predicted incorrectly (0.29% of all samples)
Falsely predicted sequences have an average length of 27

81 (87.10%) positive label were predicted to be negative
Samples:


'FUCK YOU \n\nYOU ARE THE BIGGEST PIG ON THE PLANET.\n\nDIE IN HELL ASSHOLE!'

'YOU WILL DIE IN 7 DAYS.'

'Go fuck yourself \n\nEat shit and die. 68.0.119.139'

'How to kill mozart \n\nWOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO STAB HIM, SHOOT HIM, JUMP ON HIM'

'Die \n\nI HATE YOU PRICK YOU DINT DESERVE A PLACE HERE'


12 (12.90%) negative label were predicted to be positive
Samples:


"Just be quiet. \n\nWho the fuck are you to tell me what to do? I will post what I want, where I want, and if you don't like it, you can get cancer and die."

'RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE RAPE'

'"cardiac arrest and died.""\n\n"'

'Hi! \n\nI wanna rape you!'

'If a girl ever sees you naked, she will cry for laughing so hard. \n\nYou should just kill yourself.'





### 4.2.5 Insult

In [15]:
# Index 4 in `labels` is 'insult'.
analyze_class(4)

749 predicted incorrectly (2.35% of all samples)
Falsely predicted sequences have an average length of 31

450 (60.08%) positive label were predicted to be negative
Samples:


'Get out my talk page you smelling wog!'

"You're right. I don't like how Wikipedia works, I don't like any of the experienced editors. I shouldn't be here. I did not insult anyone because I think I'm better. I do not want a Wikipedia page for your previously stated reasons. I use Wikipedia and consider it credible.nif anyone can tell you how big my dick is, it's me. Neither you nor your website give me any clout. I wasn't promoting. What am I going to say, hey go check out my Wikipedia. Fuck you, your editor friends and Wikipedia. And fuck my Wikipedia page I dont want one   \n\nadd that to my wiki \n\nI hope they ban me and delete my account so I don't have to do it"

'Why not admit it? The only reason you disagree with anything is because you are an obsessed troll, whose sole agenda is causing discord.'

'"\n\nHello, you recently yipped and yapped about me breaking the rules, but I notice that your latest update you edited in ""a proud faggot nutsack"" within my userpage.  I find this intolerable, bigoted, and downright hateful.  Perhaps homosexual African-Americans elecit anger in you?  You should visit www.adl.org .  "'

'Lack of New York Post report in the artical  \n\n lack of Mark Lavin / New York Post report in the artical  \n There should be someting in the artical about Mark Lavin exposing Olbermann about his alegedly being a pervert with a small penis audio here Artical Here'


299 (39.92%) negative label were predicted to be positive
Samples:


"Arrgh \n\nOK I'll go back to wikiHow it's fun unlike dumb Wikiepedia it sucks it's boring I hate it goodbye block me fucker I don't give a shit!!"

'Quit your snivelling, you twit. There is nothing you can do stop me from fixing this mess that some refer to as an encyclopedia.'

'I have to log in moron to reply, this is my normal ip jerk and you made countless more mistakes, not honest!'



'DOO DOO DA DA DA DA  GO SCREW YOURSELF'





### 4.2.6 Identity Hate

In [16]:
# Index 5 in `labels` is 'identity_hate'.
analyze_class(5)

237 predicted incorrectly (0.74% of all samples)
Falsely predicted sequences have an average length of 33

225 (94.94%) positive label were predicted to be negative
Samples:


'your mums ugly \n\nyour gay'

'Hi Motherfucker Zhanzhao, are you a slant-eyed chink or a blackie indian or both? \n\nSays the above. Answer my question, Fucker. 218.186.12.250'

"Busy in real life?! LOL, pathetic cunt. Learn to read English, while you have time, fucking piece of chingching trash. You've to see who was right, poor squint. But yeah, you Asian animals don't have self-judgement."

'PENIS VAGINA YOU SUCK FAGGOT ASSHOLES AT WIKIPEDIA'

'RSPW Poster is a sick gay freak'


12 (5.06%) negative label were predicted to be positive
Samples:


'YOU ARE A MINDLESS FAGGOT PIG PLEASE GOD BURN THIS FUCKING MAGGOT TO DEATH IN HELL\nYOU HAVE OFFENDED MY SENSES YOU MINDLESS SELF IMPORTANT FUCKING PEST\nTHE GOD THAT YOU HATE HATES YOU'

'ura fag... \n\nu gay L'

'Hello fuckhead,\nContributions my faggot asshole. You think that because im gay that i dont know shit. you are wrong. you just sit there with nothing to do with your pathetic fucking life that you decide to go and be a complete prick on wikipedia and just delete whatever the fuck u want. Thanks nazi fuck.'

', what ever you do. Do not be gay'







### 4.2.7 Overview

In [17]:
# Cast to int for a compact table; fractional parts (e.g. of the % rows) are dropped.
overview.astype(int)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
label‰ of all,94,10,53,3,49,8
total wrong,1085,318,563,93,749,237
P->N,635,228,264,81,450,225
N->P,450,90,299,12,299,12
P->N %,58,71,46,87,60,94
N->P %,41,28,53,12,39,5
avg len,33,31,37,27,31,33
