# 1. Preparation
We need to first import the required library, download the data, and load the data into the memory.

## 1.1 Import

In [1]:
print('Importing required packages...')

from IPython.display import clear_output
import re
import pandas as pd
import numpy as np
np.random.seed()
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer 
nltk.download('wordnet')
from keras.preprocessing import sequence
from keras.preprocessing import text as ktxt
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import metrics


def hint(message):
    """
    erase previous ipynb output and show new message
    """
    clear_output()
    print(message)

  

Importing required packages...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using TensorFlow backend.


## 1.2 Loading the Data

In [2]:
hint('loading data...')
train = pd.read_csv('data/train.csv')
train, valid = train_test_split(train, test_size=0.2)

labels = [
    'toxic', 
    'severe_toxic', 
    'obscene', 
    'threat', 
    'insult', 
    'identity_hate'
]

Ytr = train[labels].values
Yva = valid[labels].values

hint('Label distribution between training and validation set:')
print(pd.DataFrame({
    'label': labels,
    'train': [np.mean(train[label]) for label in labels],
    'validation' : [np.mean(valid[label]) for label in labels],
}))

Label distribution between training and validation set:
           label     train  validation
0          toxic  0.095750    0.096224
1   severe_toxic  0.009941    0.010215
2        obscene  0.052876    0.053235
3         threat  0.003047    0.002789
4         insult  0.049273    0.049726
5  identity_hate  0.008922    0.008335


# 2. Pre-processing the Input
There are many ways to pre-process the raw strings into valid input for the model. Here we will do it by building a dictionary with all the comments from the training set, mapping the words to their index in the dictionary, and pad/crop the resulting sequences so that they have the same length.

## 2.1 Cleaning Input

In [3]:
tkzr = TweetTokenizer(preserve_case=False)
eng_stopwords = set(stopwords.words('english'))
lmtzr = WordNetLemmatizer()
appos = {
  "aren't" : "are not",
  "can't" : "cannot",
  "couldn't" : "could not",
  "didn't" : "did not",
  "doesn't" : "does not",
  "don't" : "do not",
  "hadn't" : "had not",
  "hasn't" : "has not",
  "haven't" : "have not",
  "he'd" : "he would",
  "he'll" : "he will",
  "he's" : "he is",
  "i'd" : "I would",
  "i'd" : "I had",
  "i'll" : "I will",
  "i'm" : "I am",
  "isn't" : "is not",
  "it's" : "it is",
  "it'll":"it will",
  "i've" : "I have",
  "let's" : "let us",
  "mightn't" : "might not",
  "mustn't" : "must not",
  "shan't" : "shall not",
  "she'd" : "she would",
  "she'll" : "she will",
  "she's" : "she is",
  "shouldn't" : "should not",
  "that's" : "that is",
  "there's" : "there is",
  "they'd" : "they would",
  "they'll" : "they will",
  "they're" : "they are",
  "they've" : "they have",
  "we'd" : "we would",
  "we're" : "we are",
  "weren't" : "were not",
  "we've" : "we have",
  "what'll" : "what will",
  "what're" : "what are",
  "what's" : "what is",
  "what've" : "what have",
  "where's" : "where is",
  "who'd" : "who would",
  "who'll" : "who will",
  "who're" : "who are",
  "who's" : "who is",
  "who've" : "who have",
  "won't" : "will not",
  "wouldn't" : "would not",
  "you'd" : "you would",
  "you'll" : "you will",
  "you're" : "you are",
  "you've" : "you have",
  "'re": " are",
  "wasn't": "was not",
  "we'll":" will",
  "didn't": "did not"
}

def preprocess(comment):
  
    # credit to the author of this post:
    # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

    # remove special format
    comment = re.sub('\n\t', '', comment)

    # remove IP addresses
    comment = re.sub('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' specipaddress ', comment)

    # remove username
    comment = re.sub("\[\[User.*\]", ' specusername ', comment)
    comment = re.sub("\[\[User.*\|", ' specusername ', comment)

    # tokenization 
    tokens = tkzr.tokenize(comment)

    # aphostophe replacement
    tokens = [ appos[token] if token in appos else token for token in tokens]

    # remove stopwords
    tokens = [ token for token in tokens if not token in eng_stopwords ]

    # stemming
    tokens = [ lmtzr.lemmatize(token, 'v') for token in tokens]

    return " ".join(tokens)
  

hint('Cleaning train set...')
Xtr = train['comment_text'].apply(lambda c: preprocess(c))
hint('Cleaning test set...')
Xva = valid['comment_text'].apply(lambda c: preprocess(c))
hint('Done')

Done


## 2.2 Transforming Comments to Sequences

In [4]:
vocab_max = 20000

hint('Fitting the tokenizer...')
tokenizer = ktxt.Tokenizer(num_words=vocab_max)
tokenizer.fit_on_texts(Xtr)

hint('Tokenizing...')
Xtr = tokenizer.texts_to_sequences(Xtr)
Xva = tokenizer.texts_to_sequences(Xva)

hint('padding the sequences...')
max_comment_length = 200  # padded/cropped comment length
Xtr = sequence.pad_sequences(Xtr, maxlen=max_comment_length)
Xva = sequence.pad_sequences(Xva, maxlen=max_comment_length)

hint('Done')

Done


# 3. Training Model

In [5]:

model = Sequential()
model.add(Embedding(vocab_max, 100, input_length=max_comment_length))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(GRU(units=32))
model.add(Dense(16, activation='relu'))
model.add(Dense(len(labels), activation='sigmoid'))
model.compile(
    optimizer='adam', 
    loss='binary_crossentropy', 
    metrics=['accuracy']
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          2000000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 200, 64)           19264     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 64)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                9312      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
Total para

Now training the model.

In [6]:
epochs = 2
batch_size = 64

history = model.fit(
    Xtr, Ytr, 
    epochs=epochs, 
    batch_size=batch_size,
    validation_data=(Xva, Yva)
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2


Making prediction on the validation set.

In [7]:
hint("Making prediction...")
Yva_ = model.predict(Xva)
hint("Done")

Done


# 4. Result Analysis
## 4.1 Global Accuracy

In [8]:
total_sample = Xva.shape[0]
print("validation set sample count: %d\n" % total_sample)
prediction_total = total_sample*Yva.shape[1]
best_t = None
best_accuracy = 0
for t in [i*0.1 for i in range(1, 10)]:
    accuracy = np.sum(Yva == (Yva_ >= t))/prediction_total
    if accuracy > best_accuracy: 
        best_t = t
        best_accuracy = accuracy
    print("accuracy for threshold %.1f: %.2f%%" % (t, accuracy*100))
Yva_T = Yva_ >= best_t
correct = Yva == Yva_T
print("\nbest threshold: %.1f" % best_t)
print("best accuracy: %.2f%%" % (best_accuracy*100))

validation set sample count: 31915

accuracy for threshold 0.1: 96.56%
accuracy for threshold 0.2: 97.60%
accuracy for threshold 0.3: 97.99%
accuracy for threshold 0.4: 98.14%
accuracy for threshold 0.5: 98.19%
accuracy for threshold 0.6: 98.21%
accuracy for threshold 0.7: 98.20%
accuracy for threshold 0.8: 98.12%
accuracy for threshold 0.9: 97.83%

best threshold: 0.6
best accuracy: 98.21%


## 4.2 Accuracy by Classes

In [9]:
overview = pd.DataFrame(index=[
    'label‰ of all',
    'total wrong', 
    'P->N', 
    'N->P', 
    'P->N %', 
    'N->P %'
])

def analyze_class(i):
    wrong_toxic = valid[correct[:, i] != 1]
    total_toxic_class_error = len(wrong_toxic)
    print("%d predicted incorrectly (%.2f%% of all samples)" % (
        total_toxic_class_error, 
        100*total_toxic_class_error/total_sample
    ))

    PpN = valid[(Yva[:, i] == 1) & (Yva_T[:, i] == 0)]
    PpN_count = len(PpN)
    print("\n%d (%.2f%%) positive label were predicted to be negative" % (
        PpN_count, 
        100*PpN_count/total_toxic_class_error 
    ))
    if PpN_count > 4:
        print("Samples:")
    for sample in PpN.sample(5)['comment_text']:
        display(sample)
  
    NpP = valid[(Yva[:, i] == 0) & (Yva_T[:, i] == 1)]
    NpP_count = len(NpP)
    print("\n%d (%.2f%%) negative label were predicted to be positive" % (
        NpP_count, 
        100*NpP_count/total_toxic_class_error 
    ))
    if NpP_count > 4:
        print("Samples:")
    for sample in NpP.sample(5)['comment_text']:
        display(sample)
  
    overview[labels[i]] = [
        np.mean(Yva[:, i]*1000),
        total_toxic_class_error, 
        PpN_count,  
        NpP_count,
        100*PpN_count/total_toxic_class_error,
        100*NpP_count/total_toxic_class_error
    ]
  
    print('\n')
  

### 4.2.1 Toxic

In [10]:
analyze_class(0)

1209 predicted incorrectly (3.79% of all samples)

851 (70.39%) positive label were predicted to be negative
Samples:


". If you care so much about alternative revisionist views, why don't you go to Holocaust article and attempt to spread Nazi propaganda there? You think that Jews care about alternative revisionist denials? You think that Tutsis in Rwanda care about alternative revisionist denials? You think that Bosniaks care about alternative revisionist denials? We all care about the facts, cold hard neutral facts, not alternative leftist-revisionist views. Maybe wikipedia is not for you? You are not succeeding here, because you can't force people to accept your revisionist view of Srebrenica massacre"

'AbcMonster\ni give you everything you ever need cuz im abc monster i do what ever i want if u wanna no y cuz im abcmonster i trooled wikipedia you can stop me cuz im abcmonster MONSTER!'

'"\n\n ................ \n\naww looks like someone forgot 2 say please\n\nand just who the hell are you\n\nprobably just some ""REGULAR USAR"" doing a ""good"" deed\n\n "'

"I'M NOT JOE, JUST SOME ONE, ONE OF MANY, WITH HISTOY WITH MR ELEEMOSYNARY... YOU CLOWNS ASSUME SOCK IN EVER PACKET OF IP TRAFFIC NOW A DAYS... WHAT A BUNCH OF NUTCASES.."

'"\nYour donation will fund Wikia, Inc., which is not a charity. Your non-profit donation will ultimately line the for-profit pockets of Jimmy Wales, Amazon, Google, the Bessemer Partners, and other corporate beneficiaries. How? Wikipedia is a commercial traffic engine. As of December 2008, there are over 14,700 external links from Wikipedia to Wales\' Wikia.com sites, which are funded by Google AdSense revenues. Did you know that Amazon invested $10,000,000 in the for-profit Wikia venture? It\'s therefore rather interesting that there are over 43,000 links to Amazon\'s retail site from the supposedly non-profit Wikipedia site. Isn\'t it? Meanwhile, did you know that the popular movie site IMDB.com is owned by Amazon, and you can buy Amazon products directly from IMDB pages? Well, surprise surprise  there are nearly 174,000 links to Amazon\'s IMDB site from Wikipedia. No wonder Amazon particularly wished to invest in Wikia, Inc. Its co-founder makes sure that the external linking envir


358 (29.61%) negative label were predicted to be positive
Samples:


'truth you dont dare to'

'a football player because of his quickness as a child. Unfortunatly, he died of throat cancer. His mother could not find a job in the urban area, so he moved'

'A child name Ohnoitsjamie - a childish user name if ever there was has blocked my informative additions yet quite happiy allow Corbetts fellatio post.'

"lol no I meant its a shame its not in common. I have a lot of gay friends so you don't need to worry that I am against gays:)\n\nIn terms of the AFC article I can promise I wont repeat myself but if I have an advancing argument or need to respond to someone who is clearly wrong then I will do so but in a polite manner only. How about I ask you before I post something to it and if you agree I will? The problem with this though is that I am up at stupid hours and so you won't be up.\n\nOk I'll look for a mentor - can you be a mentor since I don't know anyone else on Wikipedia lol?\n\nHere is my apology to the users: I apologize for seeming rude earlier. Sometimes frustration gets the better of me and I shouldn't of made those ad hominem remarks. They certainly won't happen again. Have a great day!\n\nThanksCraddock1"

'ho ho ho yo \n\nSo why dont you actually contribute something to wikipedia yourself instead of checking, controlling and constantly reading what others are doing?? You dont have the intellectual capacity or the depth, huh? Or do you simply lack a life so much that virtual vandal patrolling is the most interesting thing that you do during the day??!! In either case, do know that I will be praying for you and your poor soul...'





### 4.2.2 Severe Toxic

In [11]:
analyze_class(1)

327 predicted incorrectly (1.02% of all samples)

317 (96.94%) positive label were predicted to be negative
Samples:


'Hi \n\nFuck you, Faggot.'

"Go fuk your mom you fukin faggot.  I'll commit vandalizim on your page all fukin day long, and you'll like it you fuking quer.\n\nOh... and trace my IP.  It'll come back to a wireless network shared by a lot of peple.  Good luck wid dat you quer fuking homo.  You got damn dog rapist."

"LGBT \n\nyou little fuck , are you a fag , that piece of shit page is making fucked up claims about Islam , and most of it is not certified , how do you allow this on the site . claims such as that mehmet the conqueror was a fag , i swear if i know who wrote this i'll kill him/her."

'and sticking your man cock in my ass'

'Fuck Off \n\nFuck Off Slanteye!'


10 (3.06%) negative label were predicted to be positive
Samples:


'congratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongwarriorcongratualtions on being such a huge fucking asshole!!! Bongw

'NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGER NIGGE

'Notrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \n\nAnthony Bradbury sucks cock and eats shit, Notrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks cocks \nNotrhbysouthbanof sucks co

'supertall fag\nfag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag fag'

"CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU MOTHER FUCKER DON'T FUCKING BAN ME PENIS! CURPS YOU M





### 4.2.3 Obscene

In [12]:
analyze_class(2)

623 predicted incorrectly (1.95% of all samples)

383 (61.48%) positive label were predicted to be negative
Samples:


'" \n\nApostle12, much agreement about the pointless of ""Fuck the Police"" as a rallying cry.  \nIn posting here, just remember we here are not Occupy Oakland, we\'re not affiliated with them, we\'re not necessarily supporters of them.  We\'re Wikipedia an independent media agency.   We\'re trying to document, we\'re not trying to support. \nCriticisms for the OccupyOakland should be sent to them by email or social media.    We\'re not them, and I don\'t think they check this page. )    "'

"Right on man I appreciate it. I don't really care if I am blocked. I got 6 neighbors with wifi. That guy is really being a dick though!! Humility man!! That's whats important!! I'll see you around! Cheers!! -"

"It's abrevashun! It makes stuff shorter, U f***er! I'll Nswer dat queshton, now f*** off & let me do it!  5:00, 6 August 2007 (UTC)"

'Too Cool \n\nPlease please please... get a life...should you not continue being a boring bastardtits lawyer and stop spoiling my giggles ps do you live with your mum ??? pps i speak french too, voulais vous F**K YOU! ppps hugs babes'

'commiemoron seems to make alot of mistakes gosh darn it. mayeb hes retarded.'


240 (38.52%) negative label were predicted to be positive
Samples:


'Kind Sir , have you ever tried to suck your own wiener? \n\nI mean you should mate and also try to catch a blowfish and f*ck its brains out\n you curry munching f*ck.And jimmy Wales called....he didnt flush today so u could go eat his shit!'

'"\n\nHow is it the wrong way round? If I had fucked up the pages when I edited stuff or had deleted information by accident then MAYBE I would have read the guidelines. But my edits were perfectly harmless, they must of taken you about 3 seconds to delete. And you didn\'t actually point out what I was doing wrong, writing ""disruptive"" all the time is a great help..."'

"In the time it took me to add this, Mifter and E again defaced the page; someone keep an eye on it for me, I have a lecture in 6 minutes and I'm running late. You may have to hop on your phone to avoid the 3RR as all our edits will show up as one user and it may give them fodder for fucking with us more, they seem the gung-ho cyber bully type that scared me off of wiki years back. ( (K)"

'you have a small penis'

'"\nThe Graceful Slick....\nIs non other than an ungraceful dick!81.131.78.39  "'





### 4.2.4 Threat

In [13]:
analyze_class(3)

89 predicted incorrectly (0.28% of all samples)

89 (100.00%) positive label were predicted to be negative
Samples:


'Wikistalking \n\nUnless you wish to be targeted yourself in a similar manner to Deskana I suggest to cease stalking me and reverting my edits at once'

"=why are you being so rude? i only want to help you motherfuckers. fuck! retards! you're such retards! i hope you all die! seriously, die!!!"

'Hello \n\nHey asshole! I kill you. Filthy Muslim. 114.160.71.148'

'If you block me for telling you what you are, there will bw a holy war, I promise you that.'

'Nazis like you deserve worse than death. \n\nI will make sure you suffer horribly.'


0 (0.00%) negative label were predicted to be positive




### 4.2.5 Insult

In [14]:
analyze_class(4)

906 predicted incorrectly (2.84% of all samples)

562 (62.03%) positive label were predicted to be negative
Samples:


"Dude... At the end you fucked my article Sector 06 Rourkela \n\nDude,\nwhat is your problem..Are you nut's!!!\nThere are many articles in wiki.. that doesn't match wiki criteria...And you find it mine only..\n\nIf I create and edit my own article...about my own place then what the fuck is your problem...\nYou show... fucking barn star's... in wiki.. I don't know.. how it seems..\nWithout looking to the talk page of Sector 06 Rourkela... you made the article for speedy deletion.\n\nI guess. you don't have any other work. You could have pasted a notification prior to deletion.\nIt seems you will not let me work in my article...\n\nMr. WikiDan61.. I don't know who are you..And for how many heck year's you are their in wiki...\n Please don't touch my article's...  I'm trying my best to work on it.. With every proof given.Thanks."

"I don't care..pity u inferior indonesia..country without vision..even u speak english, it can't change the fact that u r indonesian..loser always loser..  Shah"

'Dont be gay - Hitler was a form of hero!!'

'You sound like you are sexually frustrated. When did you last have sex?'

'NE2 enjoys sexual congress with goats'


344 (37.97%) negative label were predicted to be positive
Samples:


'Sock puppet my ass! \n\nHow dare you call me a sock puppet? i am my own creation. i am the one you will call XALWINE! and you will hear tales of my mercy. and tales of my wrath. and i have chozen wrath for you. my will be done. do what tho wilt. you will rue the day that you rejected that article.'

"GET FUCKED UP. GET FUCKEEED UP.  GOT A DRINK THAT YOU CANT PUT DOWN???/ GET FUCK UP GET FUCKED UP.  I'M FUCKED UP RIGHT NOW!"

"FUCK YOU I WON'T DO WHAT YOU TELL ME!-ANON"

'arsehole \n\nI was right, you fuckwit'

'Eat a dick \n\nEat a dick, wikinazi.\n\nBig dickBig dickBig dickBig dickBig dickBig dickBig dickBig dickBig dickBig dickBig dickBig dickBig dick'





### 4.2.6 Identity Hate

In [15]:
analyze_class(5)

267 predicted incorrectly (0.84% of all samples)

265 (99.25%) positive label were predicted to be negative
Samples:


'"\n\n Hey, polish slavic monkeys \n\nYou slavic monkeys can wright here all the shit what you want. But all world knews what realy you are - the trashy mutantes from Lower Dniepr. Your ""poland"" have no rights to exist - it is only band of slavic trampes.\n\nVilnietis"'

"Stupid Asian Bitch  \n\nYou're an idiot and you should go back to ching chong land"

'Pedophiles, sex criminals, Imperial Japan, Nazi Germany and other such degenerate ilk also are minority views and for good reason.  And by the way, showcasing your stalker skills and googling my name is quite retarded and anti-social behavior.  Please stop and seek therapy cause you need it.'

"... AmeriKKKans are fucking gay ... U.S.A. sucks balls\n\n'' ... The whole world hate U.S.A. incest population"

'Listen you piece of shit , pencil necked twerp that you are. Nowhere did I make any further attacks to anyone. How about you get a fuckin life rather than getting a little stiffy from banning people. I couldnt give a slight fuck about your lame arse wiki. Its bullshit anyways. You people just like reading your own crap. Brisbane is a fuckin small place and google is a great tool. \n\nYet another christian piece of shit. I shit on your god you little fuck.'


2 (0.75%) negative label were predicted to be positive




### 4.2.7 Overview

In [16]:
overview.astype(int)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
label‰ of all,96,10,53,2,49,8
total wrong,1209,327,623,89,906,267
P->N,851,317,383,89,562,265
N->P,358,10,240,0,344,2
P->N %,70,96,61,100,62,99
N->P %,29,3,38,0,37,0


# 5. Observation and Conclusion


*   The classes that have more positive labels performed better. Giving these rare labels more weight may solve this.
*   False negative is more common than false positive, although the severity of this problem varies among the classes.
*   Quotes could be a potential cause of error (need further test).
*   The removal stopwords (e.g. Your) could be a reason why the model failed to interpret the meaning of some comments (need further test).
*   It seems that the model cannot differentiate those comments that are filled with toxic content and those that have a small portion of it. Utilizing TFIDF may solve this.


