In this notebook, we will test a pooled RCNN model.

# 1. Preparation
We need to first import the required library, download the data, and load the data into the memory.

## 1.1 Import

In [11]:
print('Importing required packages...')

from IPython.display import clear_output
import re
import csv
import pandas as pd
import numpy as np
np.random.seed()
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer 
nltk.download('wordnet')
from keras.preprocessing import sequence
from keras.preprocessing import text as ktxt
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, GRU, SpatialDropout1D, Bidirectional
from keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import class_weight


def hint(message):
    """
    erase previous ipynb output and show new message
    """
    clear_output()
    print(message)

  

Importing required packages...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1.2 Loading the Data

In [2]:
hint('loading data...')
train = pd.read_csv('data/train.csv')
train, valid = train_test_split(train, test_size=0.2)

labels = [
    'toxic', 
    'severe_toxic', 
    'obscene', 
    'threat', 
    'insult', 
    'identity_hate'
]

Ytr = train[labels].values
Yva = valid[labels].values

hint('Label distribution between training and validation set:')
print(pd.DataFrame({
    'label': labels,
    'train': [np.mean(train[label]) for label in labels],
    'validation' : [np.mean(valid[label]) for label in labels],
}))

Label distribution between training and validation set:
           label     train  validation
0          toxic  0.095687    0.096475
1   severe_toxic  0.009745    0.010998
2        obscene  0.053002    0.052734
3         threat  0.003071    0.002695
4         insult  0.049367    0.049350
5  identity_hate  0.008828    0.008711


# 2. Pre-processing the Input
There are many ways to pre-process the raw strings into valid input for the model. Here we will do it by building a dictionary with all the comments from the training set, mapping the words to their index in the dictionary, and pad/crop the resulting sequences so that they have the same length.

## 2.1 Cleaning Input

In [3]:
tkzr = TweetTokenizer(preserve_case=False)
eng_stopwords = (
    'what', 'which', 'who', 'whom', 
    'this', 'that', 'these', 'those', 
    'am', 'is', 'are', 'was', 'were', 
    'be', 'been', 'being', 
    'have', 'has', 'had', 'having', 
    'do', 'does', 'did', 'doing', 
    'a', 'an', 'the', 
    'and', 'but', 'if', 'or', 
    'because', 'as', 'until', 'while', 
    'of', 'at', 'by', 'for', 'with', 
    'about', 'against', 'between', 
    'into', 'through', 'during', 'before', 'after', 
    'above', 'below', 'to', 'from', 
    'up', 'down', 'in', 'out', 'on', 'off', 
    'over', 'under', 'again', 'further', 
    'then', 'once', 'here', 
    'there', 'when', 'where', 'why', 
    'how', 'all', 'any', 'both', 'each', 
    'few', 'more', 'most', 'other', 'some', 
    'such', 'no', 'nor', 'not', 'only', 
    'own', 'same', 'so', 'than', 'too', 'very', 
    'can', 'will', 'just', 'don', 'should', 'now'
)
lmtzr = WordNetLemmatizer()
appos = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll":" will",
    "didn't": "did not"
}

def preprocess(comment):
  
    # credit to the author of this post:
    # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

    # remove special format
    comment = re.sub('\n\t', '', comment)

    # remove IP addresses
    comment = re.sub('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' specipaddress ', comment)

    # remove username
    comment = re.sub("\[\[User.*\]", ' specusername ', comment)
    comment = re.sub("\[\[User.*\|", ' specusername ', comment)

    # tokenization 
    tokens = tkzr.tokenize(comment)

    # aphostophe replacement
    tokens = [ appos[token] if token in appos else token for token in tokens]

    # remove stopwords
    tokens = [ token for token in tokens if not token in eng_stopwords ]

    # stemming
    tokens = [ lmtzr.lemmatize(token, 'v') for token in tokens]

    return " ".join(tokens)
  

hint('Cleaning train set...')
Xtr = train['comment_text'].apply(lambda c: preprocess(c))
hint('Cleaning test set...')
Xva = valid['comment_text'].apply(lambda c: preprocess(c))
hint('Done')

Done


## 2.2 Transforming Comments to Sequences

In [4]:
vocab_max = 100000

hint('Fitting the tokenizer...')
tokenizer = ktxt.Tokenizer(num_words=vocab_max)
tokenizer.fit_on_texts(Xtr)

hint('Tokenizing...')
Xtr = tokenizer.texts_to_sequences(Xtr)
Xva = tokenizer.texts_to_sequences(Xva)

hint('padding the sequences...')
max_comment_length = 200  # padded/cropped comment length
Xtr = sequence.pad_sequences(Xtr, maxlen=max_comment_length)
Xva = sequence.pad_sequences(Xva, maxlen=max_comment_length)

hint('Done')

Done


# 3. Training Model

In [5]:
hint("Loading pre-embedding file...")
emb = pd.read_table(
    'preembedding/glove.6B.300d.txt', 
    sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE
)

hint("Preparing embedding matrix...")
embedding_dimension = 300
embedding_matrix = np.random.normal(
    emb.mean(axis=0), 
    emb.std(axis=0), 
    (vocab_max, embedding_dimension)
)
hint("Constructing embedding matrix")
for word, i in tokenizer.word_index.items():
    if i < vocab_max and word in emb.index:
        embedding_matrix[i] = emb.loc[word].as_matrix()

hint("Done")
# optional: free memory:
emb = None

Done


In [16]:
model = None
# model = Sequential()
# model.add(Embedding(
#     vocab_max, 
#     embedding_dimension, 
#     weights=[embedding_matrix],
#     input_length=max_comment_length
# ))
# model.add(SpatialDropout1D(0.5))
# model.add(Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# model.add(SpatialDropout1D(0.2))
# model.add(Bidirectional(GRU(units=64, return_sequences=True)))
# model.add(GlobalMaxPooling1D())
# #model.add(Dense(64, activation='relu'))
# model.add(Dense(len(labels), activation='sigmoid'))
# model.compile(
#     optimizer='adam', 
#     loss='binary_crossentropy', 
#     metrics=['accuracy']
# )
# model.summary()

x = None
sequence_input = Input(shape=(max_comment_length, ))
x = Embedding(vocab_max, embedding_dimension, weights=[embedding_matrix])(sequence_input)
x = SpatialDropout1D(0.5)(x)
x = Conv1D(filters=256, kernel_size=3, padding='same', activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Bidirectional(GRU(64, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

Now training the model.

In [17]:
epochs = 4
batch_size = 64

def get_class_weight(x):
    k = 100
    return 3.32*np.log(k/x + 1)
    

history = model.fit(
    Xtr, Ytr, 
    epochs=epochs, 
    batch_size=batch_size,
    validation_data=(Xva, Yva),
    class_weight={
        0: get_class_weight(98),
        1: get_class_weight(10),
        2: get_class_weight(53),
        3: get_class_weight(2),
        4: get_class_weight(49),
        5: get_class_weight(8),
    }
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Making prediction on the validation set.

In [18]:
hint("Making prediction...")
Yva_ = model.predict(Xva)
hint("Done")

Done


# 4. Result Analysis
## 4.1 Global Accuracy

In [19]:
total_sample = Xva.shape[0]
print("validation set sample count: %d\n" % total_sample)
prediction_total = total_sample*Yva.shape[1]
best_t = None
best_accuracy = 0
for t in [i*0.1 for i in range(1, 10)]:
    accuracy = np.sum(Yva == (Yva_ >= t))/prediction_total
    if accuracy > best_accuracy: 
        best_t = t
        best_accuracy = accuracy
    print("accuracy for threshold %.1f: %.2f%%" % (t, accuracy*100))
Yva_T = Yva_ >= best_t
correct = Yva == Yva_T
print("\nbest threshold: %.1f" % best_t)
print("best accuracy: %.2f%%" % (best_accuracy*100))

validation set sample count: 31915

accuracy for threshold 0.1: 97.39%
accuracy for threshold 0.2: 97.99%
accuracy for threshold 0.3: 98.22%
accuracy for threshold 0.4: 98.32%
accuracy for threshold 0.5: 98.37%
accuracy for threshold 0.6: 98.34%
accuracy for threshold 0.7: 98.27%
accuracy for threshold 0.8: 98.12%
accuracy for threshold 0.9: 97.85%

best threshold: 0.5
best accuracy: 98.37%


## 4.2 Accuracy by Classes

In [20]:
overview = pd.DataFrame(index=[
    'label‰ of all',
    'total wrong', 
    'P->N', 
    'N->P', 
    'P->N %', 
    'N->P %',
    'avg len',
])

def analyze_class(i):
    wrong = valid[correct[:, i] != 1]
    total_class_error = len(wrong)
    print("%d predicted incorrectly (%.2f%% of all samples)" % (
        total_class_error, 
        100*total_class_error/total_sample
    ))
        
    wrong_seqs = Xva[correct[:, i] != 1]
    lens = [ len(seq[seq != 0]) for seq in wrong_seqs]
    avg_len = np.mean(lens)
    print("Falsely predicted sequences have an average length of %d" % avg_len)

    PpN = valid[(Yva[:, i] == 1) & (Yva_T[:, i] == 0)]
    PpN_count = len(PpN)
    print("\n%d (%.2f%%) positive label were predicted to be negative" % (
        PpN_count, 
        100*PpN_count/total_class_error 
    ))
    if PpN_count > 4:
        print("Samples:")
        for sample in PpN.sample(5)['comment_text']:
            display(sample)
  
    NpP = valid[(Yva[:, i] == 0) & (Yva_T[:, i] == 1)]
    NpP_count = len(NpP)
    print("\n%d (%.2f%%) negative label were predicted to be positive" % (
        NpP_count, 
        100*NpP_count/total_class_error 
    ))
    if NpP_count > 4:
        print("Samples:")
        for sample in NpP.sample(5)['comment_text']:
            display(sample)
  
    overview[labels[i]] = [
        np.mean(Yva[:, i]*1000),
        total_class_error, 
        PpN_count,  
        NpP_count,
        100*PpN_count/total_class_error,
        100*NpP_count/total_class_error,
        avg_len
    ]
  
    print('\n')
  

### 4.2.1 Toxic

In [21]:
analyze_class(0)

1148 predicted incorrectly (3.60% of all samples)
Falsely predicted sequences have an average length of 35

749 (65.24%) positive label were predicted to be negative
Samples:


"Don't insult my intelligence, Dougweller. I tell you it's easier to dispute whether they were canvassed so now you start doing so and expect me to play along? No. You know they were canvassed, I know, Mzilikazi does and, unless they're stupid, so do Johnbod and Haploidavey. If I had known you were going to debase yourself with petty, juvenile tactics like this, I never would have put the idea into your head. (  )"

'btw why the heck did you post so much info on your user page? old men like michael jackson can come to your apt and rape you xD'

"I'M GETTING ANGRY AT YOU!!!!!!!!!\nYOU ARE GETTING CLOSE TO BE BLOCKED FROM EDITING!!!!"

'Excuse me ..who in the hell is that? ..and whatever man go edit and bother someone else.\n  8:50, 27 April 2008 (UTC)'

'"\nYou are by far the most unhelpful, ungracious administrator I have ever had to deal with. You\'re incompetence is displayed in every encounter we have. Oh, and I\'m quite familar with WP:NPA, which you resort to citing whenever you don\'t get your way. For other administrators who wish to be helpful, my last username was the Arabic version of Warraq. Warraq means ""scribe.""  \n"'


399 (34.76%) negative label were predicted to be positive
Samples:


'"LOL, that ho is FAT!  — Preceding unsigned comment added by 50.113.177.10   \n\n"'

"Unblocked for the last time\nOk, now don't fuck up. I mean that in the best possible way.  Good luck and good night."

'"\n\n re: UEFA Euro 2008 and talk:UEFA Euro 2008 \n\nHas it occurred to you that you\'re a massive dick?    .Alpaugh "'

'"\n\n and Then i was Walking down the street in India and It Was Gandhi \n\nand He Was All \'i don\'t eat the Smoked Meats but Toddt he\'s So Gay"""'

'"\n\nThe inclusion below....\n\n Controversial Remarks Alleged Calling Facebook Users ""Dumb F**ks"" \n\nQuote: To avoid being blocked, instead of reverting please consider using the article\'s talk page to work toward making a version that represents consensus among editors. \n\n!!!!THAT IS WHAT I AM DOING!!!!\n\nREFRAIN FROM DELETING THE TALK PAGE CONTRIBUTION WITHOUT PROVIDING A VALID REASON OR BETTER TIPS FOR IMPROVEMENT. AFTER ALL THIS IS ONLY THE TALK PAGE\n\nsources for the below mentioned contribution:\n\n http://www.forbes.com/sites/andygreenberg/2012/02/01/is-zuckerbergs-the-hacker-way-letter-facebooks-dont-be-evil-and-will-it-live-up-to-it/\n http://gawker.com/5636765/facebook-ceo-admits-to-calling-users-dumb-fucks\n http://au.businessinsider.com/well-these-new-zuckerberg-ims-wont-help-facebooks-privacy-problems-2010-5\n http://www.theregister.co.uk/2010/05/14/facebook_trust_dumb/\n http://www.theweek.co.uk/technology/14625/are-users-%E2%80%98dumb-fucks%E2%80%99-trusting-da





### 4.2.2 Severe Toxic

In [22]:
analyze_class(1)

341 predicted incorrectly (1.07% of all samples)
Falsely predicted sequences have an average length of 27

273 (80.06%) positive label were predicted to be negative
Samples:


'{{|unblock|Lick my hairy nuts you negroes and jews.}}'

'Toys/Pram eh ? Being persecuted by the big bad English again, awww diddums. Grow up you big fucking baby.'

'FUCK YOUR MOTHER \n\nFUCK OFF'

'Hi \n\nO FUCK YOU U MATHRFUKER BITCH IDIOOOOT NO BAN ME FROM EDIT I TELL TRUTH'

'DEATH TO ARABS! FREE THE JEWISH HOMELAND OF JUDAH AND SHOMRON FRÓM ILLEGAL ÁRAB OCCUPATION! END ARAB COLONIALISM AND ARAB IMPERIALISM! \n\nDEATH TO ARABS! FREE THE JEWISH HOMELAND OF JUDAH AND SHOMRON FRÓM ILLEGAL ÁRAB OCCUPATION! END ARAB COLONIALISM AND ARAB IMPERIALISM!'


68 (19.94%) negative label were predicted to be positive
Samples:


'Go fuck yourself, you piece of shit.'

'Diana is a fucking bitch nigger piece of shit'

'you studid cock sucker u stop callin me ok its a frank Garrett eatin the worms crawl outa your moms pussy you fucking retardo hit I retardo'

"Homosexuality?\nYou french-fried faggot, I am going to cut your balls off and serve them in a platter of mushrooms before shoving your dick up your ass and mutilating your boyfriend's balls in front of you, then I will kill you both!"

"Wikipedia's downfall is that it is free.\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS\nSO FUCK YOU FOR SUPRESSING THAT YOU FACISTS"





### 4.2.3 Obscene

In [23]:
analyze_class(2)

556 predicted incorrectly (1.74% of all samples)
Falsely predicted sequences have an average length of 37

304 (54.68%) positive label were predicted to be negative
Samples:


'go to  die \n\nson of  bich'

"All of my edits are good.  Cunts like you who revert good edits because you're too stupid to understand how to write well , and then revert other edits just because you've decided to bear a playground grudge, are the problem.  Maybe one day you'll realise the damage you did to a noble project.  201.215.187.159"

'"\n\n Censorship \n\nWas Final Fantasy VII censored at all? I mean, the content wasn\'t as strong as, say, Final Fantasy VIII\'s, but I would imagine something was censored. Plus, it doesn\'t say anything about censorship on this page. Could someone please answer to this? \nNot that I recall. It certainly wasn\'t censored as much as the older games were.  \nAs I recall, some language was censored, some wasn\'t.  \nThe word ""bastard"" appeared on various occasions, but ""fuck"" and ""shit"" were censored.  \n< recet indent\nSome words were censored, some were not, which ones seem arbitrary.\nOnly instance I could find of ""Fuck"" uncensored:\nComputer: Cid! We have an emergency situation! A mechanic is still in the engine section of the rocket!\nCid: What!? Who is the little fuck!?\nComputer: I don\'t know. Activating the intercom in the engine section.\nCid: Hey goddammit!! Who the #*$$#&\'s still in there?\nEarliest instance of ""Shit""\nShinra Manager: I\'m not give in to violence…

'"=Reliable sources===\nCheating:\n""Barry Bonds:Cheater"" from CBS, yea I kinda think that is reliable. \n""Dear Barry Bonds, You are either an outright cheater or very stupid"" from the USA Today \n""Yes, Barry Bonds is a cheater. He is a cheater of the worst sort"" \nLying:\n""It\'s clear, Barry Bonds\' a liar"" New York Daily News, another pretty freakin\' reliable source. \n""Barry Lamar Bonds is a bad man"" Baseball Digest \n""but Bonds is a liar, a cheater, a whiner and a bad influence on America\'s youth""  Mark Barnes\n\n=="'

"Don't piss mme off!\n\n'I MEEN IT YOU DLEATED BOLTH OF MY ARTICLES THE AMERICAN DAD VS FAMILY GUY AND THE BUBBLE STRUGGLE 2 ARTICLE I'M GOING TO SEW YOU FOR $10000000000000000000000000000000000000000000 I WISH I COULD YOU SON OF A ***** ******* *******!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ( )"


252 (45.32%) negative label were predicted to be positive
Samples:


"Let the vandals do what they want! \n\nYou people suck! It's true about Wellington college being established in ∞!! common what is the point of having an open encyclopedia if you can't edit it???"

'DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG DIE FAG

'Just you try you fag.'

'"\n\nWow..\nYou really are a wikipedia nerd...\nGo ahead, ban me, it wouldn\'t do anything good.\nSeriously, you do not fucking mess with.\nYou don\'t fuck with me...\nNow go ahead, ban me you little wikipedia nerd, it just proves you have no life...\n-⊕ talk"'

'Queers\n\nGet back in the closet you miserable fags. When was the last time you met a Queer who A. you could trust and B. who was really happy? remeber Aids cures queers....'





### 4.2.4 Threat

In [24]:
analyze_class(3)

77 predicted incorrectly (0.24% of all samples)
Falsely predicted sequences have an average length of 31

65 (84.42%) positive label were predicted to be negative
Samples:


"quick thing mate \n\nGive me a couple of reasons NOT to kill you plz. btw this is a shared IP adress on a school computer so don't even bother responding,"

'hi were bp [PARODIE] \n\nhello were gonna kill the gulf and yo and spill oil on your body and light ya up and shoot ya dead bodie cause were evil'

'you are a prat, and my revenge will be sweet! hahahahahaahahahahahahahahahahahahahahaha.'

'- Do it and I will cut you'

"and your little faggot boy Propol, pray, pray little wonk that I don't find your little ass and kill you"


12 (15.58%) negative label were predicted to be positive
Samples:


'I find you people absolutely despicable and disgusting that you allow blatant anti-Semites to become administrators. May Wikipedia burn in hell.'

'help me i feel like im going to kill myself and hurt some animals i care for'

'I think we both now that it is the other way around. But for me this is over. And I hope you can feel the same way to even if it wasnt moved to Death of.. as you really wanted. But to killing which still indicates foul play close to murder. Bye )'

'YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY SHITHOLE YOU CAN EAT MY 

'Just To Let You Know. You have no life.\nI hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I





### 4.2.5 Insult

In [25]:
analyze_class(4)

784 predicted incorrectly (2.46% of all samples)
Falsely predicted sequences have an average length of 32

507 (64.67%) positive label were predicted to be negative
Samples:


'Whos the douche that deleated all of the info and crap I really want to curse them out right now.'

'Yardie \n\nFuckidiot\nWhat you no about YARDIE?\n\nHuh\n\nI live this life in crew\n\nPo lice come around dey prang the gun and Yardie go down.  I see it I cover kids eyes\n\nYou tell me you know more about Yardie than me?\n\nYou neder even bein nottingahm huh?\n\nYou know rio ferdinand you read papers??  You get rid if tings that is fact true fact\nbecause you pig ignorant white boy from not even nottingham or london or brighton where YARDIE life is way of life\n\nYou mess with my YARDIE then I tract you down i point my gun and prang you fall\n\ngfuck\n\ngood day to you'

'Che\n\nStop being such a jerk, read the cited material below.  Some of the material states that che had no combat victories.  Stop deleting the cited material...especially if you have not provided evidence to the contrary.  I have a cited source, it should stay. If anything the burdon of proof is on you.  Leave it alone.  (Gibby )'

'Were you also the person who reverted a link to an existing image?\n\nI think that you should have tagged this page, not reverted my edits.\n\nI further think that you are simply a pedantic jerk. Sorry. But your actions made wikipedia worse, and I am pretty sure that you violated wikipedia policy.'

'Yeah, whatever. You are an arrogant deluded buffoon with a defective brain. Seek medical help.\n\nNewsflash: Wikipedia is a disaster.'


277 (35.33%) negative label were predicted to be positive
Samples:


'no logic \n\nWhy is it that on tv you can see sombody blown up and see the news were they show you a hostage situation and you see sombodys head being shot off but saying fuck, shit, asshole, cock,pussy,god damn, ect. is concedered by the fcc wrong and could have negative effects. Words dont hurt. a four year old watching tv and seeing sombody get the shit blown the fuck out of them will have more of a negative. FUCK the FCC.'

'No seriously. HE IS BANNED!!! Now by saying that you are feeding the troll. Idiot.'

'BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BALLS BAL

'D Macks You are a pathetic american who will distrusts the rest of the world.  Just give it up mate you are a sad individual'

'Hello asshole. Your mother might be a cross wiki vandal or even a cross country whore but I am not. You blocked me on a groundless basis only because your  asked you too. I have not done anything wrong in English wikipedia. But you will pay the price for your stupid action.'





### 4.2.6 Identity Hate

In [26]:
analyze_class(5)

222 predicted incorrectly (0.70% of all samples)
Falsely predicted sequences have an average length of 28

149 (67.12%) positive label were predicted to be negative
Samples:


"Hey asshole, I'm Creamy3.  Why don't you go suck on your mommy's dried up, saggy, crusty old titties.  Go ahead and lick her sweaty, stinky old cunt while you're at it.  Fuck you cocksucker."

'A proposal to cut down on AIDS \n\nLess AIDS would be spread if you were to stop inserting your minuscule penis into little boys.'

"Roman Catholics\n\ni'm not accusing you but i think you're a fenian. You are ugly and probably the seventh born in a family. You have probably been molested at some point by a priest. You disgust me. SIEG HEIL!"

'u think i get impressed by some f@ck!ng swedish @$$hole who has been on Wiki for 2 years? Hell no. Official press releases by Media outlets have been accepted for years now - and so will this one be.'

'Sallary controversy  \n\nArzel, watch your back on the street\n\ndelete the truth , ill cut you on the street\n\nThe president of the Peabody Essex Museum Dan Monroe makes $578,254.00 a year with full benefits while all the guards are part time with no benefits.\n\nthis edit was undone by you, you are a piece of shit........  what are you on the payroll for that fuck dan monroe\n\njews like you give jews like me a bad name\n\ni know where you live......... go out with your friends dog for a walk be careful\n\ngo fuck yourself - you are  apiece of shit!'


73 (32.88%) negative label were predicted to be positive
Samples:


'Suck my horny dutch balls'

"Bongwarrior's mother loves to have sex with African Americans because she is a dirty whore who is addicted to crack.  Shoe began whoring herself out to gangs of African Americans when she was 16 to support her habit.\n\nOne night, she conceived Bongwarrior with one of her black pimps when she was 15.  After Bongwarrior was born, she gave him to her pimps, who pimped him out too.  As a result, Bongwarrior's anus suffered extensive damage.  He cannot control his defectaroy patterns, as a result of which he has to wear diapers.\n\nBongwarrior hates it when IP-hopping vandals inform the world of the truth about his mother.  He also doesn't realize that if he blocks an IP address, all the IP-hopping vandal has to do is weight for 10 minutes and then get a new IP address.  He is so retarted.  Or maybe he can't think because he is currently getting fucked in the ass by his mother's black pimp right now."

'So which is, Dourque? Are you a Black, a woman or a Hispanic? Or do asshole sons of tax cheats qualify as an under-represented minority on NASCAR?'

'Eat a dick faggot\nYou take internet too seriously. Anon will have your blood faggot.'

'Do humanity a favour \n\nand kill yourself, you nazi bias yank piece of shit  82.41.107.134'





### 4.2.7 Overview

In [27]:
overview.astype(int)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
label‰ of all,96,10,52,2,49,8
total wrong,1148,341,556,77,784,222
P->N,749,273,304,65,507,149
N->P,399,68,252,12,277,73
P->N %,65,80,54,84,64,67
N->P %,34,19,45,15,35,32
avg len,35,27,37,31,32,28
