# Error analysis

## Load Pretrained model

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
saver_paths = ["./logs/rf2/char/ckpt", "./logs/rf2/word/ckpt", "./logs/rf2/hybrid/ckpt"]
checkpoint_files = list(map(tf.train.get_checkpoint_state, saver_paths))

In [3]:
checkpoint_files

[model_checkpoint_path: "/home/homes/jhpark/hate-speech/logs/rf2/char/ckpt/model-final.ckpt"
 all_model_checkpoint_paths: "/home/homes/jhpark/hate-speech/logs/rf2/char/ckpt/model-final.ckpt",
 model_checkpoint_path: "/home/homes/jhpark/hate-speech/logs/rf2/word/ckpt/model-final.ckpt"
 all_model_checkpoint_paths: "/home/homes/jhpark/hate-speech/logs/rf2/word/ckpt/model-final.ckpt",
 model_checkpoint_path: "/home/homes/jhpark/hate-speech/logs/rf2/hybrid/ckpt/model-final.ckpt"
 all_model_checkpoint_paths: "/home/homes/jhpark/hate-speech/logs/rf2/hybrid/ckpt/model-final.ckpt"]

In [4]:
# create session for evaluation
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)


In [5]:
from data.hybrid import load_data_from_file

(x_train, y_train, x_test, y_test, initW, vocab) = load_data_from_file("racism_final2_binary")
word_text_len = x_train[0]["word"].shape[0]
word_vocab_size = len(vocab.vocabulary_)
char_text_len = x_train[0]["char"].shape[0]
char_vocab_size = x_train[0]["char"].shape[1]



Data Summary:
Train: Total Positive Labels=1750 (0.1421)
Test: Total Positive Labels=309 (0.1421)

dataset passed the assertion test


In [6]:
from data.hybrid import extract_from_batch

batchW, batchC = extract_from_batch(x_test)
feed_dicts = []
feed_dicts.append({"input/Placeholder_1:0": y_test, "input/Placeholder:0": batchC, "nn-layers/fully-connected-layer-0/dropout_1/keras_learning_phase:0": 0})
feed_dicts.append({"input/labels:0": y_test, "input/X:0": batchW, "dropout_keep_prob:0": 1})
feed_dicts.append({"input/labels:0": y_test, "input/X_word:0": batchW, "input/X_char:0": batchC, "dropout_keep_prob:0": 1})

output_names = ["prediction/prediction", "output/prediction", "output/prediction"]

In [7]:
preds = []
for i, ckpt in enumerate(checkpoint_files):
    print(ckpt.all_model_checkpoint_paths[0])
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph("{}.meta".format(ckpt.all_model_checkpoint_paths[0]))

    with tf.Session(config=session_conf) as sess:
        saver.restore(sess, ckpt.model_checkpoint_path)
        graph = tf.get_default_graph()
        pred = sess.run(output_names[i] + ":0", feed_dicts[i])
        print(pred.shape)
        preds.append(pred)

/home/homes/jhpark/hate-speech/logs/rf2/char/ckpt/model-final.ckpt
(2175,)
/home/homes/jhpark/hate-speech/logs/rf2/word/ckpt/model-final.ckpt
(2175,)
/home/homes/jhpark/hate-speech/logs/rf2/hybrid/ckpt/model-final.ckpt
(2175,)


In [8]:
from model.helper import calculate_metrics
for pred in preds:
    precision, recall, f1 = calculate_metrics(y_test, pred)
    print("precision=%.4f recall=%.4f f1=%.4f" % (precision, recall, f1))

precision=0.6667 recall=0.7832 f1=0.7202
precision=0.7110 recall=0.7961 f1=0.7511
precision=0.7086 recall=0.8026 f1=0.7527


Using TensorFlow backend.


since the metrics are same as the final output, we can validate that the pre-trained model has been loaded successfully

## Load original test data

In [14]:
from data.preprocess import load_from_file
data = load_from_file("racism_final2_binary")

In [15]:
data["x_test"][:10]

array([ 'there are no ukr nazis. get an education. bandera did not hold a nazi ideaology.',
       'yep - karlie kloss et al are shaking in their boots. mkr',
       'miles of nothing but farmland in either direction, here stands a lone palm tree. california.',
       'anonymous hacker group claims to have taken down more than 1,000 isis sites and emails. opisis',
       'and now mods of ghazi are under fire by a mob that appears to have been led by you. :(',
       "it's official: the fcc will motion to have the internet filed under title ii as a utility.",
       'so i can sympathize and understand where this is all coming from.',
       'thank you',
       "how's colin and these rounds working for you.. pretty crap i'd say from the tweets.. needarethinkinformat mkr",
       'it was sunny out today! :p i went to the dog park for an hour.'], 
      dtype='<U157')

In [16]:
from data.char import one_hot_to_chars
["".join(one_hot_to_chars(x)) for x in batchC[:10]]

['therearenoukrnazis.getaneducation.banderadidnotholdanaziideaology.',
 'yep-karlieklossetalareshakingintheirboots.mkr',
 'milesofnothingbutfarmlandineitherdirection,herestandsalonepalmtree.california.',
 'anonymoushackergroupclaimstohavetakendownmorethan1,000isissitesandemails.opisis',
 'andnowmodsofghaziareunderfirebyamobthatappearstohavebeenledbyyou.:(',
 'itsofficial:thefccwillmotiontohavetheinternetfiledundertitleiiasautility.',
 'soicansympathizeandunderstandwherethisisallcomingfrom.',
 'thankyou',
 'howscolinandtheseroundsworkingforyou..prettycrapidsayfromthetweets..needarethinkinformatmkr',
 'itwassunnyouttoday!:piwenttothedogparkforanhour.']

## Loading Baseline

In [9]:
import numpy as np
from sklearn.linear_model import LinearRegression
from data.preprocess import load_from_file
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [10]:
def evaluate(pred_scores, target):
    pred = list(map(lambda x: 1 if x >= 0.5 else 0, pred_scores))
    precision, recall, f1 = calculate_metrics(target, pred)
    print("Training: Precision=%.2f Recall=%.2f, F1=%.2f" %(precision, recall, f1))
    return f1

In [11]:
def lr_with_freq(data):
    vect = CountVectorizer(ngram_range=(2,5), analyzer="char")
    X_train_counts = vect.fit_transform(data["x_train"])
    print(X_train_counts.shape)
    
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
    lr = LinearRegression().fit(X_train_tfidf, data["y_train"])
    evaluate(lr.predict(X_train_tfidf), data["y_train"])
    
    X_test_counts = vect.transform(data["x_test"])
    pred_scores = lr.predict(tfidf_transformer.transform(X_test_counts))
    evaluate(pred_scores, data["y_test"])
    return lr, pred_scores

In [12]:
_, baseline_pred = lr_with_freq(data)

NameError: name 'data' is not defined

In [13]:
baseline_pred = list(map(lambda x: 1 if x >= 0.5 else 0, baseline_pred))

NameError: name 'baseline_pred' is not defined

In [33]:
len(baseline_pred)

2175

## Baseline vs CNN

### Char vs Word

In [8]:
char_pred = preds[0]
word_pred = preds[1]
hybrid_pred = preds[2]


In [49]:
word_right = []
word_wrong = []
for i in range(len(baseline_pred)):
    if char_pred[i] == 0 and word_pred[i] == 1 and y_test[i] == 1:
        word_right.append(data["x_test"][i])
    if char_pred[i] == 1 and word_pred[i] == 0  and y_test[i] == 1:
        word_wrong.append(data["x_test"][i])
    
                             

In [50]:
len(word_right)

21

In [51]:
len(word_wrong)

17

In [52]:
word_right

['our journalists watch isis burn a man in a cage, and they respond with the same tired cliche about "hijacking the religion" bs! it\'s islam',
 'if someone is going to die getting isis out of tikrit, it may as well be the shia militia. that is all good. islam daesh',
 'the holocaust was by nazis. and the people who fought, killed and stopped them where christian westerners.',
 "it's not about satire or free speech. pretending it is will solve nothing. the problem is that the quran is full of incitement for murder.",
 "the reason these politically correct idiots will never solve problem is because they won't acknowledge that it's islam",
 'baby you can drive my car asian juicyasian sex nsfw adult xxx',
 'the question is not about people doing bad things, it is about the religion supporting and demanding those things.',
 'sambo coon',
 'have they told the tribe yet that their daughters would become whores for sexual jihad?',
 'wrong. apostacy is the equivalent of leaving america and bec

In [53]:
word_wrong

["women did't lead squat. a woman can never be caliph. her testimony is half that of a man. she get's nothing for jihad.",
 'islam is a believe system based mainly on illusion. overlooking gross barbarity is necessary.',
 'socialists and moslems - working together to destroy australia.',
 'wrong, microbrain, she was 6, and you, like your prophet, are a liar. here is the proof',
 'the kurds clean up the isis islamofascist trash in kobane. kobane is the graveyard of isis. via',
 "at least i don't follow pedophile prophets or believe in mythical creatures of fire called jinn.",
 'yes, if we just sit on our butts and let them murder and enslave ezidis and christians, ...',
 'yes, they tried to murder their way to heaven. a total idiot would believe in such an allah.',
 'arab man arrested trying to enter same nuclear facility warned: islamic_terror_bomb_waiting_to_go_off.php',
 'looks like a slum, as people are forced to go to mosque at gun point. daesh are creating hundreds of thousands of

In [21]:
word_right_neg = []
word_wrong_neg = []
for i in range(len(word_pred)):
    if char_pred[i] == 1 and word_pred[i] == 0 and y_test[i] == 0:
        word_right_neg.append(data["x_test"][i])
    if char_pred[i] == 1 and word_pred[i] == 0  and y_test[i] == 1:
        word_wrong_neg.append(data["x_test"][i])

In [22]:
word_right_neg

["it's official: the fcc will motion to have the internet filed under title ii as a utility.",
 "i wrote a tool so people wouldn't have to listen to their hate filled rants.",
 'turkey: police use water cannon on anti-islamization protesters',
 "i've already explained why this is lies and propaganda many times.",
 "and i don't like the way that obama is ignoring isis in libya.",
 'charliehebdo i urge the modi government to show some moral courage and lift the ban on the satanic verses',
 'the daesh are so scared and helpless that all they have left is a propaganda campaign of lies.',
 'and ignorant uneducated social rejects are all that daesh can attract.',
 'islamic state claims abduction of 21 christians in libya via',
 "i don't care about slavo turks or idiots that live on race baiting. get lost.",
 'that "actions of a few" line was already stupid 1000 terrorist attacks ago.',
 'what i gave you is the historical research. ever hear of it, or do you just limit yourself to invented su

In [23]:
word_wrong_neg

["women did't lead squat. a woman can never be caliph. her testimony is half that of a man. she get's nothing for jihad.",
 'islam is a believe system based mainly on illusion. overlooking gross barbarity is necessary.',
 'socialists and moslems - working together to destroy australia.',
 'wrong, microbrain, she was 6, and you, like your prophet, are a liar. here is the proof',
 'the kurds clean up the isis islamofascist trash in kobane. kobane is the graveyard of isis. via',
 "at least i don't follow pedophile prophets or believe in mythical creatures of fire called jinn.",
 'yes, if we just sit on our butts and let them murder and enslave ezidis and christians, ...',
 'yes, they tried to murder their way to heaven. a total idiot would believe in such an allah.',
 'arab man arrested trying to enter same nuclear facility warned: islamic_terror_bomb_waiting_to_go_off.php',
 'looks like a slum, as people are forced to go to mosque at gun point. daesh are creating hundreds of thousands of

### Word vs Hybrid

In [22]:
hybrid_right = []
hybrid_wrong = []
for i in range(len(baseline_pred)):
    if word_pred[i] == 0 and hybrid_pred[i] == 1 and y_test[i] == 1:
        hybrid_right.append(data["x_test"][i])
    if word_pred[i] == 1 and hybrid_pred[i] == 0  and y_test[i] == 1:
        hybrid_wrong.append(data["x_test"][i])
    
                             

In [23]:
hybrid_right

['wheras the arabs really are behaving people and you scum are ignoring it.',
 'wrong, microbrain, she was 6, and you, like your prophet, are a liar. here is the proof',
 "at least i don't follow pedophile prophets or believe in mythical creatures of fire called jinn.",
 'yes, they tried to murder their way to heaven. a total idiot would believe in such an allah.',
 'looks like a slum, as people are forced to go to mosque at gun point. daesh are creating hundreds of thousands of atheists.',
 'you only have to count the corpses to see that islamic extremism is widespread.jewish not',
 'watching shia militia beat a peshmerga to death, i thought, "praying to allah 5 times a day certainly peaks the level of fanatical hatred"',
 'men and women are divided by propaganda, pushed by homosexuals, on behalf of zionist jews &gt;&gt; &lt;&lt; mgtow feminazi',
 "you don't guarantee them protection, you guarantee them third class citizenship and extortion."]

In [24]:
hybrid_wrong

['the holocaust was by nazis. and the people who fought, killed and stopped them where christian westerners.',
 'muslim found guilty of plotting to behead british soldiers-cases like this need the death penalty',
 'in the uk like idiots we give jihadis rehabilitation, jordan execute them. we are a joke',
 'where is your evidence that they are furious goat fucker?',
 "i don't care what they retracted. the no go zones are there. here is an example.",
 'allah the terrorist can stick what he wants up his giant ass.',
 'why do you think israel is most technological country in that part of world']

In [25]:
hybrid_right_neg = []
hybrid_wrong_neg = []
for i in range(len(baseline_pred)):
    if hybrid_pred[i] == 0 and word_pred[i] == 1 and y_test[i] == 0:
        hybrid_right_neg.append(data["x_test"][i])
    if hybrid_pred[i] == 1 and word_pred[i] == 0 and y_test[i] == 0:
        hybrid_wrong_neg.append(data["x_test"][i])

In [26]:
hybrid_right_neg

['and now mods of ghazi are under fire by a mob that appears to have been led by you. :(',
 'pilgar sides with communists, authoritarian dictators, and terrorist in every single case.',
 'okay,you had your chance.you are are race baiting pos and gone',
 "is#iranian leaders smarter than#western leaders?? up to date so called#war on#terror gave#iran power n upper hands over entire arab's lands",
 'how did we not find each other years ago and become besties',
 'people like blumenthal only care about their own fame as fake crusaders for justice.',
 'it read to me like you like him bcz he supports yr book',
 'is that suppose to be an excuse for the shia militia animals beating a peshmerga to death?',
 'company profile maine coon breeders and fanciers association - maine coon breeders and fanciers association',
 'robert spencer in pj media: muslim holidays in new york public schools: why not?',
 'and of course i would also fight against any theocracy - regardless of religion.',
 'the police 

In [27]:
hybrid_wrong_neg

["it's official: the fcc will motion to have the internet filed under title ii as a utility.",
 'palestinians must go back to egypt and saudi where they came from. there will be no withdrawal.',
 'the daesh are so scared and helpless that all they have left is a propaganda campaign of lies.',
 '..of india. he calls it the bloodiest chapter in history, and he documents all of it.',
 'that "actions of a few" line was already stupid 1000 terrorist attacks ago.',
 'ghazi has been criticized a few times in the past, and every time they reacted by reasonably discussing &amp; adapting.',
 'your idea that violence towards men and violence towards women are of equal weight ignores the reality of violence women face.',
 'have you noticed how isis prices women in the slave market?',
 'how islam rolls. auspol politas',
 "i don't have a religion, so go fuck yourself. racists like you hold humanity back.",
 'and there is no palestine and never has been a palestine.',
 "i don't believe in religion. b

In [22]:
correct = 0
hybrid_also_correct = 0
word_also_correct = 0
agreement = 0 
for i in range(len(hybrid_pred)):
    if y_test[i] == 1:
        correct += 1
        if char_pred[i] == word_pred[i]:
            agreement +=1
            

In [23]:
agreement/correct #char/word

0.8770226537216829

In [17]:
agreement/correct #hybrid/word

0.948220064724919

In [21]:
agreement/correct #hybrid/char

0.8770226537216829

In [34]:
all_wrong = []
for i in range(len(char_pred)):
    if baseline_pred[i] == 0 and char_pred[i] == 0 and word_pred[i] == 0 and hybrid_pred[i] == 0 and y_test[i] == 1:
        all_wrong.append(data["x_test"][i])
    
                             

In [35]:
all_wrong

["isis continues their big successes against unarmed civilians. they just can't win a fight against anything but families.",
 'thanks for showing again that blacks are the biggest racists. blocked',
 'i see mein public shaming vorked. zee vile carolcorps vill now have black members in zee comics. strucker: i put zee nazi in feminazi',
 'and is this a lie that westerners created?',
 'tokyo hot n1049 endless sex drive - dailyxlover jav asian',
 'isis has begun recruiting moroccan prostitutes for its militants in syria and iran, luring them with promises of money …',
 "no,it's the absolute truth and you can verify it through hundreds of sources. your anti semitism is scripted propaganda.",
 'managed to hit a bird and a small rodent on my drive to huxley this evening oops asian female teen triplethreat',
 'i would rather lose 100k followers by standing up for others in a bad position than gain 100 by keeping my mouth shut fo…',
 "load of bs. their numbers fell steadily since then. and you'