# Active Learning Test

## Load Pretrained model

In [1]:
import tensorflow as tf

In [2]:
# Directory that holds the saved TensorFlow checkpoints of the pretrained model.
saver_path = "./logs/sf3/hybrid/ckpt"

In [3]:
# List the files present in the checkpoint directory.
%ls logs/sf3/hybrid/ckpt/

checkpoint                             model-380000.ckpt.index
model-360000.ckpt.data-00000-of-00001  model-380000.ckpt.meta
model-360000.ckpt.index                model-390000.ckpt.data-00000-of-00001
model-360000.ckpt.meta                 model-390000.ckpt.index
model-370000.ckpt.data-00000-of-00001  model-390000.ckpt.meta
model-370000.ckpt.index                model-final.ckpt.data-00000-of-00001
model-370000.ckpt.meta                 model-final.ckpt.index
model-380000.ckpt.data-00000-of-00001  model-final.ckpt.meta


In [4]:
# Read the checkpoint bookkeeping state stored under `saver_path` and show the
# first entry of its list of known checkpoint paths.
checkpoint_file = tf.train.get_checkpoint_state(saver_path)
first_checkpoint_path = checkpoint_file.all_model_checkpoint_paths[0]
print(first_checkpoint_path)

/home/homes/jhpark/hate-speech/logs/sf3/hybrid/ckpt/model-360000.ckpt


In [5]:
# Rebuild the graph and restore the weights from the SAME checkpoint.
# The meta graph used to be imported from the first entry of
# `all_model_checkpoint_paths` while the weights were restored from
# `model_checkpoint_path`; using a single path for both guarantees that the
# graph definition and the variable values belong together.
if checkpoint_file is None or not checkpoint_file.model_checkpoint_path:
    # get_checkpoint_state returns None when the directory has no checkpoint.
    raise IOError("No checkpoint found in {}".format(saver_path))
restore_path = checkpoint_file.model_checkpoint_path
saver = tf.train.import_meta_graph("{}.meta".format(restore_path))


# create session for evaluation
# (soft placement enabled, GPU memory fraction for this process set to 0.9)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
sess = tf.Session(config=session_conf)

saver.restore(sess, restore_path)

In [6]:
# Grab the default graph and display the name of every node in it, so the
# tensors to feed and fetch can be looked up by name.
graph = tf.get_default_graph()
graph_def = graph.as_graph_def()
[node.name for node in graph_def.node]

['input/X_word',
 'input/X_char',
 'input/labels',
 'input/one_hot/on_value',
 'input/one_hot/off_value',
 'input/one_hot/depth',
 'input/one_hot',
 'input/Reshape/shape',
 'input/Reshape',
 'dropout_keep_prob',
 'Const',
 'embedding/random_uniform/shape',
 'embedding/random_uniform/min',
 'embedding/random_uniform/max',
 'embedding/random_uniform/RandomUniform',
 'embedding/random_uniform/sub',
 'embedding/random_uniform/mul',
 'embedding/random_uniform',
 'embedding/W',
 'embedding/W/Assign',
 'embedding/W/read',
 'embedding/embedding_lookup',
 'embedding/ExpandDims/dim',
 'embedding/ExpandDims',
 'ExpandDims/dim',
 'ExpandDims',
 'channel0-conv-maxpool-1/truncated_normal/shape',
 'channel0-conv-maxpool-1/truncated_normal/mean',
 'channel0-conv-maxpool-1/truncated_normal/stddev',
 'channel0-conv-maxpool-1/truncated_normal/TruncatedNormal',
 'channel0-conv-maxpool-1/truncated_normal/mul',
 'channel0-conv-maxpool-1/truncated_normal',
 'channel0-conv-maxpool-1/W',
 'channel0-conv-maxpoo

## Load metadata & test set

Check whether the loaded graph computes correctly on the test set.

In [7]:
from data.hybrid import load_data_from_file

# Load the dataset tuple, then read the word/char input dimensions off the
# first training sample and the vocabulary size off the vocabulary object.
x_train, y_train, x_test, y_test, initW, vocab = load_data_from_file("sexism_final2_binary")
first_sample = x_train[0]
word_matrix, char_matrix = first_sample["word"], first_sample["char"]
word_text_len = word_matrix.shape[0]
word_vocab_size = len(vocab.vocabulary_)
char_text_len = char_matrix.shape[0]
char_vocab_size = char_matrix.shape[1]


Data Summary:
Train: Total Positive Labels=3284 (0.2372)
Test: Total Positive Labels=580 (0.2371)

dataset passed the assertion test


In [8]:
from data.hybrid import extract_from_batch

# extract_from_batch yields the two arrays that are fed to the word-level and
# char-level inputs of the restored graph; tensors are addressed by name.
batchW, batchC = extract_from_batch(x_test)
feed_dict = {
    "input/labels:0": y_test,
    "input/X_word:0": batchW,
    "input/X_char:0": batchC,
    "dropout_keep_prob:0": 1,  # keep probability of 1 for evaluation
}


In [9]:
# Run the restored graph on the test set and fetch its predictions.
pred = sess.run("output/prediction:0", feed_dict=feed_dict)

In [10]:
from model.helper import calculate_metrics

# Score the restored model's test predictions and print the three metrics.
precision, recall, f1 = calculate_metrics(y_test, pred)
summary = "precision=%.4f recall=%.4f f1=%.4f" % (precision, recall, f1)
print(summary)

precision=0.7039 recall=0.6845 f1=0.6941


Using TensorFlow backend.


Since the metrics are the same as the final training output, we can confirm that the pretrained model has been loaded successfully.

## Extend the graph to compute softmax prob & entropy
Use the entropy of the softmax distribution to measure the model's uncertainty and pick the samples to label.

In [11]:
# Extend the restored graph with the ops needed for uncertainty sampling.
logits = graph.get_tensor_by_name("output/logits:0")
softmax_prob = tf.nn.softmax(logits, name="softmax")
# Use log_softmax rather than log(softmax(...)): when a class probability
# underflows to 0, log(0) = -inf and 0 * -inf = NaN, which would poison the
# entropy and the top-k ranking below. log_softmax stays finite for finite logits.
log_prob = tf.nn.log_softmax(logits, name="log_softmax")
# Shannon entropy per row: H = -sum_c p_c * log(p_c), reduced over axis 1.
entropy = tf.negative(tf.reduce_sum(tf.multiply(softmax_prob, log_prob), axis=1), name="entropy")

# How many of the highest-entropy samples to return; fed at run time.
n_candidates = tf.placeholder(tf.int32, name="n_candidates")
get_candidates = tf.nn.top_k(entropy, n_candidates, name="candidates")

In [12]:
# Request the 20 highest-entropy test samples and fetch the class probabilities as well.
feed_dict[n_candidates] = 20
prob, candidates = sess.run([softmax_prob, get_candidates], feed_dict=feed_dict)

In [13]:
# Positions of the selected candidates (the `indices` field of the top-k result).
idx = candidates.indices

In [14]:
# Display the class probabilities of the selected candidates.
prob[idx]

array([[ 0.49941638,  0.50058359],
       [ 0.49806392,  0.50193608],
       [ 0.49654597,  0.50345409],
       [ 0.50561768,  0.49438229],
       [ 0.49417138,  0.50582862],
       [ 0.50745797,  0.49254203],
       [ 0.50861633,  0.49138361],
       [ 0.49130696,  0.50869304],
       [ 0.49092978,  0.50907016],
       [ 0.49062288,  0.50937712],
       [ 0.5094744 ,  0.49052563],
       [ 0.49036452,  0.50963545],
       [ 0.51098138,  0.48901856],
       [ 0.4885855 ,  0.51141447],
       [ 0.51210153,  0.48789847],
       [ 0.51247025,  0.48752975],
       [ 0.48717543,  0.51282454],
       [ 0.48616478,  0.51383519],
       [ 0.48614314,  0.51385683],
       [ 0.51516593,  0.48483407]], dtype=float32)

As you can see, the most uncertain samples (probabilities near 0.5) are chosen.

## Load new unlabelled samples

In [15]:
import pandas as pd

# Read the crawled tweets (tab separated, first row skipped, fixed column
# names, malformed rows dropped) and keep one row per distinct tweet text.
unlabelled = pd.read_csv(
    './data/crawled/unlabelled/sexism_tweets.tsv',
    sep="\t",
    header=None,
    skiprows=[0],
    names=["Tweet_ID", "Text", "Previous"],
    error_bad_lines=False,
).drop_duplicates(subset=["Text"])

In [16]:
# Summary statistics for every column, including the non-numeric ones.
unlabelled.describe(include="all")

Unnamed: 0,Tweet_ID,Text,Previous
count,11896,11895,10992.0
unique,11829,11895,
top,#MKR,"visiting great granny tonight, might miss #mkr :/",
freq,26,1,
mean,,,1.074373e+17
std,,,2.724271e+17
min,,,10904.0
25%,,,71952630.0
50%,,,391155500.0
75%,,,2887851000.0


In [175]:
# Materialise the tweet text column as a plain Python list and report its size.
texts = [text for text in unlabelled["Text"]]
print(len(texts))

11896


Remove most of the `#mkr` tweets and the gamergate tweets by replacing them with a numeric placeholder.

In [177]:

# Overwrite over-represented tweets with the all-digit placeholder "123" so
# that a later digit filter can drop them.
#   - at most `mkr` tweets containing "mkr" are overwritten;
#   - every tweet containing "gamergate" is overwritten.
# NOTE(review): `gamergate` is decremented but never compared against 0, so
# unlike `mkr` it does not act as a budget -- confirm whether a cap of 500 was intended.
mkr = 9000
gamergate = 500
for pos, raw_text in enumerate(texts):
    if mkr > 0 and "mkr" in str(raw_text).lower():
        mkr -= 1
        texts[pos] = "123"
    # Re-read the slot: it may just have been replaced by the placeholder.
    if "gamergate" in str(texts[pos]).lower():
        gamergate -= 1
        texts[pos] = "123"

Remove the entries that consist only of digits (this drops the numeric placeholders).

In [178]:
# Keep every entry whose string form is not made up of digits only.
final_filtered = [text for text in texts if not str(text).isdigit()]
print(len(final_filtered))

1395


## Split dataset into two

In [179]:
import os
import random

# Seed the global RNG so that, on a fresh Restart & Run All, the shuffle below
# (and therefore the split into the two sampling pools) is reproducible.
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)
random.shuffle(final_filtered)

In [180]:
# Split point: half of the filtered tweets, rounded down.
half_index = len(final_filtered) // 2

In [181]:
# The first half goes to the random-sampling pool, the remainder to the
# uncertainty-sampling pool; print both sizes.
pool_random_sampling, pool_uncertainty_sampling = final_filtered[:half_index], final_filtered[half_index:]
for pool in (pool_random_sampling, pool_uncertainty_sampling):
    print(len(pool))

697
698


In [185]:
# Persist the random-sampling pool the first time it is built and reuse the
# saved copy afterwards, so re-running the shuffling cells does not change
# which tweets belong to this pool.
random_pool_path = "./data/crawled/unlabelled/sexism_random.tsv"
if not os.path.isfile(random_pool_path):
    # Store exactly one tweet per line: embedded line breaks are collapsed to
    # spaces (otherwise one tweet would be read back as several entries) and
    # trailing whitespace is stripped, so the in-memory pool equals what the
    # load branch would produce from the saved file.
    pool_random_sampling = [" ".join(str(line).splitlines()).rstrip() for line in pool_random_sampling]
    # Explicit UTF-8 so the file does not depend on the platform default encoding.
    with open(random_pool_path, "w", encoding="utf-8") as f:
        for line in pool_random_sampling:
            f.write(line + "\n")
    print("Saved file")
else:
    print("load from file")
    with open(random_pool_path, "r", encoding="utf-8") as f:
        pool_random_sampling = [line.rstrip() for line in f]
    print(len(pool_random_sampling))

load from file
697


In [186]:
# Persist the uncertainty-sampling pool the first time it is built and reuse
# the saved copy afterwards, so re-running the shuffling cells does not change
# which tweets belong to this pool.
uncertain_pool_path = "./data/crawled/unlabelled/sexism_uncertain.tsv"
if not os.path.isfile(uncertain_pool_path):
    # Store exactly one tweet per line: embedded line breaks are collapsed to
    # spaces (otherwise one tweet would be read back as several entries) and
    # trailing whitespace is stripped, so the in-memory pool equals what the
    # load branch would produce from the saved file.
    pool_uncertainty_sampling = [" ".join(str(line).splitlines()).rstrip() for line in pool_uncertainty_sampling]
    # Explicit UTF-8 so the file does not depend on the platform default encoding.
    with open(uncertain_pool_path, "w", encoding="utf-8") as f:
        for line in pool_uncertainty_sampling:
            f.write(line + "\n")
    print("Saved file")
else:
    print("load from file")
    with open(uncertain_pool_path, "r", encoding="utf-8") as f:
        pool_uncertainty_sampling = [line.rstrip() for line in f]
    print(len(pool_uncertainty_sampling))

load from file
698


In [187]:
# List the files in the unlabelled-data directory.
%ls ./data/crawled/unlabelled/

democrat_tweets.tsv   republican_tweets.tsv  unlabelled_data_analysis.ipynb
racism_random.tsv     sexism_random.tsv      youtube1.csv
racism_tweets.tsv     sexism_tweets.tsv      youtube2.csv
racism_uncertain.tsv  sexism_uncertain.tsv   youtube3.csv


## Prepare tsv for labelling

Randomly select N samples from `pool_random_sampling`.

In [188]:
# Number of tweets to draw at random from the random-sampling pool.
N = 400

In [189]:
# Draw N distinct entries (sampling without replacement); random.sample raises
# ValueError if the pool holds fewer than N entries.
random_samples = random.sample(pool_random_sampling, N)

Prepare to feed pool_uncertainty_sampling into the classifier

In [190]:
from data.preprocess import preprocess_tweet

In [191]:
# Preprocess every pooled tweet and keep only those whose preprocessed text is
# truthy (non-empty), retaining the raw and preprocessed versions in parallel lists.
preprocessed = [preprocess_tweet(str(raw)) for raw in pool_uncertainty_sampling]
kept_pairs = [(raw, cleaned) for raw, cleaned in zip(pool_uncertainty_sampling, preprocessed) if cleaned]
valid_tweets = [raw for raw, _cleaned in kept_pairs]
valid_tweets_preprocessed = [cleaned for _raw, cleaned in kept_pairs]
print(len(valid_tweets))
print(len(valid_tweets_preprocessed))
print(valid_tweets_preprocessed[:10])

682
682
["hope josh doesn't forget to take his seasick tablets tonight looks like there are storms on the horizon mkr", 'truth is truth, not an argument, period. false498a, fakedv, legalterrorism legalextortion falserape fakefeminism fakemolestation', "the fault lies with bioware for their game, and ultimately it's the game itself that should be criticized.", 'it is 7pm on sunday which means it is time for the first mkr for the week! it is another super sunday sudden death cook off!', 'the seafood dick is really unlikeable mkr', "absolutely disgusting treatment of the universally respected judge gorsuch by the reprehensible so-called 'democrats' uggh fakefeminism", 'valeriecourtney good luck girls', "josh get your hands off the fish, you've been dethroned mkr", 'all that oil???? so its salmon with 1000 extra calories ... mkr', 'notallmen my babydaddy is as responsive as any woman probably explains why baby likes him more. biology is not destiny']


### char features

In [192]:
from data.char import text_to_1hot_matrix
import numpy as np

In [193]:
# Build the char-level input: one matrix per preprocessed tweet, stacked into a single array.
char_matrices = [text_to_1hot_matrix(str(text)) for text in valid_tweets_preprocessed]
pool_char = np.array(char_matrices)
print(pool_char.shape)

(682, 140, 70)


### word features

In [194]:
from data.word import load_data_from_file as load_vocabulary
from data.tokenizer import tokenize_with_dictionary
import numpy as np

In [195]:
# Reload the dataset through the word-level loader, keeping only the third and
# sixth returned values (bound to `x_test` and `vocab`).
# NOTE(review): this rebinds `x_test` and `vocab`, which an earlier cell set from
# data.hybrid.load_data_from_file; cells re-run after this one will see these
# values instead -- confirm this is intended.
_, _, x_test, _, _, vocab = load_vocabulary("sexism_final2_binary")

In [196]:
# Tokenise every preprocessed tweet against the keys of the vocabulary mapping.
known_words = vocab.vocabulary_._mapping.keys()
tokenized = [tokenize_with_dictionary(text, known_words) for text in valid_tweets_preprocessed]

In [197]:
# Inspect the token lists of the first ten tweets.
print(tokenized[:10])

[['hope', 'josh', "doesn't", 'forget', 'to', 'take', 'his', 'seasick', 'tablets', 'tonight', 'looks', 'like', 'there', 'are', 'storms', 'on', 'the', 'horizon', 'mkr'], ['truth', 'is', 'truth', 'not', 'an', 'argument', 'period', 'false', '498a', 'fake', 'dv', 'legal', 'terrorism', 'legal', 'extortion', 'false', 'rape', 'fake', 'feminism', 'fake', 'molestation'], ['the', 'fault', 'lies', 'with', 'bioware', 'for', 'their', 'game', 'and', 'ultimately', "it's", 'the', 'game', 'itself', 'that', 'should', 'be', 'criticized'], ['it', 'is', '7pm', 'on', 'sunday', 'which', 'means', 'it', 'is', 'time', 'for', 'the', 'first', 'mkr', 'for', 'the', 'week', 'it', 'is', 'another', 'super', 'sunday', 'sudden', 'death', 'cook', 'off'], ['the', 'seafood', 'dick', 'is', 'really', 'unlikeable', 'mkr'], ['absolutely', 'disgusting', 'treatment', 'of', 'the', 'universally', 'respected', 'judge', 'gorsuch', 'by', 'the', 'reprehensible', 'so', 'called', 'democrats', 'uggh', 'fake', 'feminism'], ['valerie', 'cou

In [198]:
# The longest token sequence must fit into the second dimension of x_test.
longest_sequence = max(len(tokens) for tokens in tokenized)
assert x_test.shape[1] >= longest_sequence

In [199]:
# Map each token sequence to word ids with the EXISTING vocabulary.
# `transform` (not `fit_transform`) is used on purpose: this is inference-time
# data, and re-fitting on it may alter the vocabulary object, so the ids would
# no longer be guaranteed to match the ones the pretrained model was trained with.
joined = [" ".join(tokens) for tokens in tokenized]
pool_word = np.array(list(vocab.transform(joined)))
print(pool_word.shape)

(682, 36)


## Plug into the model

In [200]:
# Feed dictionary for the unlabelled pool; an all-zero (n, 1) array is fed to
# the label input since these tweets have no labels.
n = len(valid_tweets)
dummy_labels = np.zeros((n, 1))
unlabelled_feed_dict = {
    "input/labels:0": dummy_labels,
    "input/X_word:0": pool_word,
    "input/X_char:0": pool_char,
    "dropout_keep_prob:0": 1,
}

In [201]:
# Request the 200 highest-entropy tweets of the pool together with the class probabilities.
unlabelled_feed_dict[n_candidates] = 200
_prob, _candidates = sess.run([softmax_prob, get_candidates], feed_dict=unlabelled_feed_dict)
idx = _candidates.indices

In [202]:
# Convert to a NumPy array so the tweets can be selected with index arrays below.
valid_tweets = np.array(valid_tweets)


### find the best predictions

In [203]:
# Column 1 of the softmax output is used as the positive-class probability.
positive_prob = _prob[:, 1]
# Indices of the 200 largest positive probabilities (argpartition leaves them unordered).
top_pos_idx = np.argpartition(positive_prob, -200)[-200:]
for sample_idx in top_pos_idx:
    print("%s %s" % (valid_tweets[sample_idx], positive_prob[sample_idx]))


I feel sorry for Amy, she married a douche! 0.643221
@Darksoulsthree @sweetparadise40 @PrisonPlanet #Feminazi 's sure have plenty of time. Perhaps they are unemployed spinsters dying with cats! 0.64758
Josh- Can't cook fish 0.660457
Josh is going to "wing it" -  he can't even master a recipe #mkr 0.666461
@TECHXEC At this point, you're just seeking attention. If it doesn't apply to you, move along. But don't #NotAllMen in my mentions. 0.671014
can't see today is going to be Josh's best #MKR 0.689561
TRUTH is TRUTH, not an argument, period. #false498a, #fakeDV, #legalterrorism #legalextortion #falserape #fakefeminism #fakemolestation 0.689644
MESSAGE TO ALL: Time to give Josh some positive tweet-love. 0.695275
This crab is ridiculously soft.. well no, then we'd be calling them 'ridiculously soft-shell crabs' Josh you massive twat #MKR 0.715988
oh no 0.70359
No Josh it never worked for you #mkr 0.726333
Really hope Kelsey and Amanda kick ass!! The  Seafood King can go down with the ship 

It seems like the model is biased towards certain hashtags such as #mkr.

### find the most uncertain predictions

In [204]:
# Look up and print the raw tweets at the candidate indices in `idx`.
candidate_tweets = valid_tweets[idx]
print(candidate_tweets)

['Wife says: " I can\'t watch Collin eat...."' 'Oh no Kelsey chin up! #mkr'
 "How has Josh's face not been deep fried? #MKR"
 'soz josh is a rude bastard #mkr'
 'I think we have seen your best, Josh, and your best is utter shit. #MKR'
 "Every time you counter a sexist comment on a Whatsapp group, you're called a #Feminazi."
 '#MKR Well done Kelsey &amp; Amanda on your Entree.'
 'Josh: "I\'ve burnt prawns in a galley before, we\'ll be fine" #MKR'
 'Here they come..'
 "You muck it up cause you can't cook it Josh your delusional. #mkr"
 '@streetvoiceuk oh really? Why did he choose 2 come back to live with me then? Stop making things up you #feminazi #twat'
 'I am the seafood king....but fuck up every bit of seafood you touch....'
 'What does #Amy do when she sees #Josh staggering around the back yard? She reloads.  #mkr'
 'I need southern friend chicken too.'
 "Josh loves a crushed Asian? I knew I didn't like him. #mkr"
 'Could it be....?'
 "There was a look in Amy's eyes just then as Jos

### combine both and make into file

In [205]:
# Merge the most-uncertain and the top-positive selections. A tweet can be in
# both groups, so take the union of the two index sets; plain concatenation
# would put the same tweet into the labelling file twice.
combined_idx = np.union1d(idx, top_pos_idx)

In [206]:
# Tag every tweet with its origin: 1 = selected through the model's outputs,
# 2 = drawn at random.
active_learning_samples = valid_tweets[combined_idx]
model_picked = [(1, tweet) for tweet in active_learning_samples]
randomly_picked = [(2, tweet) for tweet in random_samples]
total_tweets = model_picked + randomly_picked

In [207]:
# Total number of tweets that will be written out for labelling.
len(total_tweets)

800

In [208]:
# Shuffle in place so the two sources are interleaved in the output file.
random.shuffle(total_tweets)

In [209]:
# Write one "<source tag>\t<tweet>" record per line.
# - Explicit UTF-8: tweets may contain non-ASCII text, so do not rely on the
#   platform default encoding.
# - Tabs and line breaks inside a tweet are replaced by spaces so they cannot
#   add a column or split the record across lines.
with open("./data/crawled/sexism_to_be_labelled.tsv", "w", encoding="utf-8") as f:
    for source_tag, tweet in total_tweets:
        flat_tweet = " ".join(str(tweet).splitlines()).replace("\t", " ")
        f.write("%s\t%s\n" % (source_tag, flat_tweet))