# Utilizing a Fine-Tuned BERT Model

<img src="figures/spam_bot.jpg" width="600px">

In this tutorial, we investigate a fine-tuned model designed to distinguish between social_spambots_2 bot accounts and genuine accounts. We will discover a failure case that could be exploited for adversarial gain.

Important BERT links:

https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html

https://github.com/google-research/bert

https://arxiv.org/pdf/1810.04805.pdf


During this exploration, we start with a bot recall of 96% and apply text transformations that lower recall to 78%. Further, combining that text transformation with a prepended phrase lowers recall to 38%.


In [109]:
# Fix import path
# Some configuration was missing in the AWS image. This fixes it.
import os
# Make sure ~/botornot/src/__init__.py exists (touch creates it when missing).
os.system("touch ~/botornot/src/__init__.py")
import sys
# Put the project source directories at the front of sys.path -- presumably so
# that `import botornot_study` and the bundled bert code resolve in later cells.
sys.path.insert(0, os.path.expanduser("~/botornot/src"))
sys.path.insert(0, os.path.expanduser("~/botornot/src/botornot_study"))
sys.path.insert(0, os.path.expanduser("~/botornot/src/botornot_study/bert"))
# Upgrade tensorflow_estimator; os.system returns the command's exit status,
# which the notebook echoes as the cell result.
os.system("pip install -U tensorflow_estimator")



0

In [110]:
import botornot_study
import pandas
import numpy as np
from tqdm import tqdm
import collections
import os
import glob
import shutil
from sklearn.metrics import classification_report
from sklearn import metrics
import subprocess


def assert_path(checkpoint):
    """
    Fail fast when nothing on disk starts with the given path.

    The path itself does not have to exist: a checkpoint name is only the
    prefix of the real files, so the check globs for ``checkpoint + "*"``.
    :param checkpoint: path (or path prefix) that must match at least one file
    :return: None
    :raises RuntimeError: when no existing path starts with ``checkpoint``
    """
    matches = glob.glob(checkpoint + "*")
    if matches:
        return
    raise RuntimeError("%s does not exist" % checkpoint)


def launch_test_process(dev_df, model_dir, batch_size, run_classifier_path="../bert/run_classifier.py"):
    """
    Write a DataFrame as the BERT test set and run the classifier script on it.

    The frame is saved to ``<model_dir>/test.csv`` and ``run_classifier_path`` is
    executed in prediction mode as a child process.  The call blocks until the
    child exits.  Load the result with function "load_test_result".

    :param dev_df: Should have a properly preprocessed field "text"
    :param model_dir: path to bert model fine tuned to bot detection.  It is used as
        data dir, output dir and location of bert_config.json / vocab.txt, and must
        contain "fine_tune_conf.json" with the keys "max_sequence_length",
        "do_lower_case" and "init_checkpoint".
    :param batch_size: accepted for interface compatibility; it is not forwarded
        to the classifier script.
    :param run_classifier_path: path of the run_classifier.py script to execute
    :return: None
    :raises RuntimeError: if the classifier process exits with a non-zero status
    """
    import json
    import shlex
    import sys

    print("Saving dev_df as test set for prediction")
    dev_df.to_csv(os.path.join(model_dir, "test.csv"), index=False)

    with open(os.path.join(model_dir, "fine_tune_conf.json")) as fp:
        fine_tune_conf = json.load(fp)

    args = [
        "--do_predict=True",
        "--task_name=text_classifier",
        "--data_dir=%s" % model_dir,
        "--output_dir=%s" % model_dir,
        "--bert_config_file=%s" % os.path.join(model_dir, "bert_config.json"),
        "--vocab_file=%s" % os.path.join(model_dir, "vocab.txt"),
        "--init_checkpoint=%s" % os.path.join(model_dir, fine_tune_conf["init_checkpoint"]),
        "--max_seq_length=%s" % fine_tune_conf["max_sequence_length"],
        "--do_lower_case=%s" % fine_tune_conf["do_lower_case"],
    ]
    # Run with the interpreter executing this code so the child sees the same
    # environment; a bare "python" on PATH may be a different installation.
    command = [sys.executable, run_classifier_path] + args

    print(" " + " ".join(args))
    print("Launching external command")
    print("bert uses a %s/test.csv as input." % model_dir)
    # This is the file load_test_result reads back.
    print("And, bert uses %s/test_results.tsv as output." % model_dir)
    print("BERT START (wait)")
    print("RUNNING THE FOLLOWING COMMAND")
    print("\n" + " ".join(shlex.quote(part) for part in command) + "\n")

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    completed = subprocess.run(command)
    if completed.returncode != 0:
        # Without this check a failed run would go unnoticed and
        # load_test_result would read results left over from an earlier run.
        raise RuntimeError(
            "%s exited with status %s" % (run_classifier_path, completed.returncode)
        )
    return None

def load_test_result(test_df, model_dir, copy=True):
    """
    Loads a result from a launch_test_process(test_df, model_dir) call.

    Reads ``<model_dir>/test_results.tsv`` (tab separated, no header, one column
    per class) and ``<model_dir>/class_info.csv`` (columns "one_hot_class" and
    "class_label"), then adds one "prob_<class_label>" column per class to the
    frame.
    :param test_df: frame that was classified; its row order must match the result file
    :param model_dir: directory holding test_results.tsv and class_info.csv
    :param copy: when True (default) work on a copy and leave ``test_df`` untouched
    :return: the frame with the probability columns added
    :raises ValueError: if the result file has a different number of rows than ``test_df``
    """
    if copy:
        test_df = test_df.copy()

    class_info = pandas.read_csv(os.path.join(model_dir, "class_info.csv"))
    result = pandas.read_csv(os.path.join(model_dir, "test_results.tsv"), sep="\t", header=None)

    # A mismatch means the result file belongs to a different frame (for
    # example a previous run); report that instead of a bare pandas error.
    if len(result) != len(test_df):
        raise ValueError(
            "test_results.tsv has %d rows but test_df has %d rows"
            % (len(result), len(test_df))
        )

    # Column i of the result file holds the probability of the class whose
    # one_hot_class value is i.
    columns = [""] * len(result.columns)
    for t in class_info.itertuples():
        columns[t.one_hot_class] = "prob_%s" % t.class_label
    result.columns = columns

    # .values drops the result's index so rows are matched by position.
    for k in result.columns:
        test_df[k] = result[k].values
    return test_df


def copy_essential_model_files(bert_model_dir, test_model_dir, checkpoint):
    """
    Prepare the fine-tuned model directory so prediction can run from it alone.

    Writes "fine_tune_conf.json" into test_model_dir (settings that, in
    retrospect, should have been saved at fine-tuning time) and copies
    "bert_config.json" and "vocab.txt" over from the original bert model.
    :param bert_model_dir:  Original bert model (not fine-tuned).
    :param test_model_dir: Fine-tuned bert model for bot detection task
    :param checkpoint: Checkpoint to use for fine-tuned task; only its file name is stored.
    :return: the configuration dict that was written to fine_tune_conf.json
    """
    import json
    import shutil

    model_fine_tune_conf = {
        "do_lower_case": False,
        "max_sequence_length": 128,
        "init_checkpoint": os.path.basename(checkpoint),
    }

    conf_path = os.path.join(test_model_dir, "fine_tune_conf.json")
    with open(conf_path, "w") as fp:
        fp.write(json.dumps(model_fine_tune_conf))

    for name in ("bert_config.json", "vocab.txt"):
        source = os.path.join(bert_model_dir, name)
        target = os.path.join(test_model_dir, name)
        shutil.copy(source, target)

    return model_fine_tune_conf

def load_original_test_csv(model_dir):
    """
    Return the original test split, keeping a backup copy of it on disk.

    This notebook continuously rewrites test.csv to communicate with BERT.  While
    test.csv still holds the test split (first row has split == "test") it is
    backed up to original_test.csv; once it has been overwritten the backup is
    loaded instead.
    :param model_dir: directory holding test.csv (and, later, original_test.csv)
    :return: DataFrame with the original test split
    """
    test_df = pandas.read_csv(os.path.join(model_dir, "test.csv"))
    backup_path = os.path.join(model_dir, "original_test.csv")
    # .iloc reads the first row by position rather than by index label.
    if test_df["split"].iloc[0] == "test":
        print("saving original_test.csv for backup")
        # index=False so the backup reloads with exactly the same columns;
        # otherwise the index comes back as an extra "Unnamed: 0" column.
        test_df.to_csv(backup_path, index=False)
    else:
        print("loading original_test.csv as test_df")
        test_df = pandas.read_csv(backup_path)
    return test_df


def score(true, pred):
    """
    Summarise binary classification quality for the last (bot) class.

    :param true: true class labels
    :param pred: predicted class labels
    :return: OrderedDict with "precision", "recall", "f1" and "support" of the
        last class reported by sklearn's precision_recall_fscore_support
    :raises ValueError: if the labels do not contain exactly two classes
    """
    precision, recall, f1, support = metrics.precision_recall_fscore_support(true, pred)
    # Only the two-class case is handled; with a single class the "last" entry
    # would not be the bot class.  Raised explicitly because assert statements
    # are stripped under ``python -O``.
    if len(precision) != 2:
        raise ValueError("score() expects exactly 2 classes, got %d" % len(precision))
    s = collections.OrderedDict()
    s["precision"] = precision[-1]  # bot precision
    s["recall"] = recall[-1]  # bot recall
    s["f1"] = f1[-1]  # harmonic mean of precision and recall
    s["support"] = support[-1]  # number of true bot samples
    return s


In [111]:
# IMPORTANT PATHS
# run_classifier.py is taken from the installed botornot_study package.
module_path = os.path.dirname(botornot_study.__file__)
run_classifier_path = os.path.join(module_path, "bert", "run_classifier.py")
model_dir = "data/jobs"
checkpoint = os.path.join(model_dir, "model.ckpt-10933")
bert_model_dir = "data/multi_cased_L-12_H-768_A-12/" 

# CATCHING OBVIOUS ERRORS EARLY
# Each path must be a prefix of something on disk, otherwise RuntimeError.
for required_path in (model_dir, run_classifier_path, checkpoint, bert_model_dir):
    assert_path(required_path)




In [112]:
# Fix up the model directory: write fine_tune_conf.json into model_dir and copy
# bert_config.json / vocab.txt over from the original BERT model directory.
copy_essential_model_files(bert_model_dir, model_dir, checkpoint)


{'do_lower_case': False,
 'max_sequence_length': 128,
 'init_checkpoint': 'model.ckpt-10933'}

In [113]:
# Data has already been split into train, develop/validation, and test csv.
train_df = pandas.read_csv(os.path.join(model_dir, "train.csv"))
dev_df = pandas.read_csv(os.path.join(model_dir, "dev.csv"))
# test.csv gets overwritten by every prediction run, so the test split is
# loaded through the helper that keeps/restores a backup copy.
test_df = load_original_test_csv(model_dir)


loading original_test.csv as test_df


In [114]:
# Preview the first rows of the development split.
dev_df.head()


Unnamed: 0,file_source,split,one_hot_class,text,id
0,social_spambots_2.csv,dev,1,voy a conocerte con el riesgo de enamorarme. ...,chunk-2384586715-4
1,genuine_accounts.csv,dev,0,"holy shit how long is calum's hair, he looks ...",chunk-18273041-112
2,genuine_accounts.csv,dev,0,: “ : FAITH IN HUMANITY RESTORED URL EM...,chunk-23808818-121
3,social_spambots_2.csv,dev,1,"very cool ""Man can believe the impossible, ...",chunk-2360265066-7
4,social_spambots_2.csv,dev,1,"""The most complicated achievements of thoug...",chunk-2361375811-4


In [115]:
# Print the full text of the first five development rows, separated by blank lines.
print(*dev_df.text.iloc[:5], sep="\n\n\n")

 voy a conocerte con el riesgo de enamorarme.   "You cannot teach a man anything; you can only help him find it within himself." - Galileo   KOOL! no but we're creative and can dream up a whole mess of not the best house situation to live which i dont want to do again   "We make our own fortunes and call them fate." - Benjamin Disraeli "Every exit is an entry somewhere else." -Tom Stoppard   hi That green shit looking naked juice is so fucking great. I want to bathe in it. There are times in life when you have to distance yourself from those you love, because you love them.   NICE "Everyone needs to be valued. Everyone has the potential to give something back." - Diana Princess of Wales




    : “ : FAITH IN HUMANITY RESTORED  URL   EMOJI ️    : DOUG DILLARD Duelin' Banjo    th Century Fox BLUEGRASS LP  URL   URL     : Bluegrass Mountain Conference meet is quickly approaching...Feb.  -14. For those planning to attend:All-session... ht…    :   Gig   THIS THURS  th Jan  URL  + FREE WHIS

In [116]:
print("#### STUDYING THE CLASSIFICATIONS OF THE DEVELOPMENT DATA SET ####")
# Baseline: classify the unmodified development split (8 is the batch_size argument).
launch_test_process(dev_df, model_dir, 8, run_classifier_path)

#### STUDYING THE CLASSIFICATIONS OF THE DEVELOPMENT DATA SET ####
Saving dev_df as test set for prediction
 --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/data/jobs/model.ckpt-10933 --max_seq_length=128 --do_lower_case=False
Launching external command
bert uses a /home/jovyan/app/data/jobs/test.csv as input.
And, bert uses /home/jovyan/app/data/jobs/test_result.tsv as output.
BERT START (wait)
RUNNING THE FOLLOWING COMMAND

python /home/jovyan/botornot/src/botornot_study/bert/run_classifier.py --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/dat

In [117]:
print("#### CHECKING DEVELOPMENT SET PERFORMANCE ####")
# Attach the predicted class probabilities and persist them before any
# further columns are added.
dev_result = load_test_result(dev_df, model_dir)
dev_result.to_csv("dev_result.csv", index=False)
print("#### CHECKING RESULT ###")

# A row is predicted "bot" when its bot probability exceeds 0.50; compute the
# mask once and reuse it for the report, the score and the error flag.
is_bot_pred = dev_result["prob_social_spambots_2.csv"] > 0.50
y_true = dev_result["one_hot_class"]
msg = classification_report(y_true, is_bot_pred)
print(msg)
score_dev = score(y_true, is_bot_pred)

# Keep the wrongly classified rows around for inspection.
dev_result["misclassified"] = y_true != is_bot_pred
misclassified = dev_result[dev_result["misclassified"]]

#### CHECKING DEVELOPMENT SET PERFORMANCE ####
#### CHECKING RESULT ###
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1995
           1       0.97      0.97      0.97      2271

    accuracy                           0.97      4266
   macro avg       0.97      0.97      0.97      4266
weighted avg       0.97      0.97      0.97      4266



In [118]:
print("#### GETTING WORD COUNTS ####")
from sklearn.feature_extraction.text import CountVectorizer

counts_by_source = {}  # per file_source: DataFrame of term, raw count and proportion

# getting word counts for each file source separately
for k, v in dev_df.groupby("file_source"):
    t = v.text.values.copy()
    c = CountVectorizer(lowercase=False)  # purposely no stop-word removal here
    sparse_counts = c.fit_transform(t)
    # Sum over documents -> one total count per vocabulary term.
    counts = np.array(np.sum(sparse_counts.tocsc(), 0)).reshape(-1)
    prop = counts / np.sum(counts)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever this installation provides.
    if hasattr(c, "get_feature_names_out"):
        terms = c.get_feature_names_out()
    else:
        terms = c.get_feature_names()
    counts_by_source[k] = pandas.DataFrame.from_records(
        list(zip(terms, counts, prop)),
        columns=["term", "counts_%s" % k, k],
    )


# getting term vectors for each document that the classifier saw
counter = CountVectorizer(lowercase=False)
term_vec = counter.fit_transform(dev_df.text.values.copy())


k1 = "genuine_accounts.csv"
k2 = "social_spambots_2.csv"

# Outer merge keeps terms seen in only one source; their missing counts and
# proportions become 0.
common = counts_by_source[k1].merge(counts_by_source[k2], on="term", how="outer")
common.fillna(0, inplace=True)



#### GETTING WORD COUNTS ####


In [119]:
print("#### USING AN KL INFORMATION MEASURE FOR RELATIVE WORD FREQUENCY")
print("https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence")

# Per-term contribution p * log10(p / q) to the KL divergence between the two
# term distributions; the small constant keeps the ratio finite when a term
# is absent from one source.
smoothing = 0.00001
p_gen = common[k1]
p_bot = common[k2]
common["info_gen_bot"] = p_gen * np.log10((p_gen + smoothing) / (p_bot + smoothing))
common["info_bot_gen"] = p_bot * np.log10((p_bot + smoothing) / (p_gen + smoothing))

# Rank 0 is the term most characteristic of the respective source.  The frame
# ends up ordered by the last measure (bot preference).
for info_col, rank_col in (
    ("info_gen_bot", "rank_gen_pref"),
    ("info_bot_gen", "rank_bot_pref"),
):
    common.sort_values(info_col, ascending=False, inplace=True)
    common[rank_col] = np.arange(len(common))

#### USING AN KL INFORMATION MEASURE FOR RELATIVE WORD FREQUENCY
https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence


In [120]:
# Display the term table, currently sorted by info_bot_gen (descending).
common

Unnamed: 0,term,counts_genuine_accounts.csv,genuine_accounts.csv,counts_social_spambots_2.csv,social_spambots_2.csv,info_gen_bot,info_bot_gen,rank_gen_pref,rank_bot_pref
32407,you,3488.0,0.015232,6683.0,0.025781,-0.003479,0.005889,47884,0
22821,is,2547.0,0.011123,4745.0,0.018305,-0.002405,0.003957,47882,1
30689,to,4433.0,0.019359,7019.0,0.027077,-0.002820,0.003944,47883,2
31977,what,578.0,0.002524,1966.0,0.007584,-0.001203,0.003615,47879,3
29353,song,100.0,0.000437,829.0,0.003198,-0.000374,0.002738,47856,4
45565,vip,0.0,0.000000,328.0,0.001265,-0.000000,0.002664,38587,5
38390,TALNTS,0.0,0.000000,328.0,0.001265,-0.000000,0.002664,42198,6
16330,best,203.0,0.000887,990.0,0.003819,-0.000559,0.002408,47867,7
32000,where,166.0,0.000725,828.0,0.003194,-0.000464,0.002043,47860,8
18128,cool,95.0,0.000415,668.0,0.002577,-0.000325,0.002022,47849,9


In [121]:
print("#### EXPERIMENT #1: Checking model sensitivity to obscuring key terms.####")
# (phrase to find, substitute).  Phrases carry surrounding spaces so that only
# whole, space-delimited words are touched.
replace = [
    (" you ", " u "),
    (" song ", " track "),
    (" vip ", " VIP "),
    (" TALNTS ", " talents "),
    (" best ", " bests "),
    (" where ", " wheer "),
    (" when ", " whne "),
    (" and ", " & "),
]

exp_df = dev_df.copy()
exp_df["num_replacements"] = 0
# One pass over the frame per rule; only bot content is rewritten.
# num_replacements counts how many rules matched a row, not how many
# occurrences were substituted.
for old_phrase, new_phrase in replace:
    new_text = []
    counts = []
    for record in tqdm(exp_df.itertuples()):
        text = record.text
        hits = record.num_replacements
        if record.one_hot_class:  # bot class
            if old_phrase in text:
                hits = hits + 1
            text = text.replace(old_phrase, new_phrase)
        new_text.append(text)
        counts.append(hits)
    exp_df["text"] = new_text
    exp_df["num_replacements"] = counts



4266it [00:00, 182645.85it/s]
4266it [00:00, 231881.46it/s]
4266it [00:00, 241179.97it/s]
4266it [00:00, 279803.92it/s]
4266it [00:00, 221877.92it/s]
4266it [00:00, 223666.85it/s]

#### EXPERIMENT #1: Checking model sensitivity to obscuring key terms.####



4266it [00:00, 229316.79it/s]
4266it [00:00, 216236.25it/s]


In [122]:
# Display the modified frame, including the num_replacements column.
exp_df

Unnamed: 0,file_source,split,one_hot_class,text,id,num_replacements
0,social_spambots_2.csv,dev,1,voy a conocerte con el riesgo de enamorarme. ...,chunk-2384586715-4,4
1,genuine_accounts.csv,dev,0,"holy shit how long is calum's hair, he looks ...",chunk-18273041-112,0
2,genuine_accounts.csv,dev,0,: “ : FAITH IN HUMANITY RESTORED URL EM...,chunk-23808818-121,0
3,social_spambots_2.csv,dev,1,"very cool ""Man can believe the impossible, ...",chunk-2360265066-7,2
4,social_spambots_2.csv,dev,1,"""The most complicated achievements of thoug...",chunk-2361375811-4,2
5,genuine_accounts.csv,dev,0,Happy father's day to the men of my life! SM...,chunk-153162975-25,0
6,social_spambots_2.csv,dev,1,"""It is not the strongest of the species that ...",chunk-2368655138-5,2
7,genuine_accounts.csv,dev,0,: URL Meaning she will debut on year...,chunk-581233948-66,0
8,social_spambots_2.csv,dev,1,One important key to success is self-confiden...,chunk-2355868272-3,3
9,social_spambots_2.csv,dev,1,Certo ou errado sei que a gente se adora hmmm...,chunk-2358593826-8,3


In [123]:
print("#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####")
# Classify the experiment #1 frame (8 is the batch_size argument).
launch_test_process(exp_df, model_dir, 8, run_classifier_path)



#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####
Saving dev_df as test set for prediction
 --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/data/jobs/model.ckpt-10933 --max_seq_length=128 --do_lower_case=False
Launching external command
bert uses a /home/jovyan/app/data/jobs/test.csv as input.
And, bert uses /home/jovyan/app/data/jobs/test_result.tsv as output.
BERT START (wait)
RUNNING THE FOLLOWING COMMAND

python /home/jovyan/botornot/src/botornot_study/bert/run_classifier.py --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/da

In [124]:
# Attach the predicted class probabilities for experiment #1 and persist them.
result_1 = load_test_result(exp_df, model_dir)
result_1.to_csv("result_1.csv", index=False)

# Threshold the bot probability at 0.50 once and reuse the mask.
is_bot_pred = result_1["prob_social_spambots_2.csv"] > 0.50
y_true = result_1["one_hot_class"]
msg = classification_report(y_true, is_bot_pred)
print(msg)
score_1 = score(y_true, is_bot_pred)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1995
           1       0.97      0.96      0.97      2271

    accuracy                           0.96      4266
   macro avg       0.96      0.96      0.96      4266
weighted avg       0.96      0.96      0.96      4266



In [125]:


print("#### EXPERIMENT #2: USING URLS ####")
exp_df = dev_df.copy()
# Wrap every bot text in " URL " markers; genuine texts stay untouched.
new_text = [
    " URL " + record.text + " URL " if record.one_hot_class else record.text
    for record in tqdm(exp_df.itertuples())
]

exp_df["text"] = new_text




4266it [00:00, 327109.71it/s]

#### EXPERIMENT #2: USING URLS ####





In [126]:
# Display the frame with the URL-wrapped bot texts.
exp_df

Unnamed: 0,file_source,split,one_hot_class,text,id
0,social_spambots_2.csv,dev,1,URL voy a conocerte con el riesgo de enamora...,chunk-2384586715-4
1,genuine_accounts.csv,dev,0,"holy shit how long is calum's hair, he looks ...",chunk-18273041-112
2,genuine_accounts.csv,dev,0,: “ : FAITH IN HUMANITY RESTORED URL EM...,chunk-23808818-121
3,social_spambots_2.csv,dev,1,"URL very cool ""Man can believe the impossi...",chunk-2360265066-7
4,social_spambots_2.csv,dev,1,"URL ""The most complicated achievements of ...",chunk-2361375811-4
5,genuine_accounts.csv,dev,0,Happy father's day to the men of my life! SM...,chunk-153162975-25
6,social_spambots_2.csv,dev,1,"URL ""It is not the strongest of the species ...",chunk-2368655138-5
7,genuine_accounts.csv,dev,0,: URL Meaning she will debut on year...,chunk-581233948-66
8,social_spambots_2.csv,dev,1,URL One important key to success is self-con...,chunk-2355868272-3
9,social_spambots_2.csv,dev,1,URL Certo ou errado sei que a gente se adora...,chunk-2358593826-8


In [127]:
print("#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####")
# Classify the experiment #2 frame (8 is the batch_size argument).
launch_test_process(exp_df, model_dir, 8, run_classifier_path)


#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####
Saving dev_df as test set for prediction
 --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/data/jobs/model.ckpt-10933 --max_seq_length=128 --do_lower_case=False
Launching external command
bert uses a /home/jovyan/app/data/jobs/test.csv as input.
And, bert uses /home/jovyan/app/data/jobs/test_result.tsv as output.
BERT START (wait)
RUNNING THE FOLLOWING COMMAND

python /home/jovyan/botornot/src/botornot_study/bert/run_classifier.py --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/da

In [128]:
# Attach the predicted class probabilities for experiment #2 and persist them.
result_2 = load_test_result(exp_df, model_dir)
result_2.to_csv("result_2.csv", index=False)

# Threshold the bot probability at 0.50 once and reuse the mask.
is_bot_pred = result_2["prob_social_spambots_2.csv"] > 0.50
y_true = result_2["one_hot_class"]
msg = classification_report(y_true, is_bot_pred)
print(msg)
score_2 = score(y_true, is_bot_pred)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1995
           1       0.97      0.94      0.96      2271

    accuracy                           0.95      4266
   macro avg       0.95      0.95      0.95      4266
weighted avg       0.95      0.95      0.95      4266



In [129]:
# REMOVED: the text transformation used for this experiment has been taken out
# of this notebook.  It is fairly bad for this model.

print("#### EXPERIMENT #3: <DESCRIPTION SNIP> ####")
exp_df = dev_df.copy()
new_text = []
for record in tqdm(exp_df.itertuples()):
    t = record.text
    if record.one_hot_class:  #bot class
        t = t  # Placeholder (no-op as written): I did a transformation to t here that is motivated by how spambot #2 appeared to operate.
        # The transformation lowered recall to 78%
    new_text.append(t)

exp_df["text"] = new_text
# Print the first ten texts (unchanged while the placeholder above is a no-op),
# then display the frame.
print(*exp_df["text"].iloc[:10], sep="\n\n\n")
exp_df

4266it [00:00, 399305.98it/s]

#### EXPERIMENT #3: <DESCRIPTION SNIP> ####
 voy a conocerte con el riesgo de enamorarme.   "You cannot teach a man anything; you can only help him find it within himself." - Galileo   KOOL! no but we're creative and can dream up a whole mess of not the best house situation to live which i dont want to do again   "We make our own fortunes and call them fate." - Benjamin Disraeli "Every exit is an entry somewhere else." -Tom Stoppard   hi That green shit looking naked juice is so fucking great. I want to bathe in it. There are times in life when you have to distance yourself from those you love, because you love them.   NICE "Everyone needs to be valued. Everyone has the potential to give something back." - Diana Princess of Wales




    : “ : FAITH IN HUMANITY RESTORED  URL   EMOJI ️    : DOUG DILLARD Duelin' Banjo    th Century Fox BLUEGRASS LP  URL   URL     : Bluegrass Mountain Conference meet is quickly approaching...Feb.  -14. For those planning to attend:All-session... ht…    : 




Unnamed: 0,file_source,split,one_hot_class,text,id
0,social_spambots_2.csv,dev,1,voy a conocerte con el riesgo de enamorarme. ...,chunk-2384586715-4
1,genuine_accounts.csv,dev,0,"holy shit how long is calum's hair, he looks ...",chunk-18273041-112
2,genuine_accounts.csv,dev,0,: “ : FAITH IN HUMANITY RESTORED URL EM...,chunk-23808818-121
3,social_spambots_2.csv,dev,1,"very cool ""Man can believe the impossible, ...",chunk-2360265066-7
4,social_spambots_2.csv,dev,1,"""The most complicated achievements of thoug...",chunk-2361375811-4
5,genuine_accounts.csv,dev,0,Happy father's day to the men of my life! SM...,chunk-153162975-25
6,social_spambots_2.csv,dev,1,"""It is not the strongest of the species that ...",chunk-2368655138-5
7,genuine_accounts.csv,dev,0,: URL Meaning she will debut on year...,chunk-581233948-66
8,social_spambots_2.csv,dev,1,One important key to success is self-confiden...,chunk-2355868272-3
9,social_spambots_2.csv,dev,1,Certo ou errado sei que a gente se adora hmmm...,chunk-2358593826-8


In [130]:
print("#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####")
# Classify the experiment #3 frame (8 is the batch_size argument).
launch_test_process(exp_df, model_dir, 8, run_classifier_path)

#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####
Saving dev_df as test set for prediction
 --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/data/jobs/model.ckpt-10933 --max_seq_length=128 --do_lower_case=False
Launching external command
bert uses a /home/jovyan/app/data/jobs/test.csv as input.
And, bert uses /home/jovyan/app/data/jobs/test_result.tsv as output.
BERT START (wait)
RUNNING THE FOLLOWING COMMAND

python /home/jovyan/botornot/src/botornot_study/bert/run_classifier.py --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/da

In [131]:
# Attach the predicted class probabilities for experiment #3 and persist them.
result_3 = load_test_result(exp_df, model_dir)
result_3.to_csv("result_3.csv", index=False)

# Threshold the bot probability at 0.50 once and reuse the mask.
is_bot_pred = result_3["prob_social_spambots_2.csv"] > 0.50
y_true = result_3["one_hot_class"]
msg = classification_report(y_true, is_bot_pred)
print(msg)
score_3 = score(y_true, is_bot_pred)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1995
           1       0.97      0.97      0.97      2271

    accuracy                           0.97      4266
   macro avg       0.97      0.97      0.97      4266
weighted avg       0.97      0.97      0.97      4266



In [132]:
# REMOVED: the text transformation used for this experiment has been taken out
# of this notebook.  It is fairly bad for this model.

print("#### EXPERIMENT #4: STRINGING ALL THE BAD THINGS TOGETHER ####")
exp_df = dev_df.copy()
new_text = []
for record in tqdm(exp_df.itertuples()):
    t = record.text
    if record.one_hot_class:  #bot class
        t = t  # Placeholder (no-op as written): I did a transformation here that lowers bot recall to 38%.
        # The transformation was based on all the things above.
    new_text.append(t)

exp_df["text"] = new_text
# Print the first ten texts (unchanged while the placeholder above is a no-op),
# then display the frame.
print(*exp_df["text"].iloc[:10], sep="\n\n\n")
exp_df

4266it [00:00, 405688.72it/s]

#### EXPERIMENT #4: STRINGING ALL THE BAD THINGS TOGETHER ####
 voy a conocerte con el riesgo de enamorarme.   "You cannot teach a man anything; you can only help him find it within himself." - Galileo   KOOL! no but we're creative and can dream up a whole mess of not the best house situation to live which i dont want to do again   "We make our own fortunes and call them fate." - Benjamin Disraeli "Every exit is an entry somewhere else." -Tom Stoppard   hi That green shit looking naked juice is so fucking great. I want to bathe in it. There are times in life when you have to distance yourself from those you love, because you love them.   NICE "Everyone needs to be valued. Everyone has the potential to give something back." - Diana Princess of Wales




    : “ : FAITH IN HUMANITY RESTORED  URL   EMOJI ️    : DOUG DILLARD Duelin' Banjo    th Century Fox BLUEGRASS LP  URL   URL     : Bluegrass Mountain Conference meet is quickly approaching...Feb.  -14. For those planning to attend:All-s




Unnamed: 0,file_source,split,one_hot_class,text,id
0,social_spambots_2.csv,dev,1,voy a conocerte con el riesgo de enamorarme. ...,chunk-2384586715-4
1,genuine_accounts.csv,dev,0,"holy shit how long is calum's hair, he looks ...",chunk-18273041-112
2,genuine_accounts.csv,dev,0,: “ : FAITH IN HUMANITY RESTORED URL EM...,chunk-23808818-121
3,social_spambots_2.csv,dev,1,"very cool ""Man can believe the impossible, ...",chunk-2360265066-7
4,social_spambots_2.csv,dev,1,"""The most complicated achievements of thoug...",chunk-2361375811-4
5,genuine_accounts.csv,dev,0,Happy father's day to the men of my life! SM...,chunk-153162975-25
6,social_spambots_2.csv,dev,1,"""It is not the strongest of the species that ...",chunk-2368655138-5
7,genuine_accounts.csv,dev,0,: URL Meaning she will debut on year...,chunk-581233948-66
8,social_spambots_2.csv,dev,1,One important key to success is self-confiden...,chunk-2355868272-3
9,social_spambots_2.csv,dev,1,Certo ou errado sei que a gente se adora hmmm...,chunk-2358593826-8


In [133]:
print("#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####")
# Classify the experiment #4 frame (8 is the batch_size argument).
launch_test_process(exp_df, model_dir, 8, run_classifier_path)

#### STUDYING THE CLASSIFICATIONS OF THE EXPERIMENTAL DATA SET ####
Saving dev_df as test set for prediction
 --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/data/jobs/model.ckpt-10933 --max_seq_length=128 --do_lower_case=False
Launching external command
bert uses a /home/jovyan/app/data/jobs/test.csv as input.
And, bert uses /home/jovyan/app/data/jobs/test_result.tsv as output.
BERT START (wait)
RUNNING THE FOLLOWING COMMAND

python /home/jovyan/botornot/src/botornot_study/bert/run_classifier.py --do_predict=True --task_name=text_classifier --data_dir=/home/jovyan/app/data/jobs --output_dir=/home/jovyan/app/data/jobs --bert_config_file=/home/jovyan/app/data/jobs/bert_config.json --vocab_file=/home/jovyan/app/data/jobs/vocab.txt --init_checkpoint=/home/jovyan/app/da

In [134]:
# Attach the predicted class probabilities for experiment #4 and persist them.
result_4 = load_test_result(exp_df, model_dir)
result_4.to_csv("result_4.csv", index=False)

# Threshold the bot probability at 0.50 once and reuse the mask.
is_bot_pred = result_4["prob_social_spambots_2.csv"] > 0.50
y_true = result_4["one_hot_class"]
msg = classification_report(y_true, is_bot_pred)
print(msg)
score_4 = score(y_true, is_bot_pred)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1995
           1       0.97      0.97      0.97      2271

    accuracy                           0.97      4266
   macro avg       0.97      0.97      0.97      4266
weighted avg       0.97      0.97      0.97      4266

