# Training & Testing CRF model on abusive language dataset

Referenced:
https://rajmak.wordpress.com/tag/crfsuite/


In [1]:
from data.preprocess import load_preprocessed_data
from data.tokenizer import tokenize_with_dictionary
import nltk

# Fetch the NLTK perceptron tagger resource before nltk.pos_tag is called in a later cell.
nltk.download("averaged_perceptron_tagger")

# Load both binary datasets (sexism first, then racism); later cells read
# their "x_train" / "y_train" / "x_valid" / "y_valid" entries.
sexism_binary, racism_binary = (
    load_preprocessed_data("sexism_binary"),
    load_preprocessed_data("racism_binary"),
)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /homes/jhpark/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
preprocessed file already exists in /home/homes/jhpark/hate-speech/data/preprocessed/
preprocessed file already exists in /home/homes/jhpark/hate-speech/data/preprocessed/


In [11]:
def preprocess_for_crf(tweets, labels, file_name):
    """Write POS-tagged tweets to ``<file_name>.tsv`` for CRFsuite.

    Each tweet becomes one sequence: one ``token<TAB>pos_tag<TAB>label`` line
    per token, followed by a blank line that terminates the sequence. Only the
    last token of a tweet carries the tweet-level label; every other token is
    labelled ".".

    Tweets that yield no tokens are skipped (there is no last token to attach
    the label to) and the number of skipped tweets is printed. Labels stay
    aligned with their tweets by original index.

    Args:
        tweets: iterable of raw tweet strings, passed to
            ``tokenize_with_dictionary`` and then ``nltk.pos_tag``.
        labels: indexable sequence aligned with ``tweets``; 0 is written as
            "neg", any other value as "pos".
        file_name: output path without extension; ".tsv" is appended.

    Raises:
        IndexError: if ``labels`` has fewer entries than ``tweets``.
    """
    # tokenize tweet and add pos tag
    x_data = [nltk.pos_tag(tokenize_with_dictionary(tweet)) for tweet in tweets]

    # change label [0,1] to ["neg" ,"pos"]
    labels = ["neg" if y == 0 else "pos" for y in labels]

    skipped = 0
    # Explicit utf-8 so non-ASCII tweet text does not depend on the locale default.
    with open("%s.tsv" % file_name, "w", encoding="utf-8") as f:
        for i, tagged in enumerate(x_data):
            if len(tagged) == 0:
                # Nothing to write: an empty sequence has no token to label.
                skipped += 1
                continue
            for entry in tagged[:-1]:
                f.write("%s\t%s\t%s\n" % (entry[0], entry[1], "."))
            # label the last token of the sentence; the extra newline ends the sequence
            last = tagged[-1]
            f.write("%s\t%s\t%s\n\n" % (last[0], last[1], labels[i]))
    if skipped:
        print("Skipped %d tweet(s) that produced no tokens" % skipped)
    print("Data Written into %s for CRFsuite" % (file_name + ".tsv"))

In [4]:
# Export the sexism training split to sexism_train.tsv.
preprocess_for_crf(
    tweets=sexism_binary["x_train"],
    labels=sexism_binary["y_train"],
    file_name="sexism_train",
)

[[('we', 'PRP'), ('can', 'MD'), ('see', 'VB'), ('that', 'IN'), ('by', 'IN'), ('the', 'DT'), ('dae', 'NN'), ('sh', 'NN'), ('fleeing', 'VBG'), ('ko', 'JJ'), ('bane', 'NN'), ('and', 'CC'), ('the', 'DT'), ('mosul', 'NN'), ('e', 'NN'), ('countryside', 'NN'), ('.', '.')], [('people', 'NNS'), ('say', 'VBP'), ('im', 'VB'), ('a', 'DT'), ('public', 'JJ'), ('figure', 'NN'), ('but', 'CC'), ('ewww', 'JJ'), ('twitter', 'NN'), ("hasn't", 'NN'), ('verified', 'VBD'), ('me', 'PRP'), ('yet', 'RB'), ('so', 'RB'), ('all', 'PDT'), ('these', 'DT'), ('nasty', 'JJ'), ('plebs', 'NN'), ('think', 'VBP'), ('they', 'PRP'), ('are', 'VBP'), ('on', 'IN'), ('equal', 'JJ'), ('ground', 'NN'), ('to', 'TO'), ('me', 'PRP'), ('.', '.')], [('actually', 'RB'), ('compared', 'VBN'), ('to', 'TO'), ('muslim', 'VB'), ('slaughter', 'NN'), ('of', 'IN'), ('muslims', 'NNS'), ('christians', 'NNS'), ('v', 'VBP'), ('few', 'JJ'), ('.', '.'), ('far', 'RB'), ('more', 'RBR'), ('killed', 'JJ'), ('eg', 'NN'), ('in', 'IN'), ('syria', 'NN'), ('ht

In [3]:
# Export the sexism validation split to sexism_valid.tsv.
preprocess_for_crf(
    tweets=sexism_binary["x_valid"],
    labels=sexism_binary["y_valid"],
    file_name="sexism_valid",
)

[[('mkr', 'NN'), ('chicken', 'NN'), ('liver', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('baaad', 'NN'), ('choice', 'NN')], [('apple', 'NN'), ('did', 'VBD'), ('hire', 'VB'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('foss', 'NN'), ('devs', 'NN'), ('.', '.'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('original', 'JJ'), ('creators', 'NNS'), ('of', 'IN'), ('freebsd', 'NN'), ('became', 'VBD'), ('a', 'DT'), ('director', 'NN'), ('at', 'IN'), ('apple', 'NN'), ('.', '.')], [('i', 'NN'), ('think', 'VBP'), ('the', 'DT'), ('biggest', 'JJS'), ('problem', 'NN'), ('i', 'NN'), ('have', 'VBP'), ('with', 'IN'), ('it', 'PRP'), ('is', 'VBZ'), ('that', 'IN'), ("there's", 'VB'), ('too', 'RB'), ('much', 'JJ'), ('of', 'IN'), ('a', 'DT'), ('reliance', 'NN'), ('on', 'IN'), ('wow', 'NN'), ('services', 'NNS'), ('for', 'IN'), ('an', 'DT'), ('external', 'JJ'), ('mechanism', 'NN'), ('.', '.')], [('well', 'RB'), ('damn', 'RB'), ('.', '.'), ('marking', 'VBG'), ('you', 'PRP'), ('off', 'IN'), ('my', 'PRP$'), ('potential',

In [12]:
# Export the racism training split to racism_train.tsv.
preprocess_for_crf(
    tweets=racism_binary["x_train"],
    labels=racism_binary["y_train"],
    file_name="racism_train",
)

Data Written into racism_train.tsv for CRFsuite


In [13]:
# Export the racism validation split to racism_valid.tsv.
preprocess_for_crf(
    tweets=racism_binary["x_valid"],
    labels=racism_binary["y_valid"],
    file_name="racism_valid",
)

Data Written into racism_valid.tsv for CRFsuite


After cloning the CRFSuite repository (https://github.com/chokkan/crfsuite), use its bundled script `example/chunking.py` to convert each .tsv file into a format CRFSuite can read, by executing the following commands. (Adjust the file paths to match your own checkout.)

cat "sexism_train.tsv" | python /homes/jhpark/crfsuite/example/chunking.py -s $'\t' > sexism_train.txt 
cat "sexism_valid.tsv" | python /homes/jhpark/crfsuite/example/chunking.py -s $'\t' > sexism_valid.txt 

cat "racism_train.tsv" | python /homes/jhpark/crfsuite/example/chunking.py -s $'\t' > racism_train.txt 
cat "racism_valid.tsv" | python /homes/jhpark/crfsuite/example/chunking.py -s $'\t' > racism_valid.txt 



## running the CRFSuite

Download the CRFSuite binary from (http://www.chokkan.org/software/crfsuite/) and train & test the model


### training
To set different training hyperparameters, see the manual: http://www.chokkan.org/software/crfsuite/manual.html

crfsuite-0.12/bin/crfsuite learn -m sexism_binary sexism_train.txt

crfsuite-0.12/bin/crfsuite learn -m racism_binary racism_train.txt

### testing
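The commands below print the evaluation to the terminal; run them under `nohup` (or redirect stdout) to save the output of crfsuite to a file. The `-qt` flags make `tag` report performance against the labels in the data without printing the per-token tagging results (see the manual).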

crfsuite-0.12/bin/crfsuite tag -qt -m sexism_binary sexism_valid.txt

crfsuite-0.12/bin/crfsuite tag -qt -m racism_binary racism_valid.txt