In [1]:
import argparse
import fnmatch
import getpass
import glob
#!pip install html5lib
import html5lib
import json
import nltk
import os
import pandas as pd
import numpy as np
import re
import requests
import sys

from argparse import RawTextHelpFormatter
from bs4 import BeautifulSoup
from itertools import islice, chain
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics.scores import precision, recall
from nltk import word_tokenize, bigrams
import pickle
import random
import re
import string

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JonH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# This notebook is for processing human trafficking news article annotations from 
# https://www.tagtog.com/jhudlow/HT_News/pool. 
#Full project guidelines can be found here: https://www.tagtog.com/jhudlow/HT_News/-settings. 

class Annotation(object):
    """This is a class for annotations by individuals or 'master' versions."""
    def __init__(self, relevant, curator):
        self.relevant = relevant
        self.curator = curator

        
class Article(object):
    """This is a class for news articles."""
    objects = []
    
    def __init__(self, uid, url, text):
        self.uid = uid
        self.url = url
        self.text = text
        self.ann1 = None
        self.ann2 = None
        self.master_ann = None
        self.__class__.objects.append(self)
        
    @classmethod
    def merge_to_master(cls, print_con_url=False):
        """Update 'master' versions of 'relevant' to the values entered by ann1 and ann2 (first and 
        second annotations) when they are in agreement. When they are not, update to 'disagree'. 
        Optionally print article url for conflicting annotations."""
        twice_ann = 0 # number annotated twice
        once_ann = 0 # number annotated once
        con_ann = 0 # number of conflicting annotations
        for obj in cls.objects:
            if obj.ann2 != None and obj.ann1 != None:
                twice_ann += 1
                if obj.master_ann == None:
                    try:
                        if obj.ann1.relevant == obj.ann2.relevant:
                            master_relevant = obj.ann2.relevant
                        else:
                            master_relevant = 'disagree'
                            con_ann += 1
                            if print_con_url==True:
                                print(obj.url)
                        obj.master_ann = Annotation(master_relevant, 'master')
                    except:
                        print('Error getting annotations for {}'.format(str(obj.uid)))
            elif obj.ann1 != None or obj.ann2 != None:
                once_ann += 1
        print("Number of articles annotated only once: {} \n".format(once_ann) + \
             "Number of articles annotated twice: {} \n".format(twice_ann) + \
             "Number of remaining conflicts between 1st and 2nd annotations: {}".format(con_ann))
    
    @classmethod
    def get_article_relevance_conflicts(cls):
        num_rel_conflicts = 0
        for obj in cls.objects:
            try:
                if obj.master_ann.relevant =='disagree':
                    num_rel_conflicts += 1
                    if obj.ann1.relevant=='old' or obj.ann2.relevan=='old':
                        
                        print("Rel " + str(num_rel_conflicts) \
                              + ": https://tagtog.net/jhudlow/HT_News/-search/members_anncomplete:" \
                              + obj.ann2.curator + "/" + obj.uid)
            except:
                pass
        print("Number of relevant conflicts:" + str(num_rel_conflicts))
        
    @classmethod
    def get_confirmed_rel(cls):
        num_confirmed_rel = 0
        confirmed_rel = []
        for obj in cls.objects:
            try:
                if obj.master_ann.relevant !='disagree':
                    num_confirmed_rel += 1
                    confirmed_rel.append(obj.uid)
            except:
                pass
        print("Number of confirmed relevant articles:" + str(num_confirmed_rel))
        return confirmed_rel
    
    @classmethod
    def get_rel_confirmed(cls):
        num_rel_confirmed = 0
        rel_confirmed = []
        for obj in cls.objects:
            try:
                if obj.master_ann.relevant != 'disagree':
                    num_rel_confirmed += 1
                    rel_confirmed.append(obj.uid)
            except:
                pass
        print("Number of confirmed relevant articles: " + str(num_rel_confirmed))
        return rel_confirmed
    
    @classmethod
    def get_total_rel_cat(cls, cat, include):
        """Gets a list of all article uids for a given relevance category which are labeled 
        as such by either or both annotations.
        
        Args:
        cat: string corresponding to the relevance category.
        include: 'both' - returns articles where both annotators are in agreement about this category.
            'either' - returns articles where one or both annotators are in agreement.
        """
        rel_cat_num = 0
        total_rel_cat = []
        for obj in cls.objects:
            try:
                if include == 'both':
                    if obj.master_ann.relevant == cat:
                        rel_cat_num += 1
                        total_rel_cat.append(obj.uid)
                elif include == 'either':
                    if (obj.ann1.relevant == cat) | (obj.ann2.relevant == cat):
                        rel_cat_num += 1
                        total_rel_cat.append(obj.uid)
                else:
                    print('incorrect argument for "include" parameter')
            except:
                pass
            
        print("Number of {} articles:".format(cat) + str(rel_cat_num))
        return total_rel_cat
    

def get_article_dict(path_to_tagtog_folder):
    """Create 'Article' class instances for all articles that have html files and return them
    in a dictonary where each key is the article's uid."""
    
    path_to_html = path_to_tagtog_folder + r"\HT_News\plain.html\pool"
    html_files = [pos_html for pos_html in os.listdir(path_to_html) if pos_html.endswith('.html')]
    
    a_dict = {}

    for i in range(len(html_files)):
        f = open(path_to_html + str("\\") + html_files[i], encoding="utf8")
        parsed_html = BeautifulSoup(f)
        
        # Get URL
        if "http" in parsed_html.body.find('pre', attrs={'id':"s1s2v1"}).text:
            url = parsed_html.body.find('pre', attrs={'id':"s1s2v1"}).text
        elif "http" in parsed_html.body.find('pre', attrs={'id':"s1s5v1"}).text:
            url = parsed_html.body.find('pre', attrs={'id':"s1s5v1"}).text
        else:
            print("Error: URL not found for file {}".format(html_files[i]))

        # Get Article Text
        if str(parsed_html(text=re.compile(r'article'))[0].parent)[1:3]== 'h2':
            str_id = str(parsed_html(text=re.compile(r'article'))[0].parent)[8:12]+str('v1')
        elif str(parsed_html(text=re.compile(r'article'))[1].parent)[1:3]== 'h2':
            str_id = str(parsed_html(text=re.compile(r'article'))[1].parent)[8:12]+str('v1')
        else:
            print("Error: Text not found for file {}".format(html_files[i]))
        article_text = str(parsed_html.body.find(
            'pre', attrs={'id':str_id}))[17:].replace('\n', ' ').replace('</pre>', '')
    
        a_dict[html_files[i][:-11]] = Article(html_files[i][:-11], url, article_text)
    
    return a_dict


def update_doc_master_anns(path_to_tagtog_folder, a_dict):
    """Get the latest document level annotations from all members and add them to Article class
    instances in a_dict."""
    path_to_masters = path_to_tagtog_folder + r"\HT_News\ann.json\master\pool"
    json_master_files = [pos_json for pos_json in os.listdir(path_to_masters) if pos_json.endswith('.json')]
    for j in range(len(json_master_files)):
        f = open(path_to_masters + str("\\") + json_master_files[j])
        data = json.load(f)
        try:
            relevant = data.get('metas')['m_10']['value']
        except:
            relevant = 'incomplete'
        uid = json_master_files[j][:-9]
        if a_dict[uid].master_ann == None:
            a_dict[uid].master_ann = Annotation(relevant, 'master')
        else:
            print('Already have master ann for {}'.format(uid))

                
def update_doc_annotations(path_to_tagtog_folder, a_dict, print_incomplete_ann=True):
    """Get the latest document level annotations from all members and add them to Article class
    instances in a_dict."""
    path_to_mem_folders = path_to_tagtog_folder + r"\HT_News\ann.json\members"
    mem_folders = os.listdir(path_to_mem_folders)
    
    json_files = []
    for i in range(len(mem_folders)):
        path_to_json = path_to_tagtog_folder + r"\HT_News\ann.json\members" + str("\\") \
        + mem_folders[i] + str("\\pool")
        json_mem_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
        for j in range(len(json_mem_files)):
            f = open(path_to_json + str("\\") + json_mem_files[j])
            data = json.load(f)
            if data.get('anncomplete') == True:
                if "m_12" not in data.get('metas'):
                    try:
                        relevant = data.get('metas')['m_10']['value']
                        curator = str(data.get('metas')['m_10']['confidence']['who'])[7:-2]
                    except:
                        relevant = 'incomplete'
                else:
                    relevant = 'old'
                    curator = 'n/a'
                uid = json_mem_files[j][:-9]
                if a_dict[uid].ann1 == None:
                    a_dict[uid].ann1 = Annotation(relevant, curator)
                else:
                    a_dict[uid].ann2 = Annotation(relevant, curator)
            else:
                if print_incomplete_ann:
                    print('ann incomplete for {}: {}'.format(mem_folders[i], json_mem_files[j]))

In [21]:
# After downloading article annotations from Tagtog, extract relevant info:

tagtog_path = os.getcwd() + "\\tagtog_HT_News_4_1_22"
a_dict = get_article_dict(tagtog_path)
update_doc_annotations(tagtog_path, a_dict)
update_doc_master_anns(tagtog_path, a_dict)
#Article.merge_to_master()

In [4]:
rel_confirmed = list(a_dict.keys())
#Article.get_rel_confirmed()
op_relevant = []
not_relevant = []
rel_dict = {}

for i in range(len(list(a_dict.keys()))):
    try:
        if a_dict[rel_confirmed[i]].master_ann.relevant=='op_relevant':
            rel_dict[rel_confirmed[i]] = 'op_relevant'
            op_relevant.append(rel_confirmed[i])
        elif a_dict[rel_confirmed[i]].master_ann.relevant=='not_relevant':
            rel_dict[rel_confirmed[i]] = 'not_relevant'
            not_relevant.append(rel_confirmed[i])
    except:
        pass
print("\nop_relevant: {}, not_relevant: {}".format(str(len(op_relevant)), str(len(not_relevant))))


op_relevant: 57, not_relevant: 49


In [5]:
train_test = [([i], 'not_relevant') for i in not_relevant] + [([i], 'op_relevant') for i in op_relevant]
train, test = train_test_split(train_test, test_size=0.2, random_state=1)

In [6]:
#View text of a particular article:
#a_dict[train[2][0][0]].text

In [7]:
corp = []
for i in range(len(train_test)):
    corp += word_tokenize(re.sub(r'\d+', '', a_dict[train_test[i][0][0]].text))

all_words = nltk.FreqDist(corp)

stopWords = set(stopwords.words('english'))
words = [',', "'", 'of', 'the', '.', "'\\r\\n", 'and', 'in', 'a']

wordsFiltered = []

for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)

new_words = [word for word in all_words if word.isalnum()]
filtered_words = [w for w in new_words if not w.lower() in stopWords]
filtered_words = nltk.FreqDist(filtered_words)
len(filtered_words)

word_features = list(filtered_words)[:1000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(word_tokenize(re.sub(r'\d+', '', a_dict[arti[0]].text))), category) for (arti, category) in train_test]
print('Featuresets: {}'.format(str(len(featuresets))))

Featuresets: 106


In [9]:
train, test = train_test_split(featuresets, test_size=0.4, random_state=4)

classifier = nltk.NaiveBayesClassifier.train(train)

print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test))*100)
classifier.show_most_informative_features(100)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test))*100)

Naive Bayes Algo accuracy percent: 69.76744186046511
Most Informative Features
                   woman = True           op_rel : not_re =      7.3 : 1.0
                   given = True           not_re : op_rel =      6.6 : 1.0
                     job = True           op_rel : not_re =      6.3 : 1.0
                   media = True           op_rel : not_re =      6.3 : 1.0
                 calling = True           not_re : op_rel =      5.7 : 1.0
              department = True           not_re : op_rel =      5.7 : 1.0
                 include = True           not_re : op_rel =      5.7 : 1.0
                  within = True           not_re : op_rel =      5.7 : 1.0
                  arrest = True           op_rel : not_re =      5.3 : 1.0
                    deal = True           not_re : op_rel =      4.8 : 1.0
               developed = True           not_re : op_rel =      4.8 : 1.0
                  impact = True           not_re : op_rel =      4.8 : 1.0
                incre

In [10]:
def find_ngrams(document, ngram_features):
    """
    Return a dictionary with boolean values indicating whether each ngram_feature is present in a document.
    """
    ngrams = set(document)
    found_ngrams = {}
    for g in ngram_features:
            found_ngrams[g] = (g in ngrams)
    return found_ngrams

In [12]:
# Get Training Set
tagtog_path = os.getcwd() + "\\tagtog_HT_News_12_21_21"
path_to_html = tagtog_path + r"\HT_News\plain.html\pool"
html_files = [pos_html for pos_html in os.listdir(path_to_html) if pos_html.endswith('.html')]

prev_a_dict = get_article_dict(tagtog_path)
update_doc_annotations(tagtog_path, prev_a_dict, print_incomplete_ann=False)
update_doc_master_anns(tagtog_path, prev_a_dict)

Article.merge_to_master()

def check_master_agree(k, dict1):
    try:
        if dict1[k].master_ann.relevant != 'disagree':
            return(k)
    except:
        pass

train_keys = [check_master_agree(k, prev_a_dict) for k in prev_a_dict.keys() \
              if check_master_agree(k, prev_a_dict)!=None]
train_dict = {k:check_master_agree(v, prev_a_dict) for (k,v) in prev_a_dict.items()}
#print(len(prev_a_dict))
#print(len(train_keys))

Number of articles annotated only once: 1019 
Number of articles annotated twice: 1119 
Number of remaining conflicts between 1st and 2nd annotations: 198


In [13]:
keyword_unigrams = [
    'slavery',
    'trafficking',
    'trafficked',
    'humantrafficking',
    'childtrafficking',
    'antitrafficking',
]

keyword_bigrams = [
('sex', 'racket'),
('child', 'labour'),
('child', 'labor'),
('child', 'prostitute'),
('teen', 'prostitute'),
('teenage', 'prostitute'),
]

def get_ann_articles(dict1):
    ann_articles = []

    for k in dict1.keys():
        try:
            if dict1[k].master_ann.curator:
                a_text = str(dict1[k].text + re.sub(r"[-!?',;./]", ' ', dict1[k].url)).translate(
                str.maketrans('-', ' ', string.punctuation)).lower()
                ann_articles.append((word_tokenize(a_text), 
                                     list(nltk.bigrams(a_text.split())),
                                     dict1[k].ann1.relevant,
                                     dict1[k].uid))
        except:
            pass
    return ann_articles

ann_articles = get_ann_articles(train_dict)

train_test_keys = [check_master_agree(k, a_dict) for k in a_dict.keys() if check_master_agree(k, a_dict)!=None]
test_dict = {k:v for (k,v) in a_dict.items() if (k in train_test_keys) and (k not in train_keys)}

test_ann_articles = get_ann_articles(test_dict)


print(len(ann_articles))
print(len(test_ann_articles))

0
389


In [14]:
featuresets = [(find_ngrams(arti_w, keyword_unigrams), 
                find_ngrams(arti_bi, keyword_bigrams), cat, uid) \
               for (arti_w, arti_bi, cat, uid) in test_ann_articles]

def get_rel_pred_accuracy(feat_sets):
    rel_dict = []
    for i in range(len(feat_sets)):
        rel_dict.append(
            {
                'Predicted': True in feat_sets[i][0].values() or True in feat_sets[i][1].values(),
                'Actual': feat_sets[i][2] == 'op_relevant',
                'UID':  feat_sets[i][3]
            }
        )

    rel_df = pd.DataFrame(rel_dict)

    rel_df['match'] = np.where(rel_df.Predicted == rel_df.Actual, 1, 0)
    print("Accuracy: " + "{:.2%}".format(sum(rel_df.match)/len(rel_df)))

    rel_df['false_negative'] = np.where((rel_df.Predicted == 0) & (rel_df.Actual == 1), 1, 0)
    FN = sum(rel_df.false_negative)/len(rel_df)
    print('False Negative Rate: ' + str("{:.2%}".format(FN)))

    rel_df['false_positive'] = np.where((rel_df.Predicted == 1) & (rel_df.Actual == 0), 1, 0)
    FP = sum(rel_df.false_positive)/len(rel_df)
    print('False Positive Rate: ' + str("{:.2%}".format(FP)))

    rel_df['true_positive'] = np.where((rel_df.Predicted == 1) & (rel_df.Actual == 1), 1, 0)
    TP = sum(rel_df.true_positive)/len(rel_df)
    print('True Positive Rate: ' + str("{:.2%}".format(TP)))

    rel_df['true_negative'] = np.where((rel_df.Predicted == 0) & (rel_df.Actual == 0), 1, 0)
    TN = sum(rel_df.true_negative)/len(rel_df)
    print('True Negative Rate: ' + str("{:.2%}".format(TN)))

    print('\nPrecision: ' + str("{:.2%}".format(TP/(TP + FP))))
    print('Recall: ' + str("{:.2%}".format(TP/(TP + FN))))
    return rel_df

rel_df = get_rel_pred_accuracy(featuresets)
rel_df

Accuracy: 33.16%
False Negative Rate: 0.26%
False Positive Rate: 66.58%
True Positive Rate: 16.71%
True Negative Rate: 16.45%

Precision: 20.06%
Recall: 98.48%


Unnamed: 0,Predicted,Actual,UID,match,false_negative,false_positive,true_positive,true_negative
0,True,False,a.g3znPbHy4w5uaRK4GzP_a3FdiS-Bibek_File2_filte...,0,0,1,0,0
1,True,False,a.TxAsEN4lLuKIOLjfb_myJyEAvC-Bibek_File2_filte...,0,0,1,0,0
2,True,False,a.YNixBS7dNz0BBslac2KuDy2MES-Bibek_File2_filte...,0,0,1,0,0
3,True,False,a0gW5H30w2xOQCppoOd6nDCWArQ4-Bibek_File2_filte...,0,0,1,0,0
4,True,False,a1EjA3QRD.d4F.xM8jzL0zAxAB.W-jan22_remaining2....,0,0,1,0,0
...,...,...,...,...,...,...,...,...
384,True,False,a_KXREyBtsUAgYQibndGV9f6HxUK-Bibek_File2_filte...,0,0,1,0,0
385,True,False,a_n1jHtG0DbynaxjOTiHH.na3cB0-Bibek_File2_filte...,0,0,1,0,0
386,True,True,a_NRV8U.eMMZw78YYtf4HY3Ya35C-Nov17_export50.cs...,1,0,0,1,0
387,True,True,a_RcQvn9aolbVjt4VCTU4w5yha_S-jan22_remaining2....,1,0,0,1,0


In [18]:
# Testing approach on articles that have only been annotated once so far:
test_ann_articles = []

for k in a_dict.keys():
    try:
        if a_dict[k].master_ann == None:
            a_text = str(a_dict[k].text + re.sub(r"[-!?',;./]", ' ', a_dict[k].url)).translate(
                str.maketrans('-', ' ', string.punctuation)).lower()
            test_ann_articles.append((word_tokenize(a_text), 
                                 list(nltk.bigrams(a_text.split())),
                                 a_dict[k].ann1.relevant,
                                 a_dict[k].uid))
    except:
        pass

print("Test set size: " + str(len(test_ann_articles)) + "\n")

test_featuresets = [(find_ngrams(arti_w, keyword_unigrams), 
                find_ngrams(arti_bi, keyword_bigrams), cat, uid) \
               for (arti_w, arti_bi, cat, uid) in test_ann_articles]

get_rel_pred_accuracy(test_featuresets)

Test set size: 503

Accuracy: 28.63%
False Negative Rate: 0.00%
False Positive Rate: 71.37%
True Positive Rate: 6.76%
True Negative Rate: 21.87%

Precision: 8.65%
Recall: 100.00%


Unnamed: 0,Predicted,Actual,UID,match,false_negative,false_positive,true_positive,true_negative
0,True,False,a..nn9ofRVdv6uc3IXRsxrCL69CS-12_2_21export50.c...,0,0,1,0,0
1,True,False,a.4jJuOgYmhR8ONnY0S58WPSNJ80-12_9_21export50.c...,0,0,1,0,0
2,True,False,a.ENWjpr1JBeYW793OGqMRVTS.Um-12_9_21export50.c...,0,0,1,0,0
3,True,False,a.q.REJp0pMHBTrhtca5n5Wj6t_y-12_9_21export50.c...,0,0,1,0,0
4,True,False,a.YZ0P.Wc56c6NJRPvIYFcRhOB6C-12_6_21export50.c...,0,0,1,0,0
...,...,...,...,...,...,...,...,...
498,True,False,a_nvtl58I_kOi3qHcOMmuhW3xxa8-12_9_21export50.c...,0,0,1,0,0
499,False,False,a_O9.pZoBZpgN8cXG71wTqteTfpS-1st_batch_wiki_re...,1,0,0,0,1
500,True,False,a_TlzpXaspTo93clUwqcM1UA3nWq-12_2_21export50.c...,0,0,1,0,0
501,True,False,a_vhO3zBGWhiUm0481KyndPVOS9a-12_9_21export50.c...,0,0,1,0,0
