In [8]:
### LOAD ###
import os
import json


# Load the cleaned hate-speech records: one JSON object per line.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() in a comprehension left it open until garbage collection).
with open("./src/data/processed/hate_cleaned.jsonl", encoding="utf-8") as hate_file:
    hate_speech = [json.loads(ln) for ln in hate_file]

In [9]:
import nltk
# Fetch the NLTK 'punkt' package (reported as already up-to-date when present).
# NOTE(review): presumably required by the tokenisation / sentence-segmentation
# helpers imported from src.utils.utils — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshua.sheppard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
### LOGGING ###
import logging
# Configure the root logger to emit INFO and above, then create the
# notebook-wide named logger used for status messages.
logging.basicConfig(
    level=logging.INFO,
)
logger = logging.getLogger(name="ARGUMENT-EXTRACTOR")

### NLP FUNCTIONS ###
from src.utils.utils import tokeniser, sentences_segment

### KEYPHRASE EXTRACTORS ###
from src.utils.keyphrase_extraction import yake_extract_keyphrase, extract_keyphrase

In [11]:
### CONNECT TO KNOWLEDGEBASE ###
from src.utils.elastic_db import ElasticDB

# Address of the Elasticsearch node backing the knowledge base.
# (The constant is called PORT but holds a full URL.)
PORT = "http://localhost:9200"

# Shared client handle used by the retriever below.
db = ElasticDB(
    elastic_port=PORT,
)

INFO:src.utils.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


In [12]:
from tqdm.notebook import tqdm
from src.detection.stance_classifier import sentence_stance, compare_stance
import time
import re

# Set TOKENIZERS_PARALLELISM=false for the Hugging Face tokenizers library.
# NOTE(review): presumably this is meant to silence the library's
# parallelism/fork warning — confirm.
import os
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Collect the "id" field of every record in the two concept files
# (one JSON object per line). Context managers make sure both file handles
# are closed once the ids have been read.
with open("./src/data/processed/argument_topic_concept.jsonl", encoding="utf-8") as topic_file:
    topic_ids = [json.loads(ln)["id"] for ln in topic_file]
with open("./src/data/processed/argument_concept.jsonl", encoding="utf-8") as concept_file:
    concept_ids = [json.loads(ln)["id"] for ln in concept_file]

def clean(phrase):
    """Replace each run of ``, . ; @ # ? ! & $`` (plus any spaces that
    directly follow it) with a single space and return the new string."""
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

def get_notion(notions_ids, notions_lst, arg_id, label):
    """Return the ``label`` field of the record aligned with ``arg_id``.

    ``arg_id`` is located in ``notions_ids`` and its position is used to index
    ``notions_lst``. The value is returned as a string, or ``None`` when it is
    falsy. Raises ``ValueError`` if ``arg_id`` is not in ``notions_ids``.
    """
    position = notions_ids.index(arg_id)
    value = notions_lst[position][label]
    if not value:
        return None
    return str(value)

### RETRIEVER ###
# Self-assignment: a no-op when `db` already exists, a NameError otherwise.
db = db
# Module-level accumulators, both starting out empty.
queries, retrieved_ev = [], []

def _analyse_sentence(adu):
    """Extract keyphrases, aspect and stance for one sentence.

    Returns ``(keyphrases, record)``: ``keyphrases`` is whatever
    ``extract_keyphrase`` produced (normalised to ``[]`` when falsy) and
    ``record`` is the per-sentence dict stored in the search output.
    """
    # A falsy result (None / empty) is normalised to an empty list so callers
    # can always iterate over it without a None check.
    keyphrases = extract_keyphrase(adu) or []

    # The first keyphrase is used as the aspect; stance is only computed
    # when there is an aspect to classify against.
    aspect = keyphrases[0] if keyphrases else None
    stance = sentence_stance(adu, aspect) if aspect else None

    record = {"sentence": adu, "selected_keyphrases": [], "stance": stance, "aspect": aspect}
    return keyphrases, record


# TODO: Commonsense Query Expansion
def search(hs_, type="counters", l=5):
    """Analyse a hate-speech / counter-speech pair and retrieve evidence.

    Args:
        hs_: mapping with the keys ``"id"``, ``"counter"`` and ``"hate"``.
        type: unused; kept so existing callers keep working.
        l: number of hits requested from ``db.search`` (passed as ``k``).

    Returns:
        dict with
        ``"id"``          - the id of the input record,
        ``"hate_speech"`` - one analysis record per hate-speech sentence,
        ``"tgt_counter"`` - one analysis record per counter-speech sentence,
        ``"retrieved"``   - per counter sentence: the retrieved ``passages``,
                            their ``source`` and cleaned YAKE keyphrases (``kp``).
    """
    id_ = hs_["id"]
    tgt = hs_["counter"]
    hs = hs_["hate"]

    counter_sents = sentences_segment(tgt)
    hate_sent = sentences_segment(hs)

    # Hate-speech side: analysis only, no retrieval.
    hate_records = [_analyse_sentence(adu)[1] for adu in hate_sent]

    # Counter-speech side: analysis plus knowledge-base retrieval per sentence.
    tgt_response = []
    retrieved = []
    for adu in counter_sents:
        keyphrases, record = _analyse_sentence(adu)

        # TODO: Common-sense Query Expansion
        # The query is the comma-joined keyphrase list; it is the empty string
        # when no keyphrase was found (previously a None result crashed here).
        query = ", ".join(keyphrases)

        hits = [
            (hit["_source"]["document"]["source"], hit["_source"]["document"]["text"])
            for hit in db.search(query_=query, k=l)
        ]
        source = [src for src, _ in hits]
        evidence = [text for _, text in hits]

        # Keyphrases of the evidence are extracted from all passages at once
        # and de-duplicated via a set (so their order is not guaranteed).
        merged = ", ".join(evidence)
        ev_kp = list(set(yake_extract_keyphrase(merged)))

        retrieved.append({"passages": evidence, "kp": [clean(i) for i in ev_kp], "source": source})
        tgt_response.append(record)

    # TODO: Implement yield without storing list
    return {
        "id": id_,
        "hate_speech": hate_records,
        "tgt_counter": tgt_response,
        "retrieved": retrieved,
    }

# Run retrieval over every record in SAMPLE, timing the whole pass
# (tic / toc hold the start and end timestamps).
tic = time.time()
SAMPLE = hate_speech

retrieved_hate = []
with tqdm(total=len(SAMPLE), position=0, leave=True) as pbar:
    for hs in SAMPLE:
        retrieved_hate += [search(hs)]
        pbar.update(1)
toc = time.time()

  0%|          | 0/8867 [00:00<?, ?it/s]

INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.068s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.028s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.023s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.079s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.045s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.054s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.064s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.124s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.024s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.053s]
INFO:elast

In [13]:
import copy
# Independent deep copy of the retrieval results: changes made to
# `retrieved_hate_` do not affect `retrieved_hate`.
retrieved_hate_ = copy.deepcopy(retrieved_hate)

In [14]:
file_name = "hate_r_"

# Persist the results as JSON Lines: one record per line, matching the
# ".jsonl" extension and the line-by-line way this notebook reads its other
# .jsonl inputs. The progress bar advances once per record written
# (it used to be created but never updated, so it stayed at 0%).
with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
    with tqdm(total=(len(retrieved_hate_))) as pbar:
        for record in retrieved_hate_:
            fout.write(json.dumps(record) + "\n")
            pbar.update()

logger.info(f"[{len(retrieved_hate_)} Data Stored as {file_name}.jsonl]")

  0%|          | 0/8867 [00:00<?, ?it/s]

INFO:ARGUMENT-EXTRACTOR:[8867 Data Stored as hate_r_.jsonl]


In [15]:
retrieved_hate_

[{'id': 'ENT1ST0001HS0033CN000021',
  'hate_speech': [{'sentence': 'according to a recent ofsted report a school in birmingham is still segregating girls and boys despite a hight court ruling in that this is unlawful.',
    'selected_keyphrases': [],
    'stance': 'CON',
    'aspect': 'birmingham segregating girls'}],
  'tgt_counter': [{'sentence': 'to be fair the ofsted report is more concerned with lack of enforcement and less about focussing on the practice of any particular faith.',
    'selected_keyphrases': [],
    'stance': 'PRO',
    'aspect': 'ofsted report concerned'}],
  'retrieved': [{'passages': ['Ofsted report (2018). The school had its latest Ofsted Report in 2018. None',
     "Ofsted Report. Ashcroft Technology Academy's latest Ofsted report, from 2015, gave a result of 'Outstanding'. Homework.",
     "During its March 2019 Ofsted inspection, the school maintained its 'GOOD' rating in all areas. The Ofsted report advises that:The full inspection report can be viewed on 

In [None]:
### SCORE COSINE SIMILARITY ###
from rank import rank_passages

# Re-rank the retrieved passages of every record, timing the whole pass.
tic = time.time()

# Results go into a deep copy so `retrieved_hate_` itself stays untouched.
rr_hate = copy.deepcopy(retrieved_hate_)

with tqdm(total=len(retrieved_hate_), position=0, leave=True) as pbar:
    for i, record in enumerate(retrieved_hate_):
        # Materialise whatever rank_passages yields for this record.
        rr_hate[i]["retrieved"] = list(rank_passages(record))
        pbar.update()

toc = time.time()

  0%|          | 0/8867 [00:00<?, ?it/s]