In [97]:
# TODOs: Operate DB Class
# TODOs: Use SQLite
# TODOs: Implement BM25

import sys
from pathlib import Path
sys.path[0] = str(Path(sys.path[0]).parent)

from utils.elastic_db import ElasticDB

# INIT DB OBJECT
PORT = "http://localhost:9200"
INDEX_NAME = "news_cc"

news_db = ElasticDB(elastic_port=PORT, elastic_index=INDEX_NAME)
wiki_db = ElasticDB(elastic_port=PORT, elastic_index="knowledge")

INFO:utils.elastic_db:Connecting to http://localhost:9200 
INFO:utils.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 
INFO:utils.elastic_db:Connecting to http://localhost:9200 
INFO:utils.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


In [98]:
### LOAD DATASETS ###
import json

data = [json.loads(ln) for ln in open("../../data/train_cmv_cleaned.jsonl")]
topics = [json.loads(ln) for ln in open("../../data/claim_topics.jsonl")]

ex_retrieval = [json.loads(ln) for ln in open("../../data/wiki_doc_retrieved_from_op_train.jsonlist")]
ex_ranked = [json.loads(ln) for ln in open("../../data/selected_evidence.jsonl")]

conan = [json.loads(ln) for ln in open("../../data/CONAN.json")]

In [99]:
len(data)

3456

In [100]:
### SAMPLE OUTPUT ###
import random

# TODOs: Replicate Output, Passage Evidence Retrieval
# TODOs: Replicate Output, Passage Ranking

_ = random.randint(0, 1000)
ranked = ex_ranked[_]
retireval = ex_retrieval[_]

retireval

{'tid': 't3_3ts8rc',
 'retrieval_results': [{'retrieved_document_titles': ['ANTIC',
    'Chothe Naga',
    'Middle-earth wars and battles',
    'Nikita Khrushchev',
    'H. H. Asquith'],
   'query': 'the utter downfall some russian colleagues assistance lazy',
   'sentence': 'Some Russian colleagues of mine feel very strongly that ANY amount of assistance from the government results in people not wanting to work , everyone becoming very lazy , and the utter downfall of society . '},
  {'retrieved_document_titles': ['Freeloaders (film)',
    'Freeloaders (band)',
    'Common Development and Distribution License',
    'Reactions to Occupy Wall Street',
    'The Philisteins'],
   'query': 'those few freeloaders the system providing',
   'sentence': "Yes , some people may take advantage of the system , but those few freeloaders should n't prevent us from providing help to other members of society . "}]}

In [101]:
# TODOs: News Data
# TODOs: Ranking
# TODOs: Ranking, Cosine
# TODOs: Research Evidence Retireval: Context Aware, Neural Retrieval
# TODOs: Stance
# TODOs: Target ADUs: Premises, Claims, discard non-ADUs, thus reducing noise over retreival 

# TODOs: Paralellise

In [103]:
### SUBJECT ARG ###
import random
import re

_ = random.randint(0, 1000)
print(_)

claim = data[_]["titles"]
arg = data[_]["arguments"]
claim, arg

646


('That American society is getting better not worse',
 'Many people seem to believe that society is degenerating and that society in general is getting worse. I disagree with this pessimistic view. Since the mids crime has fallen by more than in America. The quality of life is increasing technology is improving and crime is falling. This idea that society is degenerating seems like a myth spurred on by nostalgia.Are there any validity in these claims? !Further Note some people are afraid that technology will and has led to a degeneration of values. Others also believe that technology is making people dumber. I disagree with this as well.')

In [116]:
from utils.keyphrase_extraction import extract_keyphrase
# from keyphrase_vectorizers import KeyphraseCountVectorizer

# def to_sentences(text):
#     sents = [i for i in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)]
#     return sents


# TODOs: Fix Sentence parsing
# TODOs: Parameterise Index-DB in use
#sentences = [i for i in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', arg)]
sentences = [i for i in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', arg)]

queries = []
results = []

for sent in sentences:
    toks = re.findall(r"\w+(?:'\w+)?|[^\w\s]", sent)
    # print(sent)
    # print(len(sent))

    if len(toks) <= 8:
        continue

    print(sent)    
    kp = extract_keyphrase(sent, n_kp=3)
    query = ", ".join(i for i in kp)
    
    # TODOs: Re-init DB with smaller passage size
    titles = [i["_source"]["document"]["title"] for i in news_db.search(query_=query, k=10)]
    evidence = [i["_source"]["document"]["text"] for i in news_db.search(query_=query, k=10)]

    results.append({
        "argument_sentence": sent, 
        "retrieved_documents_titles": titles,
        "query": query,
        "retrieved_evidence": evidence
    })

Many people seem to believe that society is degenerating and that society in general is getting worse.


INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.065s]
INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.009s]


Since the mids crime has fallen by more than in America.


INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.070s]
INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.005s]


The quality of life is increasing technology is improving and crime is falling.


INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.086s]
INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.009s]


This idea that society is degenerating seems like a myth spurred on by nostalgia.Are there any validity in these claims? !Further Note some people are afraid that technology will and has led to a degeneration of values.


INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.059s]
INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.004s]


Others also believe that technology is making people dumber.


INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.054s]
INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.009s]


In [119]:
### MERGE ###

# OUT: A list of counter-evidence articles, merged as a list of sentences, per input argument sentence (ADU)

merged = []
for counter_ev in results:
    sentence = counter_ev["argument_sentence"]
    evidence = counter_ev["retrieved_evidence"]
    sent_kp = counter_ev["query"]

    merged_ev = ", ".join(retrieved for retrieved in evidence)

    merged.append((sentence, sent_kp, merged_ev))

#merged

In [120]:
### PASSAGE RANKING : KEYWORD OVERLAP ###

test = merged[0]
test
sent, kp, merge = test

sent, sent_kp

('Many people seem to believe that society is degenerating and that society in general is getting worse.',
 'technology, others, people')

In [137]:
import spacy
from spacy.matcher import PhraseMatcher
#from negspacy.negation import Negex
nlp = spacy.load("en_core_web_sm")
#nlp.add_pipe("negex", config={"ent_types":["NOUN", "PERSON","ORG"]})

# TODOs: https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf 
# TODOs: Pattern based Negation
# TODOs: Semantic Orientation of an opinion (Claim)
# TODOs:Group synonyms of 'features', 'targets'


# TODOs: Review Unsuperived Approach; Consider adveanced patterns and common-sence knowledge.
phrase_matcher = PhraseMatcher(nlp.vocab)

### SENTIMENT LEXICONS ###
pos = [w.replace("\n", "") for w in open("../../data/lexicon/positive_lex.txt")]
neg = [w.replace("\n", "") for w in open("../../data/lexicon/negative_lex.txt")]

def extract_aspect(sentence, n_gram):
    aspects = extract_keyphrase(str(sentence))[0]

    return nlp(aspects)

def index_aspect(aspect, sentence):    
    patterns = [nlp(aspect)]
    phrase_matcher.add("aspects", None, *patterns)

    start = 0
    stop = 0

    matched_phrases = phrase_matcher(sentence)
    for i in matched_phrases:
        _, start, stop = i
        
    return start, stop

def stance_score(start, stop, sentence):
    pos_score = 0.0
    neg_score = 0.0

    for idx, tok in enumerate(sentence):
        if idx == start or idx == stop - 1:
            continue

        # TODOs: Implement Polarity Shift
        k = 5
        # Negation Rules
        if tok.dep_ == "neg":
            if tok.text in pos:
                # Shift to Negative
                if idx <= k:
                    if idx < start: neg_score += 1/(start - idx)
                    elif idx > start: neg_score += 1/(idx - stop)**0.5
                    else: continue

            if str(tok.head.text) in neg:
                # Shift to Positive
                if idx < start: pos_score += 1/(start - idx)
                elif idx > start: pos_score += 1/(idx - stop)**0.5
                else: continue

        # Aspect Sentement Orientation
        if str(tok.text) in pos:
            if idx < start: pos_score += 1/(start - idx)
            elif idx > start: pos_score += 1/(idx - stop)**0.5
            else: continue

        if str(tok.text) in neg:
            if idx <= start: neg_score += 1/(start - idx)
            elif idx >= start: neg_score += 1/(idx - stop)**0.5
            else: continue

    score = pos_score - neg_score /(pos_score + neg_score + 1)
    return score

def sentence_stance(sentence, target=None):
    sentence = nlp(sentence)

    if target == None:
        # Extract Aspect
        aspect = extract_aspect(sentence, n_gram=3)

    elif str(target) in str(sentence):
        aspect = nlp(target)
    
    else:
        return None

    start, stop = index_aspect(aspect, sentence)
    score = stance_score(start, stop, sentence)

    # Add Neutral
    stance = {"claim": sentence, "stance": "PRO", "aspect": aspect} if score > 0 else {"claim": sentence, "stance": "CON", "aspect": aspect}
    return stance

sentence = "I hate abortion rights. Abortions should be banned."
sentence_2 = "I like abortion rights. I belive we should keep them."
sentence_3 = "I hate tennis. People should play tennis more often"

sentence_stance(sentence_3, target="tennis")

{'claim': I hate tennis. People should play tennis more often,
 'stance': 'CON',
 'aspect': tennis}

In [132]:
# TODOs: Improve Scoring Function for whole vs partial keyphrase match
# from detection.stance_classifier import sentence_stance

def overlap_score(evidence_kp, adu_kp):
    score = 0
    # Split Keyphrase into components, scoring partial units as overlap
    for i in evidence_kp:
        for j in i.split():
            if adu_kp.find(j) != -1: score += 1
            
            else: continue
    
    return score

# TODOs: Speed-up, Parrelleise, Yield
sentence_scores = []
for sent, sent_kp, evidence in merged[0:1]:
    sent_overlap_score = 0
    
    evidence_sentences = [i for i in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', evidence)]

    for ev_sent in evidence_sentences:
        toks = re.findall(r"\w+(?:'\w+)?|[^\w\s]", ev_sent)

        if len(toks) <= 8:
            continue
        
        evidence_kp = extract_keyphrase(ev_sent)
        target = extract_keyphrase(ev_sent, n_kp=1)
        sent_overlap_score += overlap_score(evidence_kp, sent_kp)
        
        sentence_scores.append((sent, ev_sent, sent_overlap_score))
    
sent_overlap_score


27

In [133]:
### SORTED TUPLES ###
sentence_scores
sentence_scores.sort(key=lambda y: y[2], reverse=True)

sent_kp, sentence_scores

('society, many people',
 [('Many people seem to believe that society is degenerating and that society in general is getting worse.',
   'Too many members of our society are still struggling to find a good-paying job or get the health care they need.',
   27),
  ('Many people seem to believe that society is degenerating and that society in general is getting worse.',
   'Too many women, LGBTQ Americans, people of color, and people with disabilities still face inequality and injustice across our society. “Everywhere we look, our most fundamental values are under attack.',
   27),
  ('Many people seem to believe that society is degenerating and that society in general is getting worse.',
   'Too many members of our society are still struggling to find a good-paying job or get the health care they need.',
   25),
  ('Many people seem to believe that society is degenerating and that society in general is getting worse.',
   'Too many women, LGBTQ Americans, people of color, and people with

In [138]:
### ASSERT SAME STANCE ###

opposing_stance = []
for i in sentence_scores:
    sent, ev, score = i
    if sentence_stance(ev, target=sent_kp) != sentence_stance(sent):
        opposing_stance.append(ev)

    else: continue


ZeroDivisionError: float division by zero

In [115]:
k = 3
top_k = opposing_stance[0:k]

top_k

['The government of Liberia has stated that it has a commitment to providing an inclusive society for people with disabilities.',
 'The Second Liberian Civil War caused various types of disability to as many as 800,000 people.',
 'Many people in Liberia have congenital conditions, but others become disabled due to birth trauma.']

[]

In [7]:
#     pos_score = 0.0
#     neg_score = 0.0

#     # Pattern Match
#     phrase_matcher = PhraseMatcher(nlp.vocab)

    # compound_words = []    
    # for i in aspects:
    #     compound_word = ""
    #     if i.pos_ in ["NOUN", "PROPN"]:
    #         comps = "".join([str(j) for j in i.children if j.dep_ == "compound"])
    #         if comps:
    #             compound_word = comps + " " + str(i)
    #             compound_words.append(compound_word)

    # aspects_ = []
    # aspects_.extend(compound_words)
    # aspects_.extend(aspects)

#     for idx, tok in enumerate(sentence):

#         if idx == start or idx == stop - 1:
#             continue

#         # Polarity Shift
#         # NEAR parameter, k
#         k = 5
#         if tok.dep_ == "neg":
#             if tok.text in pos:
#                 # Shift to Negative
#                 if idx <= k:
#                     if idx < start: neg_score += 1/(start - idx)
#                     else: neg_score += 1/(idx - stop)**0.5
        
#             if str(tok.head.text) in neg:
#                 # Shift to Positive
#                 if idx < start: pos_score += 1/(start - idx)
#                 else: pos_score += 1/(idx - stop)**0.5

#         if str(tok.text) in pos:
#             if idx < start: pos_score += 1/(start - idx)
#             else: pos_score += 1/(idx - stop)**0.5

#         if str(tok.text) in neg:
#             if idx < start: neg_score += 1/(start - idx)
#             else: neg_score += 1/(idx - stop)**0.5

#     result = pos_score - neg_score /(pos_score - neg_score + 1)
#     stance = ""

#     neg_score, pos_score
#     stance = {"claim": sentence, "stance": "PRO", "aspect": aspect} if result > 0 else {"claim": sentence, "stance": "CON", "aspect": aspect}

#     return stance

# ### TEST STATEMENT ###

# id = random.randint(0, 1000)
# claim =  nlp("I do not believe abortion should be legal")

# print(sentence_stance(claim))



In [None]:
# print(kp)

# def count(string, substring):
#     n = len(substring)
#     cnt = 0
#     for i in range(len(string) - n):
#         if string[i:i+n] == substring:
#             cnt += 1
#     return cnt

# def hasIntersection(a, b):
#         score = 0
#         a = set(a) 
#         b = set(b)
#         if a.intersection(b):
#             score += 1

#         return score

# def overlap_score(count_kp):
#     score = 0

#     for i in count_kp:
#         score += count(sent_kps, count_kp)

#     return score

# scores = []
# for sent in merged_sentences:
#     if sent:
#         # TODOs: Try-Catch, Handel Errors
#         kp = [i for i in extract_keyphrase(sent, n_kp=3)]
#         scores.append((kp, overlap_score(kp)))

# print(sent_kps)
# scores

In [59]:
print(sent_kp)

view, arguments, logic, line


In [None]:
### ATTACKING PEMISES ###

# from BERT_adu_classifier import predict

# premises = []
# for sent in sentences:
#     prediction = predict(sent)
    
#     if prediction == "premise":
#         premises.append(sent)

In [26]:
# # Argument Level

# results = []

# _ = 90
# hate_ = sample_hate[0]["conan"][_]["hateSpeech"]

# # queries = []
# # sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', hate_)

# results = []

# # TODOs: Refactor key_BERT as Class for multiple oututs
# #kp_ = (exctract_keyphrase(hate_, n_gram=1, n_kp=1), exctract_keyphrase(hate_, n_gram=2, n_kp=1), exctract_keyphrase(hate_, n_gram=3, n_kp=1))

# kp_ = (exctract_keyphrase(hate_, n_gram=5, n_kp=1))

# query = " ,".join(i for i in kp_)
# query

# res = [i["_source"]["document"]["title"] for i in news_db.search(query_=query, k=5)]

# results.append({
#     "argument_sentence": hate_, 
#     "query": query, 
#     "retireved_documents": res,

# })

# results


INFO:elastic_transport.transport:POST http://localhost:9200/news_cc/_search [status:200 duration:0.367s]


[{'argument_sentence': 'Fuck Islam and Fuck ISIS. We should repatriate all Muslims. They are all a bunch of violent criminals who do not share our values and never will. Without them Britain will be a better, safer place.',
  'query': 'fuck isis repatriate muslims',
  'retireved_documents': ['‘Assalamu Alaykom:’ peace be upon you',
   'How We Got to Here: A Charlottesville Reading List',
   'De-radicalization can work for former ISIS fighters',
   'How to Get Assyrians to Stay',
   'How to Get Christians to Stay']}]

In [27]:
# results = []
# res = [i["_source"]["document"]["title"] for i in wiki_db.search(query_=query, k=5)]

# results.append({
#     "argument_sentence": hate_, 
#     "query": query, 
#     "retireved_documents": res,

# })

# results

INFO:elastic_transport.transport:POST http://localhost:9200/knowledge/_search [status:200 duration:0.249s]


[{'argument_sentence': 'Fuck Islam and Fuck ISIS. We should repatriate all Muslims. They are all a bunch of violent criminals who do not share our values and never will. Without them Britain will be a better, safer place.',
  'query': 'fuck isis repatriate muslims',
  'retireved_documents': ['Refuting ISIS',
   'Refuting ISIS',
   'Take Off (2017 film)',
   'Terrorism in India',
   'Tareq Kamleh']}]

In [None]:
# #queries
# query = "the executive the presidential system legislative branches partisan unproductive"
# res = search_text(wiki_ev, query_=query, k=3)
# res

[{'_index': 'knowledge',
  '_id': 'kbZa5YIBRqC1428DaQUk',
  '_score': 33.672318,
  '_ignored': ['document.text.keyword'],
  '_source': {'document': {'id': '3408791',
    'source': 'wikipedia',
    'title': 'Ruling party',
    'text': "In parliamentary systems, the majority in the legislature also controls the executive branch of government, thus leaving no possibility of opposing parties concurrently occupying the executive and legislative branches of government. In other systems, such as in an American style presidential system, the party of the president does not necessarily also have a legislative majority. A 'ruling party' is also used to describe the party of one-party states, such as the Chinese Communist Party in the People's Republic of China. In his political manifesto 'The Green Book', the late Libyan leader Muammar al-Gaddafi attacked the ability of the ruling party, using it as a basis for his opposition to partisan politics. None"}}},
 {'_index': 'knowledge',
  '_id': 'JQ3

In [None]:
# import re

# print(", ".join(re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)]]', "", i["_source"]["document"]["title"]).strip("[]") for i in res))

Ruling party, Fusion of powers, Student governments in the United States


In [None]:
# ### TEST SEARCH ###

# def search_text(es, query_, k=5):
#     results = es.search(
#         index = es.elastic_index,
#         query = {
#             "match": {
#                 "document.text": query_,
#                 },
#         },
#         size=k)

#     hits = results["hits"]["hits"]
#     doc_ids = [row['_source']["document"]["id"] for row in hits]

#     return hits

# def search_topic(es, topic, k=5):
#     results = es.search(
#         index = es.elastic_index,
#         body= {
#             "size": k,
#             "query": {
#                 "match": {
#                     "document.title": topic,
#         }}})

#     hits = results["hits"]["hits"]
#     doc_ids = [row['_source']["document"]["id"] for row in hits]

#     title = hits[0]["_source"]["document"]["title"]
#     text = hits[0]["_source"]["document"]["text"]

#     return {
#         "title": title,
#         "text": text
#     }

# query = "government emails"
# text = search_text(wiki_ev, query_=query, k=10)

# print(text)


In [None]:
### SEARCH FUNCTION ###

# TODOs: Implement as Class-DB object

# def search_text(db, query_, k=5):
#     results = db.search(
#         index = es.elastic_index,
#         query = {
#             "match": {
#                 "document.text": query_,
#                 },
#         },
#         size=k)

#     hits = results["hits"]["hits"]
#     doc_ids = [row['_source']["document"]["id"] for row in hits]

#     return hits