In [None]:
### TODOs ###
# DONE: Implement Semantic Ranking
# TODOs: Commonsense Query and Concept Expansion: Topics, Concepts, Synonyms
# TODOs: Targeted Retrieval with NLI over ADUs, Premises, Claims; discard non-ADUs

# DONE: News Data
# DONE: Add Concepts
# DONE: Cosine Semantic Search
# DONE: Prior Pre-processing, tokenization and sentence segmentation to speed processing
# TODOs: Domain Restrict. Polarising social and political debate (Class labelling) only for higher-quality argument-knowledge set.
# TODOs: News, Political, Sociology and 'Good', 'Positive' counter-evidence Knowledge Base.
# TODOs: Bag of Topics Modelling
# TODOs: Implement as a Class

# TODOs: Keyphrase Selection
# DONE: Manage Duplicate Keywords
# DONE: Sentential Ranking
# DONE: Include Topic Label
# DONE: Include Concept Label
# DONE: Add News
# TODOs: Targeted Retreival with Semantic Graphs
# TODOs: Target Argumentative Content Only
# TODOs: Targeted Argument Content: Adus + Extractive Summary
# TODOs: Query Expansion
# TODOs: Multi-Field Search
# TODOs: Additional News and Knowledge Sources

In [None]:
### INIT LOGGING ###
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ARGUMENT-EXTRACTOR")

In [1]:
from src.utils_.elastic_db import ElasticDB

# TODOs: Implement KW Selection
# INIT DB OBJECT
PORT = "http://localhost:9200"
db = ElasticDB(elastic_port=PORT)

INFO:src.utils_.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils_.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


In [2]:
### NLP FUNCTIONS ###
from src.utils_.utils import tokeniser, sentences_segment

print(tokeniser("hello, my name is Josh!"))
print(sentences_segment(
    "hello, my name is Josh! How are you doing today? I'm curious ... will this line seperate? I'm not so sure Dr. Evil"))

['hello', ',', 'my', 'name', 'is', 'Josh', '!']
['hello, my name is Josh!', 'How are you doing today?', "I'm curious ... will this line seperate?", "I'm not so sure Dr.", 'Evil']


In [3]:
### LOAD DATASETS ###
import json
import random

args = [json.loads(ln) for ln in open("../data/cmv_processed.jsonl")]
mined_args = [json.loads(ln) for ln in open("../data/cmv_argument_extraction.jsonl")]
topics = [json.loads(ln) for ln in open("../data/argument_topic_concept.jsonl")]
concepts = [json.loads(ln) for ln in open("../data/argument_concept.jsonl")]

In [4]:
print("ARGS:", len(args), " MINED-ARGS:",  len(mined_args), " TOPICS:", len(topics), " CONCEPTS: ", len(concepts))

ARGS: 10303  MINED-ARGS: 10303  TOPICS: 5990  CONCEPTS:  5990


In [5]:
### EXTRACT UNIQUE SET ###
import pandas as pd

def unique_entries(args, key="id"):
    data_ = pd.DataFrame(args)
    unique = data_.drop_duplicates(subset=key)

    unique_ = []
    for _, i in unique.iterrows():
        unique_.append({
            "id": i["id"],
            "claim": i["claim"],
            "argument": i["argument"],
            "tgt_counter": i["tgt_counter"],
        })

    return unique_

#unique_args = unique_entries(mined_args)
unique_args = unique_entries(mined_args)
unique_args

[{'id': 't3_30oi71',
  'claim': 'we should strengthen the traditional safety net rather than replace it with basic income',
  'argument': {'argument': 'section i why is basic income increasingly popular? basic income is a policy that has broad support from both the progressive left and libertarian right. centerleft economists including paul krugman have endorsed the scheme for various reasons. first bi is an effective antipoverty measure. bi also reduces inequality by redistributing income from capital to labor. perhaps most importantly to some on the left is the notion that bi provides people with freedom. leftlibertarian political economist philippe van parijs argues that to be truly free people have to have access to the means that people need for doing what they might want to do. bi provides people those means.',
   'arg_kp': ['income increasingly popular',
    'basic income increasingly',
    'increasingly popular',
    'basic income'],
   'arg_stance': ['NEUTRAL', 'basic income i

In [6]:
len(unique_args)

5990

In [7]:
### SUBJECT ARG ###
import random
sample = random.randint(0, 99)

arg = args[sample]["argument"]["argument"]
claim = args[sample]["claim"]

print(sample, "\n")
print(claim, "\n")
print(arg, "\n")

49 

transgenders only enforce gender stereotypes. 

i try not to be bigoted and id really like to open my mind to this especially as my cousin begins his hrt. im a strong liberal but i believe the concept of transgender individuals is tied strongly to the concept of gender roles. why bother changing your gender or identifying as a different one if they are equal? dysphoria is a real issue but thats purely psychological. if someone truly believed man woman then why would they feel the need to be one or the other. men can wear make up nail polish dresses suits high heels and be nurturing. women can do the same. when i was little they called me a tomboy. 



In [8]:
### KEYPHRASE EXTRACTORS ###
from src.utils_.keyphrase_extraction import yake_extract_keyphrase, summa_extract_keyphrase
import keybert

test = "Brazil's minimum income has increasingly been accepted."
ev_kp = yake_extract_keyphrase(test)
ev_kp_ = summa_extract_keyphrase(test)

test_2 = " "
ev_kp_2 = yake_extract_keyphrase(test_2)
ev_kp_2_ = summa_extract_keyphrase(test_2)

print(ev_kp)
print(ev_kp_)

# Can Handel Blanks
print(ev_kp_2)
print(ev_kp_2_)

INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
INFO:KEYPHRASE_EXTRACTOR:[Test Keyphrase: ] 
 ['heathrow airport', 'environmental impact', 'aviation']


['Brazil minimum income', 'minimum income has increasingly', 'Brazil minimum', 'increasingly been accepted', 'minimum income']
['minimum']
[]
[]


In [9]:
# import os
# path = "/Users/joshua.sheppard/PycharmProjects/countaBot/"
# os.chdir(path)

from src.detection.BERT_adu_classifier import predict

INFO:pytorch_pretrained_bert.modeling:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


claim


In [22]:
from tqdm.notebook import tqdm
import multiprocessing
from src.detection.stance_classifier import sentence_stance, compare_stance
from src.utils_.word_net_expansion import expand_query
from src.detection.stance_classifier import sentence_stance
# from multiprocessing.pool import ThreadPool as Pool

# path = "/Users/joshua.sheppard/PycharmProjects/retriever"
import os
print(os.getcwd())

topic_ids = [json.loads(ln)["id"] for ln in open("./src/data/argument_topic_concept.jsonl")]
concept_ids = [json.loads(ln)["id"] for ln in open("./src/data/argument_concept.jsonl")]

import time

# Disable Huggingface Logging
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### RETRIEVER ###
db = db
queries = []
retrieved_ev = []

# TODOs: Argumentative Sentence
# TODOs: Query Expansion
def search(mined, type="argument"):
    id_ = mined["id"]
    claim = arg["claim"]

    retrieve_len = 5
    retrieved = []

    # for adu in mined["argument"]:
    for adu in mined[type]:

        sentence = adu["sentence"]
        if predict(sentence) != "premise":
            continue

        #kp = extract_keyphrase(sentence)
        kp = list(set(adu["kp"]))

        topic = adu["topic"]
        concept = adu["concept"]

        # Construct and Expand Query
        query = []
        query.extend(kp)
        query.extend(topic) if topic else query
        query.extend(concept) if concept else query

        query = ", ".join(i for i in query)

        search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]

        source = [i[0] for i in search]
        evidence = [i[1] for i in search]

        merged = ", ".join(i for i in evidence)
        ev_kp = list(set(yake_extract_keyphrase(merged)))

        retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})

    # TODOs: Implement yield without storing list
    return ({
        "id": id_,
        "claim": claim,
        "argument": mined["argument"],
        "tgt_counter": mined["tgt_counter"],
        "retrieved": [i for i in retrieved],
    })

# SINGLE ARGUMENT INSPECT
# SAMPLE = unique_args[0]
# results = search(SAMPLE)

tic = time.time()
SAMPLE = unique_args[0:100]

retrieved_ev = []
with tqdm(total=(len(SAMPLE)), position=0, leave=True) as pbar:
    for arg in SAMPLE:
        retrieved_ev.append(search(arg))
        pbar.update()
toc = time.time()

/Users/joshua.sheppard/PycharmProjects/countaBot


  0%|          | 0/100 [00:00<?, ?it/s]

INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:2.436s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.819s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.555s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.552s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.381s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.258s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.430s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.685s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.393s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.198s]
INFO:elast

In [21]:
retrieved_ev

[{'id': 't3_30oi71',
  'claim': {'sentence': 'We should strengthen the traditional safety net rather than replace it with basic income',
   'kp': ['strengthen the traditional safety',
    'traditional safety net',
    'basic income',
    'strengthen the traditional',
    'traditional safety']},
  'argument': [{'sentence': 'Section I Why is Basic Income Increasingly Popular?',
    'kp': ['basic income increasingly popular',
     'income increasingly popular',
     'basic income increasingly',
     'increasingly popular',
     'basic income'],
    'stance': 'NEUTRAL',
    'aspect': 'Basic Income Increasingly Popular',
    'topic': 'incomes policy',
    'concept': 'increase in accumulated other comprehensive income'},
   {'sentence': 'Basic income is a policy that has broad support from both the progressive left and libertarian right.',
    'kp': ['progressive left and libertarian',
     'basic income',
     'income is a policy',
     'policy that has broad',
     'broad support'],
    's

In [32]:
# from tqdm.notebook import tqdm
# import multiprocessing
# from src.detection.stance_classifier import sentence_stance, compare_stance
# from src.utils_.word_net_expansion import expand_query
# from src.detection.stance_classifier import sentence_stance
# # from multiprocessing.pool import ThreadPool as Pool
# import time
#
# # Disable Huggingface Logging
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# ### RETRIEVER ###
# db = db
# queries = []
# retrieved_ev = []
#
# topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
# concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]
#
# # TODOs: Argumentative Sentence
# # TODOs: Query Expansion
# def search(mined):
#     id_ = mined["id"]
#     # print("\n", id_)
#
#     topic = arg["argument"][0]["topic"]
#     concept = arg["argument"][0]["concept"]
#
#     retrieve_len = 5
#     retrieved = []
#
#     # for adu in mined["argument"]:
#     for adu in mined["tgt_counter"]:
#
#         sentence = adu["sentence"]
#         if len(tokeniser(sentence)) <= 8:
#             continue
#
#         kp = list(set(adu["kp"]))
#         #print(kp)
#         # topic = adu["topic"]
#         # concept = adu["concept"]
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#         # print(kp)
#
#         query = ", ".join(i for i in kp)
#
#         search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
#         source = [i[0] for i in search]
#         evidence = [i[1] for i in search]
#
#         merged = ", ".join(i for i in evidence)
#         ev_kp = list(set(yake_extract_keyphrase(merged)))
#
#         retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})
#
#     # TODOs: Implement yield without storing list
#     return ({
#         "id": id_,
#         "argument": mined["argument"],
#         "tgt_counter": mined["tgt_counter"],
#         "retrieved": [i for i in retrieved],
#     })
#
# # SINGLE ARGUMENT INSPECT
# # SAMPLE = unique_args[0]
# # results = search(SAMPLE)
#
# tic = time.time()
# SAMPLE = unique_args[0:100]
#
# retrieved_ev = []
# with tqdm(total=(len(SAMPLE)), position=0, leave=True) as pbar:
#     for arg in SAMPLE:
#         retrieved_ev.append(search(arg))
#         pbar.update()
# toc = time.time()

  0%|          | 0/100 [00:00<?, ?it/s]

INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.072s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.084s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.070s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.069s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.045s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.105s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.038s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.051s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.025s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.037s]
INFO:elast

In [29]:
retrieved_ev[70]

{'id': 't3_368ijm',
 'claim': {'sentence': 'MRAs are right to claim that there are mens issues. However the way theyre trying to address them is ineffective and quite possibly makes things worse.',
  'kp': ['mens issues',
   'possibly makes things worse',
   'makes things worse',
   'MRAs',
   'issues']},
 'argument': [{'sentence': 'The fact that men are disadvantaged in child custody cases is appalling.',
   'kp': ['custody cases is appalling',
    'disadvantaged in child custody',
    'child custody cases',
    'cases is appalling',
    'fact that men'],
   'stance': 'CON',
   'aspect': 'custody cases is appalling',
   'topic': None,
   'concept': 'inability to monitor the status and location of mro assets'},
  {'sentence': 'So is that fact that men who call the police to report domestic violence against them are more likely to be arrested than the ACTUAL PERPETRATOR.',
   'kp': ['police to report domestic',
    'report domestic violence',
    'actual perpetrator',
    'fact that men

In [37]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch
import time
import copy

# TODOs: Discard equivalent stance
model = SentenceTransformer('all-MiniLM-L6-v2')
def cosine_similarity_(sentences):
    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    cos = torch.nn.CosineSimilarity()
    scores = cos(embeddings[0], embeddings[1:])

    scored = []
    retrieved_sentences = sentences[1:]
    for sent, similarity in zip(retrieved_sentences, scores):
        scored.append((sent, similarity.numpy().item()))

    return scored

def rank_passages(ev, k=4):
    #adus = [i["sentence"] for i in ev["argument"]]
    adus = [i["sentence"] for i in ev["tgt_counter"]]
    retrieved_passages = [i["passages"] for i in ev["retrieved"]]

    # Merge
    # Output 1 x merged sentences object per ADU sentence, with k=5 collected passages as a list of sentences
    merged_passages = []
    for passages in retrieved_passages:
        merged_sents = []
        # Iterate n x sentences for each k=5 retrieved passages
        for passage in passages:
            # Segment as a list of sentences
            sents = sentences_segment(passage)
            # Add sentences to merged_sentences object
            merged_sents.extend(sents)

        # Store merged sentence object for each ADU
        merged_passages.append(merged_sents)

    rank_retrieved = []
    # Rank n x merged sentences for each 1 x ADU
    for adu, merged in zip(adus, merged_passages):
        scored = []
        sentences = [adu]
        sentences.extend(merged)
        scored = cosine_similarity_(sentences)

        ranked_sents = sorted(scored, key=lambda x: x[1], reverse=True)

        # Select top-k sentences
        ranked_sents = ranked_sents[0:k]

        merged = ", ".join(i[0] for i in ranked_sents)
        merged_kp = yake_extract_keyphrase(merged)
        rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})

    return rank_retrieved

# TODOs: Join passages and sentence rank
### SCORE COSINE SIMILARITY ###
tic = time.time()
retrieved_ranked = copy.deepcopy(retrieved_ev)
with tqdm(total=(len(retrieved_ev)), position=0, leave=True) as pbar:
    for i in range(0, len(retrieved_ev)):
        retrieved_ranked[i]["retrieved"] = [i for i in rank_passages(retrieved_ev[i])]
        pbar.update()
toc = time.time()

retrieved_ranked

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


  0%|          | 0/100 [00:00<?, ?it/s]

[{'id': 't3_30oi71',
  'argument': [{'sentence': 'Section I Why is Basic Income Increasingly Popular?',
    'kp': ['Income Increasingly Popular',
     'Basic Income Increasingly',
     'Increasingly Popular',
     'Basic Income',
     'Income Increasingly'],
    'stance': 'NEUTRAL',
    'aspect': 'Income Increasingly Popular',
    'topic': 'incomes policy',
    'concept': 'Income Increasingly Popular'},
   {'sentence': 'Basic income is a policy that has broad support from both the progressive left and libertarian right.',
    'kp': ['Basic income',
     'broad support',
     'progressive left',
     'left and libertarian',
     'Basic'],
    'stance': 'PRO',
    'aspect': 'Basic income',
    'topic': 'incomes policy',
    'concept': 'Basic income'},
   {'sentence': 'Centerleft economists including Paul Krugman have endorsed the scheme for various reasons.',
    'kp': ['including Paul Krugman',
     'economists including Paul',
     'Centerleft economists including',
     'Paul Krugman'

In [38]:
print(toc - tic)
len(retrieved_ranked)

38.27395415306091


100

In [39]:
len(retrieved_ranked)
retrieved_ranked[6]

{'id': 't3_1jipi5',
 'argument': [{'sentence': 'To clarify what I mean by the Title I believe that people with a predominantly subsaharan African heritage are genetically inclined to lower intelligence when compared to Whites and East Asians.This does not mean that all black people are less intelligent than white people.',
   'kp': ['predominantly subsaharan African',
    'subsaharan African heritage',
    'subsaharan African',
    'African heritage',
    'East Asians.This'],
   'stance': 'PRO',
   'aspect': 'predominantly subsaharan African',
   'topic': None,
   'concept': 'predominantly subsaharan African'},
  {'sentence': 'Some black people are much more intelligent than some white people but these are outliers and are accounted for by probability distribution.',
   'kp': ['probability distribution',
    'black people',
    'white people',
    'people',
    'distribution'],
   'stance': 'PRO',
   'aspect': 'probability distribution',
   'topic': None,
   'concept': 'probability dis

In [40]:
# NOTE: Zipping retrieved evidence, args
print(len(retrieved_ranked), len(args[0:sample]))

100 85


In [41]:
retrieved_ranked[0]

{'id': 't3_30oi71',
 'argument': [{'sentence': 'Section I Why is Basic Income Increasingly Popular?',
   'kp': ['Income Increasingly Popular',
    'Basic Income Increasingly',
    'Increasingly Popular',
    'Basic Income',
    'Income Increasingly'],
   'stance': 'NEUTRAL',
   'aspect': 'Income Increasingly Popular',
   'topic': 'incomes policy',
   'concept': 'Income Increasingly Popular'},
  {'sentence': 'Basic income is a policy that has broad support from both the progressive left and libertarian right.',
   'kp': ['Basic income',
    'broad support',
    'progressive left',
    'left and libertarian',
    'Basic'],
   'stance': 'PRO',
   'aspect': 'Basic income',
   'topic': 'incomes policy',
   'concept': 'Basic income'},
  {'sentence': 'Centerleft economists including Paul Krugman have endorsed the scheme for various reasons.',
   'kp': ['including Paul Krugman',
    'economists including Paul',
    'Centerleft economists including',
    'Paul Krugman',
    'including Paul'],
 

In [46]:
unique_args[0]

{'id': 't3_30oi71',
 'argument': [{'sentence': 'Section I Why is Basic Income Increasingly Popular?',
   'kp': ['Income Increasingly Popular',
    'Basic Income Increasingly',
    'Increasingly Popular',
    'Basic Income',
    'Income Increasingly'],
   'stance': 'NEUTRAL',
   'aspect': 'Income Increasingly Popular',
   'topic': 'incomes policy',
   'concept': 'Income Increasingly Popular'},
  {'sentence': 'Basic income is a policy that has broad support from both the progressive left and libertarian right.',
   'kp': ['Basic income',
    'broad support',
    'progressive left',
    'left and libertarian',
    'Basic'],
   'stance': 'PRO',
   'aspect': 'Basic income',
   'topic': 'incomes policy',
   'concept': 'Basic income'},
  {'sentence': 'Centerleft economists including Paul Krugman have endorsed the scheme for various reasons.',
   'kp': ['including Paul Krugman',
    'economists including Paul',
    'Centerleft economists including',
    'Paul Krugman',
    'including Paul'],
 

In [43]:
file_name = "cmv_rr"
fout = open(f"../data/{file_name}.jsonl", "w")

# Deep_copies
rr = copy.deepcopy(retrieved_ranked)

with tqdm(total=(len(rr))) as pbar:
    with fout:
        for unique, rr in zip(unique_args, rr):
            # Extended pre-formatted mined object
            rr["tgt_counter"] = [_ for _ in unique["tgt_counter"]]

            fout.write(json.dumps(rr))

            fout.write("\n")
            pbar.update()

logger.info(f"[{len(rr)} Data Stored as {file_name}.jsonl]")

  0%|          | 0/100 [00:00<?, ?it/s]

INFO:ARGUMENT-EXTRACTOR:[4 Data Stored as cmv_rr.jsonl]


In [44]:
### EVALUATE OUTPUT ###
test = [json.loads(ln) for ln in open("../data/cmv_rr.jsonl", "r")]

In [45]:
_ = random.randint(0, len(test))
print(test[_]["argument"], "\n")
print(test[_]["retrieved"], "\n")
print(test[_]["tgt_counter"], "\n")

[{'sentence': 'IMO whenever theyre used its mostly just confusing especially to younger readers.', 'kp': ['IMO whenever theyre', 'younger readers', 'IMO', 'readers', 'theyre'], 'stance': 'CON', 'aspect': 'IMO whenever theyre', 'topic': None, 'concept': 'IMO whenever theyre'}, {'sentence': 'Why say preposterous when you can just say crazy or insane and have it make sense to more people?', 'kp': ['crazy or insane', 'make sense', 'people', 'preposterous', 'crazy'], 'stance': 'CON', 'aspect': 'crazy or insane', 'topic': None, 'concept': 'crazy or insane'}, {'sentence': 'Do you need to sound smart with fancy language?', 'kp': ['fancy language', 'sound smart', 'smart with fancy', 'language', 'sound'], 'stance': 'PRO', 'aspect': 'fancy language', 'topic': None, 'concept': 'fancy language'}, {'sentence': 'Of course there are some exceptions for things that cant be described any other way like names of diseases and other domainspecific words.', 'kp': ['domainspecific words', 'exceptions for thi

In [30]:
# TODOs: Keyphrase Selection
# TODOs: Full-run, unique arguements
import json

### KEYPHRASE SELECTION ###
rr_train = [json.loads(ln) for ln in open("../data/cmv_rr.jsonl", "r")]
rr_train

[{'id': 't3_30oi71',
  'argument': [{'sentence': 'Section I Why is Basic Income Increasingly Popular?',
    'kp': ['Income Increasingly Popular',
     'Basic Income Increasingly',
     'Increasingly Popular',
     'Basic Income',
     'Income Increasingly'],
    'stance': 'NEUTRAL',
    'aspect': 'Income Increasingly Popular',
    'topic': 'incomes policy',
    'concept': 'Income Increasingly Popular'},
   {'sentence': 'Basic income is a policy that has broad support from both the progressive left and libertarian right.',
    'kp': ['Basic income',
     'broad support',
     'progressive left',
     'left and libertarian',
     'Basic'],
    'stance': 'PRO',
    'aspect': 'Basic income',
    'topic': 'incomes policy',
    'concept': 'Basic income'},
   {'sentence': 'Centerleft economists including Paul Krugman have endorsed the scheme for various reasons.',
    'kp': ['including Paul Krugman',
     'economists including Paul',
     'Centerleft economists including',
     'Paul Krugman'

In [31]:
# WORK WITH DEEP COPIES

# TODOs: Similarity, duplicate
def overlap_kp(string, sub):
    count = start = 0
    while True:
        start = string.find(sub, start) + 1
        if start > 0:
            count+=1
        else:
            return count

# Similarity rank
def selected_keyphrases(arg):
    kps = [_["kp"] for _ in arg["retrieved"]]
    tgt_sentences = [_["sentence"] for _ in arg["tgt_counter"]]

    selected_kps = []

    # Iterate per target sentence
    for tgt, kp in zip(tgt_sentences, kps):
        selected = []

        for terms in kp:
            singletons = terms.split()
            for single in singletons:
                if overlap_kp(tgt.lower(), single) > 0:
                    selected.append(terms)

        selected_kps.append(selected)

    for _, j in zip(arg["tgt_counter"], selected_kps):
        _["selected_keyphrases"] = j

for arg in rr_train:
    selected_keyphrases(arg)

In [32]:
import random
_ = random.randint(0, len(rr_train))

rr_train[_]

{'id': 't3_6jgnbe',
 'argument': [{'sentence': 'I think that for redistributing wealth to deal with increased automation expanding the welfare system makes way more sense than UBI.',
   'kp': ['sense than UBI',
    'increased automation expanding',
    'welfare system makes',
    'UBI',
    'redistributing wealth'],
   'stance': 'NEUTRAL',
   'aspect': 'sense than UBI',
   'topic': None,
   'concept': 'sense than UBI'},
  {'sentence': 'I want to begin this by defining UBI and the welfare system as I understand them.',
   'kp': ['defining UBI', 'welfare system', 'UBI', 'begin', 'defining'],
   'stance': 'NEUTRAL',
   'aspect': 'defining UBI',
   'topic': None,
   'concept': 'defining UBI'},
  {'sentence': 'UBI to provide all legal residents of a country a standard sum of cash unconnected to work The Welfare System Providing income to societys lowest earning citizens on a sliding scale so that the lower your income the more assistence you get.',
   'kp': ['Welfare System Providing',
    

In [39]:
file_name = "cmv_rr_selected"
fout = open(f"../data/{file_name}.jsonl", "w")

# Deep_copies
import copy
rr_selected = copy.deepcopy(rr_train)

#with tqdm(total=(len(rr))) as pbar:
with fout:
    fout.write(json.dumps(rr_selected))
        # for unique, rr in zip(unique_args, rr):
        #     # Extended pre-formatted mined object
        #     rr["tgt_counter"] = [_ for _ in unique["tgt_counter"]]
        #     print(unique["tgt_counter"])

    fout.write("\n")

logger.info(f"[{len(rr_selected)} Data Stored as {file_name}.jsonl]")

INFO:ARGUMENT-EXTRACTOR:[100 Data Stored as cmv_rr_selected.jsonl]


In [267]:
# fout = open("../data/cmv_rr.jsonl", "w")
#
# args = [json.loads(ln) for ln in open("../data/cmv_processed.jsonl")]
# sample = args[0:sample]
#
# # for i, j in zip(retrieved_ranked, sample):
# #     # Add counter to the dictionary (implicitly, i)
# #     i["counter"] = j["counter"]
# #     fout.write(json.dumps(i))
# #     fout.write("\n")

In [None]:
# OLD
# kp = adu["kp"]
# topic = adu["topic"]
# concept = adu["concept"]
#
# kp.append(topic) if topic else kp
# kp.append(concept) if concept else kp
#
# query = ", ".join(i for i in adu["kp"])
# print(query)
#
# # TODOs: Add title field for all ES indices to enable multi-field search
# search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
# #evidence = [i[1] for i in search]
# #source = [i[0] for i in search]
#
# evidence = [i[1] for i in search]
# ev_kp = yake_extract_keyphrase(evidence)
#
#         # try:
#         #     ev_kp = yake_extract_keyphrase(evidence)
#         # except:
#         #     ev_kp = [" "]
#
#         #retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})
#         #retrieved.append({"passages": evidence})
#
#
#
#     # TODOs: Implement yield without storing list
#     return ({
#         "id": id_,
#         "argument": mined["argument"],
#         "retrieved": [i for i in retrieved]
#     })
#
# for arg in unique_args[0:10]:
#     retrieved_ev.append(search(arg))
#
# # SAMPLE = unique_args[0:100]
# #
# # step = max(int(len(SAMPLE) / 10), 1)
# # BATCHES = [SAMPLE[i:i + step] for i in range(0, len(SAMPLE), step)]
# #
# # retrieved_ev = []
# # for idx, batch in enumerate(BATCHES):
# #     print('-' * 25 + 'Batch %d/%d' % (idx + 1, len(batch)) + '-' * 25)
# #
# #     with multiprocessing.Pool(8) as pool:
# #         with tqdm(total=(len(batch))) as pbar:
# #             for arg in batch:
# #                 retrieved_ev.append(search(arg))
# #                 pbar.update()

In [44]:
# SINGLE SEARCH FUNCTION
####

# from tqdm.notebook import tqdm
# import multiprocessing
# from src.detection.stance_classifier import sentence_stance, compare_stance
# from src.utils_.word_net_expansion import expand_query
# from src.detection.stance_classifier import sentence_stance
# # from multiprocessing.pool import ThreadPool as Pool
# import time
#
# # Disable Huggingface Logging
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# ### RETRIEVER ###
# db = db
# queries = []
# retrieved_ev = []
#
# topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
# concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]
#
# import random
# _ = random.randint(0, len(unique_args))
# sample = unique_args[_]
#
# # TODOs: Argumentative Sentence
# def search(mined):
#     id_ = mined["id"]
#     print("\n", id_)
#
#     retrieve_len = 5
#     retrieved = []
#     for adu in mined["argument"]:
#
#         sentence = adu["sentence"]
#         if len(tokeniser(sentence)) <= 8:
#             continue
#
#         kp = list(set(adu["kp"][0:5]))
#         topic = adu["topic"]
#         concept = adu["concept"]
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#
#         query = ", ".join(i for i in adu["kp"])
#         print(query)
#         print(" ")
#         search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
#         source = [i[0] for i in search]
#         evidence = [i[1] for i in search]
#
#         merged = ", ".join(i for i in evidence)
#         ev_kp = yake_extract_keyphrase(merged)
#         print(ev_kp)
#
#         retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})
#
#     # TODOs: Implement yield without storing list
#     return ({
#         "id": id_,
#         "argument": mined["argument"],
#         "retrieved": [i for i in retrieved]
#     })
#
# result = search(sample)

INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.054s]



 t3_5h86n0
election, coming, lot
 
['lot', 'lot of folks', 'election', 'folks', 'folks are living']
system of voting, prevent the tyranny, majority, dont, system
 


INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.149s]


['Electoral system', 'electoral system introduced', 'system', 'President of Chad', 'majority']
work with elections, structured to make, doesnt work, difficult to act, act against minority
 


INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.270s]
INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.043s]


['British humanitarian aid', 'humanitarian aid worker', 'fellow aid workers', 'Sri Lanka', 'mass killings directed']
difficult to elect, make, difficult, elect
 
['sizeable African-American minority', 'African Americans began', 'system favored', 'favored the white', 'white majority']
authority to govern, system will fall, elected, collectively, authority
 


INFO:elastic_transport.transport:POST http://localhost:9200/*/_search [status:200 duration:0.125s]


['Saskatchewan Health Authority', 'Provincial Health Authority', 'Health Authority Act', 'Regional Health Authority', 'Health Authority']


In [18]:
# ### CHECK BLANKS ###
# args_ = [json.loads(ln)["argument"]["argument"] for ln in open("../data/cmv_processed.jsonl")]
# ids = [json.loads(ln)["id"] for ln in open("../data/cmv_processed.jsonl")]
#
# for j, k in zip(args_, ids):
#     if j == "":
#         print("blanks", j, k)

blanks  t3_3cm6jy
blanks  t3_1egv4k
blanks  t3_1egv4k
blanks  t3_5wjdve


In [30]:
# from src.detection.stance_classifier import sentence_stance, compare_stance
# from src.utils_.word_net_expansion import expand_query
# from src.detection.stance_classifier import sentence_stance
# import multiprocessing
# import json
# import time
#
# # TODOs: Adu, Counter + KP Extraction as 'Argument Mining' preprocessing module
#
# # Disable Huggingface Logging
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
# concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]
#
# def get_notion(notions_ids, notions_lst, arg_id, label):
#     notion_id = notions_ids.index(arg_id)
#     notion = notions_lst[notion_id][label]
#     return str(notion) if notion else None
#
# def extract_adus(arg_):
#     arg, id_ = arg_
#     print("\n", id_)
#
#     topic = get_notion(topic_ids, topics, id_, "topic_label")
#     concept = get_notion(concept_ids, concepts, id_, "concept_label")
#
#     adu_sents = sentences_segment(arg)
#
#     adus = []
#     for _ in adu_sents:
#         if len(tokeniser(_)) <= 8:
#             continue
#
#         try:
#             kp = extract_keyphrase(_)
#         except:
#             kp = [" "]
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#         print(kp)
#
#         adu = {"sentence": _, "kp": [i for i in kp], "stance": sentence_stance(_, kp[0])}
#
#         adus.append(adu)
#
#     yield ({
#         "id": id_,
#         "argument": [i for i in adus]
#     })
#
# step = max(int(len(unique) / 10), 1)
# batches = [unique[i:i + step] for i in range(0, len(unique), step)]
#
# mined_args = []
# # TODOs: Remove Huggingface Warnings
#
# for idx, batch in enumerate(batches):
#     print('-' * 25 + 'Batch %d/%d' % (idx + 1, len(batches)) + '-' * 25)
#
#     with multiprocessing.Pool(8) as pool:
#         with tqdm(total=(len(batch))) as pbar:
#             for arg in batch:
#                 mined_args.append([i for i in extract_adus(arg)])
#                 pbar.update()

-------------------------Batch 1/10-------------------------
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_5dv1q4
['topic', 'death', 'angle']
['opposed to Trump', 'work for Melania', 'opinion that places', 'places of public', 'public accommodation']
['people completely disgusting', 'religious fundamentalists denying', 'fundamentalists denying service', 'gay people completely', 'completely disgusting']
['problem', 'distinguish', 'cases']

 t3_5t56cr
['peace economics etc.', 'works journalism peace', 'journalism peace economics', 'liberal works journalism', 'wellknown liberal bias', 'australian democrats']
['Jim Simons mentioned', 'Simons mentioned indirectly', 'Democratic views represent', 'Jim Simons', 'Simons mentioned', 'australian democrats']
['potentially environmental failure', 'Republican Party', 'Party is objectively', 'current ideology', 'objectively untenable', 'australian democrats']
['progressive Democrats', 'corporate Democrats', 'comparison is relative', 'relative to progressive', 'Democrats not corporate', 'australian democrats']

 t3_3gys03
['retarded scientific progress

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_3rd4r3
['generally liberal views', 'proBernie Sanders', 'rcirclejerk Redditors', 'liberal views', 'recurring theme']
['expecting free things', 'mission expecting balance', 'free things', 'goodsservicescontent creators', 'sort of mission']
['quote outofcontext sound', 'outofcontext sound bytes', 'slant left', 'level of social', 'social liberal']
['manchildren alike.So reddit', 'hive mind', 'mind is frowned', 'children and manchildren', 'manchildren alike.So']

 t3_6cn44s
['important traits', 'guy', 'rowdiness', 'important', 'traits', 'aggressiveness']
['public situation', 'rowdy guy', 'offensive if required', 'required either physically', 'physically or verbally', 'aggressiveness']
['physical if needed', 'guy can confront', 'people get physical', 'needed', 'teasing', 'aggressiveness']
['strong eye contact', 'strong eye', 'eye contact', 'strong', 'eye', 'aggressiveness']
['suck', 'hes', 'aggressiveness']

 t3_2tuej1
['Bill Nye', 'discussionargument on rdataisbeautiful', 'Bill', 'Nye

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_6rdz97
['visually clean.I work', 'regularly cleaned times', 'times a day', 'start this post', 'visually clean.I', 'changes in sanitation']
['toilet dirty', 'Ive', 'dirty or unclean', 'unclean', 'bathroom look dirty', 'changes in sanitation']
['bathroom with respect', 'office treats', 'treats the bathroom', 'respect', 'office', 'changes in sanitation']
['toilet paper covers', 'office will make', 'paper covers', 'covers before sitting', 'toilet paper', 'changes in sanitation']
['hear', 'stall', 'changes in sanitation']

 t3_6ew42y
['funniest movie made', 'funny is subjective', 'view changed', 'essentially the funniest', 'funniest movie', 'comedy']
['immediately knew', 'knew I loved', 'loved the comedy', 'laughing so hard', 'hard my side', 'comedy']
['recall a movie', 'remains funny', 'funny', 'isnt', 'sensation', 'comedy']
['vulgur play', 'comedies are promoted', 'promoted so heavily', 'funny parts', 'shown in previews', 'comedy']
['sequels', 'funny', 'moment', 'time', 'doesnt', 'co

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_3o7s9v
['making this post', 'dont think video', 'video games', 'considered a sport', 'sports channel.First']
['front page', 'recently found', 'found this thread', 'page', 'recently']
['Apparently CSGO', 'added to FoxSports.Now', 'video games', 'games are great', 'CSGO']
['sports Tv channel', 'channel', 'dont', 'belong', 'sports']
['physical activities.Video gaming', 'activities.Video gaming', 'age level', 'physical activities.Video', 'physical activity']

 t3_55do4j
['Hating people based', 'race ethnicity family', 'ethnicity family skin', 'family skin color', 'Hating people']
['Muslims or Christians', 'hating on Muslims', 'Christians bigoted', 'simply religions', 'bigoted and racist']
['Trump fans NeoNazis', 'constantly change them.Why', 'fans NeoNazis misogynists', 'hating Muslims', 'door to Trump']
['Trump fans personality', 'Trump fans', 'fans personality', 'Trump', 'Arent']
['human a Muslim', 'Muslim is matter', 'opinion and lifestyle', 'justify their opinion', 'Muslim']

 t3_

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_1l4s6s
['end the Rword', 'campaigns to end', 'essentially a movement', 'movement to stop', 'word retarded', 'pseudobulbar palsy']
['mental disabilities.The reasons', 'deaf or lame', 'mental disabilities.The', 'goal is worthwhile', 'worthwhile are threefold', 'pseudobulbar palsy']
['exaggerate a situation', 'simply another disability', 'situation', 'simply', 'disability', 'pseudobulbar palsy']
['huge sign', 'sign', 'huge', 'pseudobulbar palsy']

 t3_65fe4m
['beating a dead', 'dead horse', 'beating', 'dead', 'horse', 'anita hill', 'blood clot in her carotid artery']
['hate her.', 'list of reasons', 'reasons I hate', 'her.', 'give', 'anita hill', 'blood clot in her carotid artery']
['differing viewpoints Anita', 'Unlike normal people', 'viewpoints Anita', 'Anita has prevented', 'Commenting or voting', 'anita hill', 'blood clot in her carotid artery']

 t3_5aoldn
['disbelief Ive', 'actual conversation', 'time I express', 'express my disbelief', 'offered any logical']
['argument EDITED

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_21qr73
['IsraelJewish than usual', 'posts regarding IsraelJewish', 'usual', 'posts', 'IsraelJewish', 'history of israel', 'israeli closure policies']
['inherently contradictory', 'change my view', 'proIsrael the state', 'state and calling', 'liberal is inherently', 'history of israel', 'israeli closure policies']
['ArabIsraeli conflict', 'religionspeople involved', 'conflict', 'clarify', 'qualms', 'history of israel', 'israeli closure policies']
['Jewish state', 'inherently calling', 'calling for decades', 'decades of crisis', 'crisis to set', 'history of israel', 'israeli closure policies']
['state of Israel', 'current state', 'strengthen its position', 'oppressive measures', 'measures to strengthen', 'history of israel', 'israeli closure policies']

 t3_1w9mvo
['ordinary US restaurant', 'restaurant I dont', 'based on shame', 'ordinary', 'restaurant', 'transaction costs']
['amount varies depending', 'kind of asshole.Why', 'amount varies', 'varies depending', 'restaurant and servi

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_22ro8l
['dont', 'aspect', 'excessive dopamine activity']
['post about eating', 'eating disorders', 'found myself thinking', 'counter argument', 'people', 'excessive dopamine activity']
['them.The main point', 'make it clear', 'rehab or gym', 'gym or similar', 'good thing', 'excessive dopamine activity']

 t3_3m2auo
['vacations and trips', 'flying and opted', 'opted to drive', 'Ive', 'vacations', 'cheaper travel']
['reasoning Its cheaper', 'trip', 'reasoning', 'cheaper', 'people', 'cheaper travel']
['cost of driving', 'total cost', 'number of people', 'significantly affect', 'vacation increases', 'cheaper travel']
['flight requires', 'security and wait', 'airport security', 'airport', 'terminal', 'cheaper travel']
['worry about security', 'rent a car', 'car', 'worry', 'security', 'cheaper travel']

 t3_5wlqz8
['drives harmful wedges', 'wedges between people', 'identity politics', 'politics though goodintentioned', 'goodintentioned actually drives', 'identity conflict']
['style of s

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_3wamx9
['codes formalized standards', 'private high schools', 'dress codes formalized', 'United States', 'buying clothes', 'excessive wear']
['preoccupied with clothes', 'free time', 'spend energy', 'energy or minutes', 'minutes picking', 'excessive wear']
['allowed to dress', 'invariably end', 'end up wearing', 'wearing clothes', 'students are allowed', 'excessive wear']
['foremost to foster', 'foster learning', 'mission', 'distracts from learning', 'learning is inherently', 'excessive wear']

 t3_667gvu
['considered mens issues', 'homelessness suicide custody', 'suicide custody jail', 'custody jail sentence', 'jail sentence length']
['feminist speaking', 'speaking', 'issues', 'addressed.This', 'feminist']
['occasional suicidal thought', 'strong distaste', 'socalled feminists', 'occasional suicidal', 'suicidal thought']
['naturally zerosum games', 'extension public support', 'zerosum games', 'objectivelyPublic attention', 'extension public']
['shelters breast cancer', 'breast can

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_20ofb2
['friends found', 'back story', 'fight', 'back', 'story', 'fists']
['Punches were thrown', 'pretty badly', 'cheated on beat', 'beat friend', 'friend B pretty', 'fists']
['firing from jobs', 'girl who called', 'called friend', 'friend B admitted', 'admitted I dont', 'fists']
['fist fight settle', 'fist fight', 'fight settle', 'settle the tension', 'friends faster', 'fists']
['day life.Assuming consent', 'extreme harmokay guys', 'good points presented', 'mature debate', 'legal in hockey', 'fists']

 t3_22p8vc
['social constructs', 'disputing the existence', 'existence of social', 'constructs', 'clear', 'nervous origin']
['deeper significance', 'phenomenon of dismissing', 'dismissing a common', 'common belief', 'belief or behavior', 'nervous origin']
['natural factors', 'necessarily have arisen', 'today.My reasoning', 'play hundreds', 'hundreds of thousands', 'nervous origin']
['enduring social constructs', 'social constructs grew', 'instinctual behavior', 'language developed'

  0%|          | 0/599 [00:00<?, ?it/s]


 t3_5uutr2
['white people based', 'favor black people', 'word racist found', 'word discrimination found', 'ethnicity.So reddit change']

 t3_2rc3p3
['American patriots', 'mentality against immigration', 'general mentality', 'American', 'immigration']
['people shouldnt', 'shouldnt be allowed', 'line and americans', 'born', 'side']
['slogan Creating jobs', 'slogan Creating', 'Creating jobs', 'jobs for americans', 'ads and political']
['private corporations', 'understand why politics', 'americans to vote', 'prominent in ads', 'ads made']
['creating jobs', 'jobs for americans', 'jobs for people', 'americans is morally', 'morally superior']

 t3_1h1yj0
['sound more important', 'recognize utilize', 'puff word', 'important', 'recognize', 'unnecessary use']
['overcomplicating language', 'select few times', 'makes more sense', 'sense are negligible.What', 'utilize makes', 'unnecessary use']

 t3_2fdmwl
['tldr Hyperrealistic paintingsdrawings', 'Hyperrealistic paintingsdrawings offer', 'tldr Hy

In [28]:
# def get_topic(arg_id):
#     topic_id = topic_ids.index(arg_id)
#     topic = topics[topic_id]["topic_label"]
#     return str(topic) if topic else None
#
# def get_concept(arg_id):
#     concept_id = concept_ids.index(arg_id)
#     concept = concepts[concept_id]["concept_label"]
#     return str(concept) if concept else None

In [26]:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

# TODOs: Fix Vectorizer Issue
# kb = KeyBERT()
# vectorizer = KeyphraseCountVectorizer()
# def extract_keyphrase(doc, n_gram=3, n_kp=3, use_mmr="False", use_maxsum="False"):
#     try:
#         kp = kb.extract_keywords(doc, keyphrase_ngram_range=(0, 3), stop_words="english", diversity=0.3,)
#         kp_ = kb.extract_keywords(doc, vectorizer=vectorizer, stop_words="english", diversity=0.3)
#
#     except:
#         return [" "]
#
#     # Concatonate, remove duplicates
#     kp = kp + kp_
#     kp = [i[0] for i in kp]
#     kp = list(set(kp))
#
#     return kp

In [None]:
# # TODOs: Compute in Batches
# sample = unique[0:100]
# with multiprocessing.Pool(8) as pool:
#     with tqdm(total=(len(unique))) as pbar:
#         for arg in unique:
#             mined_args.append([i for i in extract_adus(arg)])
#             pbar.update()

In [None]:
# def retrieved_evidence(mined, retrieve_len=5):
#     """ Retrieves Evidence from Knowledge base, returning a well-formed Retrieved Evidence Object
#     given an input Argument"""
#
#     id_ = mined["id"]
#     print("\n", id_)
#
#     retrieved = []
#     adus = []
#     for _ in mined["argument"]:
#         if len(tokeniser(_)) <= 8:
#             continue
#
#         kp = extract_keyphrase(_)
#         print(kp)
#         adu = {"sentence": _, "kp": [i for i in kp], "stance": sentence_stance(_, kp[0])}
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#
#         query = ", ".join(i for i in kp)
#         print(query)
#
#         # TODOs: Add title field for all ES indices to enable multi-field search
#         search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
#         evidence = [i[1] for i in search]
#         source = [i[0] for i in search]
#
#         ev_kp = extract_keyphrase(evidence)
#
#         retrieved.append({"passages": evidence, "kp": [i[0] for i in ev_kp], "source": source})
#         #retrieved.append({"passages": evidence, "source": source})
#         adus.append(adu)
#
#     return ({
#         "id": id_,
#         "argument": [i for i in adus],
#         "retrieved": [i for i in retrieved]
#     })

In [224]:
# from sentence_transformers import SentenceTransformer, util
# from sklearn.metrics.pairwise import cosine_similarity
# import torch
# import time
#
# model = SentenceTransformer('all-MiniLM-L6-v2')
# def cosine_similarity_(sentences):
#     embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)
#
#     cos = torch.nn.CosineSimilarity()
#     scores = cos(embeddings[0], embeddings[1:])
#
#     scored = []
#     retrieved_sentences = sentences[1:]
#     for sent, similarity in zip(retrieved_sentences, scores):
#         scored.append((sent, similarity.numpy().item()))
#
#     return scored
#
# def rank_passages(ev, k=3):
#     adus = [i["sentence"] for i in ev["argument"]]
#     retrieved_passages = [i["passages"] for i in ev["retrieved"]]
#
#     # Merge
#     # Output 1 x merged sentences object per ADU sentence, with k=5 collected passages as a list of sentences
#     merged_passages = []
#     for passages in retrieved_passages:
#         merged_sents = []
#         # Iterate n x sentences for each k=5 retrieved passages
#         for passage in passages:
#             # Segment as a list of sentences
#             sents = sentences_segment(passage)
#             # Add sentences to merged_sentences object
#             merged_sents.extend(sents)
#
#         # Store merged sentence object for each ADU
#         merged_passages.append(merged_sents)
#
#     rank_retrieved = []
#     # Rank n x merged sentences for each 1 x ADU
#     for adu, merged in zip(adus, merged_passages):
#         scored = []
#         sentences = [adu]
#         sentences.extend(merged)
#         scored = cosine_similarity_(sentences)
#
#         ranked_sents = sorted(scored, key=lambda x: x[1], reverse=True)
#
#         # Select top-k sentences
#         ranked_sents = ranked_sents[0:k]
#
#         merged = ", ".join(i[0] for i in ranked_sents)
#         merged_kp = extract_keyphrase(merged)
#         rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})
#
#     return rank_retrieved
#
# # TODO: Clean text
# # TODO: Collect unique Keyphrases per Argument
# rank_passages(retrieved_ev[0])
# #print(len(test["ranked_passages"][0]))

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


[{'ranked_passages': "The organisation Basic Income UK is 'a collective of independent people promoting unconditional basic income as a progressive social policy for the United Kingdom, and beyond'., Several British academics have been involved in the basic income debate., Her son Brandon Rhys Williams proposed a basic income to a parliamentary committee in 1982, and soon after that in 1984, the Basic Income Research Group, now the Citizen's Basic Income Trust, began to conduct and disseminate research on basic income.",
  'kp': ['organisation basic income uk',
   'basic income',
   'basic income debate',
   'basic income uk']},
 {'ranked_passages': "But far more of the contemporary support for basic income in the United States has come from the left of center, driving by people who see it as a major expansion of support for low-income people, as the late, Al Sheahan argued in his 2012 book, 'the Basic Income Guarantee': 'Your right' 'to economic security'., In the Czech Republic, unco

In [62]:
# Handle duplicates
# def rank_passages(ev, k=3):
#     """ Handles a Retrieved Evidence Object, returning the top-k passages for each ADU """
#     # Per Argument
#     # Index into Retrieved Evidence Object
#     adus = [i for i in ev[0]["argument"]]
#     retrieved = [i for i in ev[0]["retrieved"]]
#
#     #print(len(retrieved), len(adus))
#
#     # Rank k-returned passages for each ADU
#     r_retrieved = []
#     for adu, passage in zip(adus, retrieved):
#         scored = []
#         ranked_ev = []
#         for _, kp in zip(passage["evidence"], passage["kp"]):
#             scored.append((_, kp, cosine_similarity(str(adu), str(_))))
#
#         scored = sorted(scored, key=lambda x: x[2], reverse=True)[0:3]
#         for i, j, k in scored:
#             ranked_ev.append({"evidence": i, "kp": j, "similarity": k})
#
#         r_retrieved.append(ranked_ev)
#
#     return r_retrieved

# 1 Argument x 4 ADUs x 5 Retrieved Passages
# ranked = [i for i in rank_passages(retrieved_ev[3])]
# print(len(ranked))
# print(ranked)

4 4
4
[[{'evidence': 'Many technology experts and technology entrepreneurs have begun endorsing basic income in the 2000s and 2010s. These include Marshal Brain, Sam Altman, James Hughes, Facebook co-founder Chris Hughes, Elon Musk, and Mark Zuckerberg (in his 2017 Harvard commencement speech), and Jeremy Rifkin. The overriding theme among technologists who favor basic income is the belief that automation is creating an increasingly unstable labor market.', 'kp': ('technologists favor basic income', 0.6414), 'similarity': 0.3174712657928467}, {'evidence': "Committee member Lady Rhys-Williams argued that the incomes for adults should be more like a basic income. She was also the first to develop the negative income tax model. Her son Brandon Rhys Williams proposed a basic income to a parliamentary committee in 1982, and soon after that in 1984, the Basic Income Research Group, now the Citizen's Basic Income Trust, began to conduct and disseminate research on basic income.", 'kp': ('will

In [13]:
# from sentence_transformers import SentenceTransformer, util
# import torch
# import time
#
# model = SentenceTransformer('all-MiniLM-L6-v2')
#
# # TODOs: Join passages and sentence rank
# ### SCORE COSINE SIMILARITY ###
# def cosine_similarity(sent_1, sent_2):
#     sentences = [sent_1, sent_2]
#     embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)
#
#     cos = torch.nn.CosineSimilarity(dim=0)
#     score = cos(embeddings[0], embeddings[1])
#
#     return score.numpy().item()
#
# ### SCORE TF-KEYWORD OVERLAP ###
# def overlap_score(evidence_kp, adu_kp):
#     score = 0
#     # Split Keyphrase into components, scoring partial units as overlap
#     for i in evidence_kp:
#         for j in i.split():
#             # Ensure string value, to enact .find
#             if ", ".join([i for i in adu_kp]).find(j) != -1: score += 1
#
#             else: continue
#     return score
#
# ### RANK PASSAGES ###
# def score_passages(ev):
#     for _ in range(0, len(ev["argument"])):
#         print(_)
#
# from collections import defaultdict
# def rank_passages(ev, k=2):
#     adus = [i for i in ev["argument"]]
#     retrieved = [i for i in ev["retrieved"]]
#
#     rank_retrieved = []
#     count = 0
#
#     for adu, passages in zip(adus, retrieved):
#         count += 1
#         scored = []
#
#         # 5 passages
#         for passage in passages["passages"]:
#             score = cosine_similarity(str(adu), str(passage))
#             scored.append((passage, score))
#
#         ranked_passages = sorted(scored, key=lambda x: x[1], reverse=True)
#         ranked_passages = ranked_passages[0:k]
#
#         merged = ", ".join(i[0] for i in ranked_passages)
#         merged_kp = extract_keyphrase(merged)
#         rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})
#
#     return rank_retrieved
#
# import copy
# ### UPDATE RETRIEVED OBJECT ###
# # for i in range(0, len(retrieved_ev)):
# #     retrieved_ranked[i]["retrieved"] = [i for i in rank_passages(retrieved_ev[i])]
#
# tic = time.time()
# retrieved_ranked = copy.deepcopy(retrieved_ev)
# with tqdm(total=(len(retrieved_ev)), position=0, leave=True) as pbar:
#     for i in range(0, len(retrieved_ev)):
#         retrieved_ranked[i]["retrieved"] = [i for i in rank_passages(retrieved_ev[i])]
#     pbar.update()
#
# toc = time.time()
# # duration = toc - tic
#
# retrieved_ranked

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


  0%|          | 0/10 [00:00<?, ?it/s]

[{'id': 't3_30oi71',
  'argument': [{'sentence': 'Section I Why is Basic Income Increasingly Popular?',
    'kp': ['basic income increasingly',
     'basic income',
     'section basic income'],
    'stance': 'NEUTRAL'},
   {'sentence': 'Basic income is a policy that has broad support from both the progressive left and libertarian right.',
    'kp': ['basic income', 'basic income policy', 'income policy'],
    'stance': 'PRO'},
   {'sentence': 'Centerleft economists including Paul Krugman have endorsed the scheme for various reasons.',
    'kp': ['krugman endorsed scheme',
     'centerleft economists including',
     'centerleft economists'],
    'stance': 'PRO'},
   {'sentence': 'BI also reduces inequality by redistributing income from capital to labor.',
    'kp': ['bi reduces inequality',
     'inequality redistributing income',
     'reduces inequality redistributing'],
    'stance': 'CON'}],
  'retrieved': [{'ranked_passages': "Several British academics have been involved in the b

In [202]:
# Fix KW extraction
# Fix Duplicates
# def rank_passages(ev, k=2):
#     adus = [i for i in ev["argument"]]
#     retrieved = [i for i in ev["retrieved"]]
#
#     rank_retrieved = []
#     count = 0
#
#     for adu, passages in zip(adus, retrieved):
#         count += 1
#         scored = []
#
#         # 5 passages
#         for passage in passages["passages"]:
#             score = cosine_similarity(str(adu), str(passage))
#             scored.append((passage, score))
#
#         ranked_passages = sorted(scored, key=lambda x: x[1], reverse=True)
#         ranked_passages = ranked_passages[0:k]
#
#         merged = ", ".join(i[0] for i in ranked_passages)
#         merged_kp = extract_keyphrase(merged)
#         rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})
#
#     return rank_retrieved
#
# ev = retrieved_ev[0]
# rank_passages(ev)

[{'ranked_passages': "Several British academics have been involved in the basic income debate. Among them the following:Organisations. The organisation Basic Income UK is 'a collective of independent people promoting unconditional basic income as a progressive social policy for the United Kingdom, and beyond'., Basic Income. Delaney opposes implementing a basic income (also known as a universal basic income). Minimum wage.",
  'kp': ['organisation basic income uk',
   'basic income uk',
   'united kingdom basic income']},
 {'ranked_passages': "Several British academics have been involved in the basic income debate. Among them the following:Organisations. The organisation Basic Income UK is 'a collective of independent people promoting unconditional basic income as a progressive social policy for the United Kingdom, and beyond'., A basic income is defined in the report as a policy that guarantees all members of a society a minimum amount of income. One type of basic income considered is

In [458]:
# def fuck_you():
#     print("fuck you")
#
# fuck_you()
#
# def rank_(ev):
#     # Index into Retrieved Evidence Object
#     ev = ev[0]
#     adus = [i for i in ev[0]["argument"]]
#     retrieved = [i for i in ev[0]["retrieved"]]
#     k = 3
#     print("hello")
#     # # Rank k-returned passages for each ADU
#     # count = 0
#     # r_retrieved = []
#     # for adu, passage in zip(adus, retrieved):
#     #     count += 1
#     #     ranked_passages = []
#     #     for _ in passage["evidence"]:
#     #         print(_)
#     #         ranked_passages.append((_, cosine_similarity(adu, _)))
#     #         r_retrieved.append({"evidence": i, "similarity": k} for i, k in sorted(ranked_passages, key=lambda x: x[1], reverse=True)[0:k])
#     #         print(r_retrieved)
#
#     # return {
#     #     "r_retrieved": r_retrieved
#     # }
#
# #print(rank_(retrieved_ev[0:1]))

In [116]:
# from multiprocessing.pool import ThreadPool as Pool
# from yake import KeywordExtractor
# import tqdm.notebook as tqdm
# import time
# from summa import keywords
# from tqdm import tqdm
#
# ### PASSAGE RANKING; KEYWORD OVERLAP ###
# kw_extractor = KeywordExtractor(lan="en", n=3, top=5)
#
# # TODOs: For each ADU, Rank Merged Evidence using Keyword Overlap and Filter for Contrasting Stance
# # TODOs: Handel Multiple Keywords
#
# def overlap_score(evidence_kp, adu_kp):
#     score = 0
#     # TODOs: Robust 'None' handeling
#     if adu_kp == None:
#         return score
#     # Split Keyphrase into components, scoring partial units as overlap
#     else:
#         for i in evidence_kp:
#             for j in i.split():
#                 # Ensure string value, to enact .find
#                 if ", ".join([i for i in adu_kp]).find(j) != -1: score += 1
#
#                 else: continue
#
#     return score
#
# def calculate_overlap(merged_ev, adu_kp):
#
#     for ev_unit in sentences_segment(merged_ev):
#         toks = tokeniser(ev_unit)
#         kp_overlap = 0
#
#         if len(toks) <= 8: continue
#
#         #ev_unit_kp = [i for i in keywords.keywords(ev_unit).split("\n")]
#         ev_unit_kp = [i[0] for i in kw_extractor.extract_keywords(ev_unit)]
#
#         if ev_unit_kp:
#             kp_overlap = overlap_score(evidence_kp=ev_unit_kp, adu_kp=adu_kp)
#
#         else: ev_unit_kp = None
#         yield ev_unit, ev_unit_kp, kp_overlap
#
# # pool = Pool(8)
# ### RANK PASSAGES ###
# def score_passages(ev_):
#     adu = ev_[0]["argument_discourse_unit"]
#     adu_stance = ev_[0]["adu_stance"]
#     merged_ev = ev_[0]["merged_evidence"]
#     adu_kp = ev_[0]["adu_keyphrases"]
#
#     ### CALCULATE OVERLAP ###
#     for ev_unit, ev_unit_kp, kp_overlap in calculate_overlap(merged_ev, adu_kp):
#         target = adu_kp[0]
#
#         compared_stace = compare_stance(ev_unit, target)
#         if compared_stace != adu_stance:
#             yield {
#                 "adu": adu,
#                 "adu_kp": adu_kp,
#                 "evidence_unit": ev_unit,
#                 "evidence_kps": ev_unit_kp,
#                 "overlap": kp_overlap,
#                 "evidence_stance": compare_stance(ev_unit, target),
#                 "adu_stance": adu_stance
#             }
#
#         else: continue
#
# ### SCORED EVIDENCE ###
# def score_evidence(retrieved_evidence):
#     for ev_ in retrieved_ev:
#         yield [i for i in score_passages(ev_)]
#
# ### RANKED EVIDENCE ###
# def rank_filter_counter_evidence(retireved_evidence, k=3):
#     with tqdm(total=(len(retrieved_ev))) as pbar:
#         for i in score_evidence(retrieved_ev):
#             yield sorted(i, key=lambda y: y["overlap"], reverse=True)[0:k]
#
#             pbar.update()
#
#
# ### SELECT TOP-K COUNTER-EVIDENCE ###
# tic = time.time()
# ranked_sorted_evidence = [i for i in rank_filter_counter_evidence(retrieved_ev)]
# ranked_sorted_evidence
# toc = time.time()
#
# print(toc - tic)
# # TIME 1:20M

100%|██████████| 100/100 [00:30<00:00,  3.23it/s]

30.97145128250122





In [250]:
# idx = 2
# for ln in retrieved_ev:
#     r = ln[0]
#     for _ in range(0, len(r["argument"])):
#         print(r["argument"][_]["sentence"])
#         print(r["argument"][_]["kp"])
#         print("")
#         print(r["retrieved"][_]["evidence"])
#         print(r["retrieved"][_]["kp"])

#"counter": {"counter": arg["counter"]["counter"], "counter_kp": arg["counter"]["counter_keyphrases"]}
# "argument_discourse_unit": adu,
# "query": query,
# "adu_keyphrases": [i for i in kp],
# "adu_stance": sentence_stance(adu, kp),
# "merged_evidence": ", ".join(ln for ln in evidence)
# "retrieved_documents_titles": titles,
# "retrieved_evidence": evidence,

Section I Why is Basic Income Increasingly Popular?
['basic income increasingly popular', 'basic income increasingly', 'section basic income increasingly']

Brazil. Minimum income has been increasingly accepted by the Brazilian government. In 2004, President Lula da Silva signed into law a bill to establish a universal basic income. Committee member Lady Rhys-Williams argued that the incomes for adults should be more like a basic income. She was also the first to develop the negative income tax model. Her son Brandon Rhys Williams proposed a basic income to a parliamentary committee in 1982, and soon after that in 1984, the Basic Income Research Group, now the Citizen's Basic Income Trust, began to conduct and disseminate research on basic income. Many technology experts and technology entrepreneurs have begun endorsing basic income in the 2000s and 2010s. These include Marshal Brain, Sam Altman, James Hughes, Facebook co-founder Chris Hughes, Elon Musk, and Mark Zuckerberg (in his 201

In [None]:
# # TODOs: Speed-up, Parrelleise, Yield
# def overlap_score(evidence_kp, adu_kp):
#     score = 0

#     # Split Keyphrase into components, scoring partial units as overlap
#     for i in evidence_kp:
#         for j in i.split():
#             # Ensure string value, to enact .find
#             if " ".join(adu_kp).find(j) != -1: score += 1

#             else: continue

#     return score

# ev_units = evidence
# adu_kp = extract_keyphrase(adu)

# adu_ev_overlap = []

# kp_1 = ['sex', 'relationship', 'opportunity']
# kp_2 = ['better sex']

# overlap_score(kp_2, kp_1)

# for ev_unit in evidence:
#     #print(ev_unit)
#     toks = tokeniser(ev_unit)

#     # Exprimental Value
#     if len(toks) <= 8:
#         continue

#     ev_unit_kp = extract_keyphrase(ev_unit)
#     kp_overlap = overlap_score(evidence_kp=ev_unit_kp, adu_kp=adu_kp)

#     adu_ev_overlap.append({
#         "adu": adu,
#         "adu_kp": adu_kp,
#         "ev_unit": ev_unit,
#         "ev_unit_kp": ev_unit_kp,
#         "kp_overlap": kp_overlap

#         })

# adu_ev_overlap

In [None]:
# ### OVERLAP RANKED EVIDENCE ###

# adu_ev_overlap.sort(key=lambda y: y["kp_overlap"], reverse=True)
# adu_ev_overlap

# ### FILTER IRRELEVANT EVIDENCE ###
# overlapping = [i for i in adu_ev_overlap if i["kp_overlap"] !=0]

# len(adu_ev_overlap), len(overlapping)
# overlapping


In [15]:
# Stance Test
# adu = 'I cant remember the topic that spurred this discussion but a friend and I were debating whether manmade things were natural.'
# ev_unit = 'In this essay, Mill argues the idea that the morality of an action can be judged by whether it is natural or unnatural.'
# target = 'natural things'
#
# stance = compare_stance(ev_unit, target)
# stance

'PRO'

In [None]:
# ### ASSERT SAME STANCE ###
# from detection.stance_classifier import sentence_stance, compare_stance
#
# # TODOs: Ensure KPs Extracts are constrained to 1 unit
# opposing_stance = []
# for i in overlapping:
#     adu = i["adu"]
#     target = " ".join(i for i in i["adu_kp"])
#     ev_unit = i["ev_unit"]
#
#     ev_stance = compare_stance(ev_unit, ev_unit, target)
#     adu_stance = sentence_stance(adu, target)
#
#     if ev_stance != adu_stance:
#         opposing_stance.append((ev_unit, ev_stance, adu_stance))
#
#     else: continue
#
# opposing_stance

In [None]:
### RANKING ###

# TODOs: Speed-up, Parrelleise, Yield
# ev_units = evidence
# adu_kp = extract_keyphrase(adu)

# adu_ev_overlap = []

# kp_1 = ['sex', 'relationship', 'opportunity'] 
# kp_2 = ['better sex']

# overlap_score(kp_2, kp_1)

# for ev_unit in evidence:
#     #print(ev_unit)
#     toks = tokeniser(ev_unit)

#     # Exprimental Value
#     if len(toks) <= 8:
#         continue
    
#     ev_unit_kp = extract_keyphrase(ev_unit)
#     kp_overlap = overlap_score(evidence_kp=ev_unit_kp, adu_kp=adu_kp)
    
#     adu_ev_overlap.append({
#         "adu": adu, 
#         "adu_kp": adu_kp,
#         "ev_unit": ev_unit,
#         "ev_unit_kp": ev_unit_kp, 
#         "kp_overlap": kp_overlap
        
#         })
        
# adu_ev_overlap


#rank_passages(retrieved_ev)

In [None]:
# import spacy
# from spacy.matcher import PhraseMatcher
# from fuzzywuzzy import fuzz, process

# # TODOs: Package as a Module
# # TODOs: Handle Negation (Polarity shifters)
# # TODOs: Review Unsuperived Approach; Consider adveanced patterns and common-sence knowledge

# nlp = spacy.load("en_core_web_sm")

# sentence = "I hate abortion rights. Abortions should be banned."
# sentence_2 = "I like abortion rights. I belive we should keep them."
# sentence_3 = "I hate tennis. People should play tennis more often"

# ### STANCE SCORING ###

# # TODOs: https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf 
# # TODOs: Pattern based Negation
# # TODOs: Semantic Orientation of an opinion (Claim)
# # TODOs:Group synonyms of 'features', 'targets'

# phrase_matcher = PhraseMatcher(nlp.vocab)

# ### SENTIMENT LEXICONS ###
# pos = [w.replace("\n", "") for w in open("../../data/lexicon/positive_lex.txt")]
# neg = [w.replace("\n", "") for w in open("../../data/lexicon/negative_lex.txt")]
# polarity_shifters = [w.replace("\n", "") for w in open("../../data/lexicon/shifter_lexicon.txt")]

# ### STANCE: ASPECT-SEMANTIC ORIENTATION ###
# def extract_aspect(sentence, n_gram):
#     aspects = extract_keyphrase(str(sentence))[0]

#     return nlp(aspects)

# def index_aspect(phrase, aspect, sentence):    
#     patterns = [nlp(aspect)]
#     phrase_matcher.add(phrase, None, *patterns)

#     start = 0
#     stop = 0

#     matched_phrases = phrase_matcher(sentence)
#     for i in matched_phrases:
#         _, start, stop = i
        
#     return start, stop

# # TODOs: Implement Polarity Shifters, Simple
# # TODOs: Implement Polarity Shifters, Complex, Verb Patterns
# def stance_score(start, stop, sentence):
#     pos_score = 0.0
#     neg_score = 0.0

#     score = 0
#     for idx, tok in enumerate(sentence):
#         if idx == start or idx == stop:
#             continue

#         # TODOs: Implement Polarity Shift
#         # TODOs: Experiement with descriptive term + keyphrase aspects
#         # TODOs: ABSA https://www.kaggle.com/code/phiitm/aspect-based-sentiment-analysis
#         # Use external libaray: Textblob
        
#         k = 8
#         # Negation Rules
#         shifted_tok = None
#         shifted_toks = []

#         if (tok.dep_ == "neg") or (tok.dep_ in polarity_shifters):
#             #Shift to Negative
#             if idx <= k:
#                 if idx < start: neg_score += 1/(start - idx)
#                 else: neg_score += 1/(idx - stop)**0.5

#             if shifted_tok != None and shifted_tok in neg:
#                 print(shifted_tok.text)
#                 # Shift to Positive
#                 if idx < start: pos_score += 1/(start - idx)
#                 elif idx > start: pos_score += 1/(idx - stop)**0.5
#                 else: continue

#         # Aspect Sentement Orientation
#         if tok.text in pos:
#             if tok in shifted_toks:
#                 continue
            
#             if idx < start: pos_score += 1/(start - idx)
#             else: pos_score += 1/(idx - stop)**0.5

#         if tok.text in neg:
#             if tok in shifted_toks:
#                 continue

#             if idx <= start: neg_score += 1/(start - idx)
#             else: neg_score += 1/(idx - stop)**0.5
    
#     score = pos_score - neg_score /(pos_score + neg_score + 1)

#     return score

# def overlap_score(evidence_kp, adu_kp):
#     score = 0
    
#     # Split Keyphrase into components, scoring partial units as overlap
#     for i in evidence_kp:
#         for j in i.split():
#             # Ensure string value, to enact .find
#             if " ".join(adu_kp).find(j) != -1: 
#                 score += 1
#                 token = j
            
#             else: continue
    
#     return score

# def get_overlapping_token(evidence_kp, adu_kp):
#     for i in evidence_kp:
#         overlap_tokens = []
#         for j in i.split():
#             if " ".join(adu_kp).find(j) != -1: 
#                 overlap_tokens.append(j) 
            
#         return " ".join(i for i in overlap_tokens)

# def sentence_stance(sentence, aspect):
#     sentence = nlp(sentence)

#     start, stop = index_aspect("aspects", aspect, sentence)
#     score = stance_score(start, stop, sentence)

#     # Add Neutral
#     #stance = {"claim": sentence, "stance": "PRO", "aspect": aspect} if score > 0 else {"claim": sentence, "stance": "CON", "aspect": aspect}
    
#     return "PRO" if score > 0 else "CON"

# def fuzzy_match(target, evidence_unit):

#     overlapping_aspect = process.extractOne(target, ev.split())[0]
#     score = overlapping_aspect[1]

#     overlapping_aspect = nlp(re.sub(r'[^\w]', ' ', overlapping_aspect))

#     return overlapping_aspect, score

# def compare_stance(ev_unit, evidence_aspect, adu_target):
#     # Note: Already identified mathcing or partially matching Aspects. 

#     # Get the overlapping evidence aspect-target.
#     overlapping_target, score = fuzzy_match(target=adu_aspect, evidence_unit=ev)
    
#     # Get position of the overlapping_target
#     start, stop = index_aspect("OVERLAP", nlp(overlapping_target), nlp(ev_unit))

#     # Assert Stance towards evidence aspect
#     score = stance_score(start, stop, nlp(ev_unit))
    
#     return "PRO" if score > 0 else "CON"

# ev = "These simple ideas and techniques could help both you and your lover enjoy sex. 1 / 10 Getty Images/Caiaimage Think beyond the thrust."
# ev_aspect = "sex", "relationship", "opportunity"

# adu = 'Hello! Let me preface by saying I dont believe there is a better sex.'
# adu_aspect = "better sex"

# print(sentence_stance("The mutual trust and understanding you share with your partner will lead to better sex, but that's not the only reason sex can be better when you're not in a relationship.", adu_aspect))
# print(compare_stance(ev, ev_aspect, adu_aspect))


In [None]:
# from spacy.matcher import DependencyMatcher, Matcher
# matcher = Matcher(vocab=nlp.vocab)
# matcher

# # Matching Rule: Pronouns with Verbs that follow them
# aspect = "better sex"
# patterns = [
#     [{"DEP": "neg"}, {"LOWER": aspect}],
#     [{"DEP": "neg"}, {"POS": "ADJ"}, {"LOWER": aspect}],
#     [{"POS": "VERB"}, {"POS": "ADJ"}, {"LOWER": aspect}],
#     [{"LOWER": aspect.lower()}]
# ]

# test = nlp("Hello! Let me preface by saying I dont believe there is a not better sex.")
# test_2 = nlp("These simple ideas and techniques could help both you and your lover enjoy better sex.")

# matcher.add("test", patterns=patterns)
# result = matcher(test_2, as_spans=True)

# result

# # for tok in test:
# #     print(tok.i, tok, tok.pos_, tok.dep_, tok.head.i, sep="\t")

In [None]:
### TARGETED RETRIEVAL: ATTACKING PEMISES ###

# from BERT_adu_classifier import predict

# premises = []
# for sent in sentences:
#     prediction = predict(sent)
    
#     if prediction == "premise":
#         premises.append(sent)