In [None]:
### TODOs ###
# DONE: Implement Semantic Ranking
# TODOs: Commonsense Query and Concept Expansion: Topics, Concepts, Synonyms
# TODOs: Targeted Retrieval with NLI over ADUs, Premises, Claims; discard non-ADUs

# DONE: News Data
# DONE: Add Concepts
# DONE: Cosine Semantic Search
# DONE: Prior Pre-processing, tokenization and sentence segmentation to speed processing
# TODOs: Domain Restrict. Polarising social and political debate (Class labelling) only for higher-quality argument-knowledge set.
# TODOs: News, Political, Sociology and 'Good', 'Positive' counter-evidence Knowledge Base.
# TODOs: Bag of Topics Modelling
# TODOs: Implement as a Class

# TODOs: Keyphrase Selection
# DONE: Manage Duplicate Keywords
# DONE: Sentential Ranking
# DONE: Include Topic Label
# DONE: Include Concept Label
# DONE: Add News
# TODOs: Targeted Retrieval with Semantic Graphs
# TODOs: Target Argumentative Content Only
# TODOs: Targeted Argument Content: Adus + Extractive Summary
# TODOs: Query Expansion
# TODOs: Multi-Field Search
# TODOs: Additional News and Knowledge Sources

In [None]:
### INIT LOGGING ###
import logging
# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Named logger used by the later cells that report how many records were written.
logger = logging.getLogger("ARGUMENT-EXTRACTOR")

In [None]:
### INIT KNOWLEDGEBASE ###
from src.utils.elastic_db import ElasticDB

# NOTE: despite its name, PORT holds a full base URL (scheme, host and port), not just a port number.
PORT = "http://localhost:9200"
# Knowledge-base client; the retriever cell queries it through db.search(query_=..., k=...).
db = ElasticDB(elastic_port=PORT)

In [None]:
### NLP FUNCTIONS ###
from src.utils.utils import tokeniser, sentences_segment

In [None]:
### ADU CLASSIFIER ###
# import os
# path = "/Users/joshua.sheppard/PycharmProjects/countaBot/"
# os.chdir(path)

from src.detection.adu_classifier import predict

In [None]:
### LOAD DATASETS ###
import json
import os

# Project root: taken from the COUNTABOT_ROOT environment variable when set,
# otherwise the original developer path, so the notebook is not tied to one machine.
root = os.environ.get("COUNTABOT_ROOT", "/Users/joshua.sheppard/PycharmProjects/countaBot")
# Later cells open files through "./src/..." paths, so the working directory must be the root.
os.chdir(root)


def load_jsonl(path):
    """Parse a JSON-lines file into a list with one decoded object per line.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes; a bare ``open()`` inside a comprehension leaves it open
    until garbage collection.
    """
    with open(path) as handle:
        return [json.loads(ln) for ln in handle]


args = load_jsonl("./src/data/processed/cmv_processed.jsonl")
mined_args = load_jsonl("./src/data/processed/cmv_argument_extraction.jsonl")
topics = load_jsonl("./src/data/processed/argument_topic_concept.jsonl")
concepts = load_jsonl("./src/data/processed/argument_concept.jsonl")

# args = [json.loads(ln) for ln in open("../data/processed/cmv_processed.jsonl")]
# mined_args = [json.loads(ln) for ln in open("../data/processed/cmv_argument_extraction.jsonl")]
# topics = [json.loads(ln) for ln in open("../data/processed/argument_topic_concept.jsonl")]
# concepts = [json.loads(ln) for ln in open("../data/processed/argument_concept.jsonl")]

In [None]:
# Sanity check: number of records in each of the four loaded collections.
print("ARGS:", len(args), " MINED-ARGS:",  len(mined_args), " TOPICS:", len(topics), " CONCEPTS: ", len(concepts))

In [None]:
### OPERATE ON A UNIQUE SET ###
import pandas as pd

# def unique_entries(args, key="id"):
#     data_ = pd.DataFrame(args)
#     unique = data_.drop_duplicates(subset=key)
#
#     unique_ = []
#     for _, i in unique.iterrows():
#         unique_.append({
#             "id": i["id"],
#             "claim": i["claim"],
#             "argument": i["argument"],
#             "tgt_counter": i["tgt_counter"],
#         })
#
#     return unique_
#
# unique_args = unique_entries(mined_args)
# unique_args

In [None]:
#len(unique_args)

In [None]:
### INSPECT SUBJECT ARG ###
import random
# Fixed: random.randint is inclusive at both ends, so randint(0, len(x)) can return
# len(x) and raise IndexError; randrange(len(x)) only yields valid indices.
sample_idx = random.randrange(len(mined_args))

# Re-assemble the argument text from its segmented sentences.
arg = " ".join(i["sentence"] for i in mined_args[sample_idx]["argument"])
claim = mined_args[sample_idx]["claim"]["sentence"]

#print(mined_args[sample_idx])
print(sample_idx, "\n")
print("CLAIM: ", claim, "\n")
print("ARG: ", arg, "\n")

In [None]:
### KEYPHRASE EXTRACTORS ###
from src.utils.keyphrase_extraction import yake_extract_keyphrase, summa_extract_keyphrase
import keybert

# Smoke test: run both keyphrase extractors on a short, well-formed sentence.
test = "Brazil's minimum income has increasingly been accepted."
ev_kp = yake_extract_keyphrase(test)
ev_kp_ = summa_extract_keyphrase(test)

# Same two extractors on a whitespace-only string, to see how blank input is handled.
test_2 = " "
ev_kp_2 = yake_extract_keyphrase(test_2)
ev_kp_2_ = summa_extract_keyphrase(test_2)

print(ev_kp)
print(ev_kp_)

# Check that the extractors can handle blank input
print(ev_kp_2)
print(ev_kp_2_)

In [None]:
# def get_notion(notions_ids, notions_lst, arg_id, label):
#     notion_id = notions_ids.index(arg_id)
#     notion = notions_lst[notion_id][label]
#     return str(notion) if notion else None
#
# topic_ids = [json.loads(ln)["id"] for ln in open("./src/data/processed/argument_topic_concept.jsonl")]
#
# print(topic_ids.index("t3_30oi71"))
# print(topics[453])

In [None]:
from tqdm.notebook import tqdm
import multiprocessing
from src.detection.stance_classifier import sentence_stance, compare_stance
from src.detection.stance_classifier import sentence_stance
import time
import re

# Disable Huggingface Logging
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Fixed: build the id lookups from the records already loaded into `topics` and
# `concepts` (same two files) instead of re-opening and re-parsing them with
# unclosed handles. Deriving them from those lists also guarantees that
# position i of each id list matches position i of the list get_notion indexes.
topic_ids = [record["id"] for record in topics]
concept_ids = [record["id"] for record in concepts]

def clean(phrase):
    """Replace punctuation runs with a single space.

    Every run of one or more of the characters ``, . ; @ # ? ! & $``, together
    with any spaces that directly follow the run, becomes exactly one space.
    """
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

def get_notion(notions_ids, notions_lst, arg_id, label):
    """Look up one labelled value for an argument id.

    ``notions_ids`` and ``notions_lst`` are parallel lists: the position of
    ``arg_id`` in the former selects the record in the latter.

    Returns:
        ``str(value)`` of ``record[label]``, or ``None`` when that value is falsy.

    Raises:
        ValueError: if ``arg_id`` is not present in ``notions_ids``.
    """
    position = notions_ids.index(arg_id)
    value = notions_lst[position][label]
    if not value:
        return None
    return str(value)

### RETRIEVER ###
# No-op self-assignment: `db` is the client created in the knowledge-base cell.
db = db
# Initialised here but never written to in this cell.
queries = []
# Accumulator for search() results; the loop that fills it is commented out at the end of this cell.
retrieved_ev = []

# TODOs: Argumentative Sentence
# TODOs: Query Expansion
def search(mined, type="tgt_counter", l=10):
    """Retrieve knowledge-base evidence for every unit of one mined argument.

    For each unit in ``mined[type]`` a query is built from that unit's
    keyphrases plus the argument's topic and concept labels, the top ``l``
    hits are fetched from ``db``, and keyphrases are extracted from the
    merged hit texts.

    Args:
        mined: mined-argument record; must provide ``"id"``, ``"claim"``,
            ``"argument"`` and the field named by ``type``, whose items carry
            ``"sentence"`` and ``"kp"``.
        type: name of the field to iterate. It shadows the builtin, but the
            name is kept so existing keyword callers keep working.
        l: number of hits requested per query.

    Returns:
        dict with ``id``, ``claim``, ``argument``, ``tgt_counter`` (one
        ``{"sentence", "selected_keyphrases"}`` entry per unit), ``retrieved``
        (one ``{"passages", "kp", "source"}`` entry per unit) and
        ``adu_count``.

    Raises:
        ValueError: from ``get_notion`` when the record id is missing from the
            topic or concept id list.
    """
    id_ = mined["id"]
    # Fixed: the claim comes from the record being processed. The previous code
    # read the notebook-global `arg`, which is whatever an earlier cell or loop
    # last bound to that name.
    claim = mined["claim"]

    topic = get_notion(topic_ids, topics, id_, "topic_label")
    concept = get_notion(concept_ids, concepts, id_, "concept_label")

    retrieved = []
    targeted_response = []
    adu_count = 0
    for adu in mined[type]:
        sentence = adu["sentence"]
        # The ADU-classifier filter (predict(sentence) != "premise") is disabled,
        # so every unit is counted and queried.
        adu_count += 1

        # TODOs: Common-sense Query Expansion
        # Topic and concept are appended as whole strings (never unpacked into
        # characters) and only when present.
        terms = list(adu["kp"])
        if topic:
            terms.append(topic)
        if concept:
            terms.append(concept)

        # dict.fromkeys removes duplicates while keeping first-seen order, so the
        # query string is reproducible between runs (set() ordering is not).
        query = ", ".join(dict.fromkeys(terms))

        # Materialise the hits once; `hits` replaces a local that used to shadow
        # this function's own name.
        hits = list(db.search(query_=query, k=l))
        source = [hit["_source"]["document"]["source"] for hit in hits]
        evidence = [hit["_source"]["document"]["text"] for hit in hits]

        merged = ", ".join(evidence)
        ev_kp = dict.fromkeys(yake_extract_keyphrase(merged))

        retrieved.append({"passages": evidence, "kp": [clean(i) for i in ev_kp], "source": source})

        # Keyphrases are selected later by a separate cell; start with none.
        targeted_response.append({"sentence": sentence, "selected_keyphrases": []})

    # TODOs: Implement yield without storing list
    return {
        "id": id_,
        "claim": claim,
        "argument": mined["argument"],
        "tgt_counter": list(targeted_response),
        "retrieved": list(retrieved),
        "adu_count": adu_count,
    }

# SINGLE ARGUMENT INSPECT
# SAMPLE = unique_args[0]
# results = search(SAMPLE)

# tic = time.time()
# SAMPLE = mined_args

# retrieved_ev = []
# with multiprocessing.Pool(8) as pool:
#     with tqdm(total=(len(SAMPLE)), position=0, leave=True) as pbar:
#         for arg in SAMPLE:
#             retrieved_ev.append(search(arg))
#             pbar.update()
#     toc = time.time()

In [None]:
# NOTE(review): `retrieved_ev` is initialised to [] in the retriever cell and the loop that
# fills it is commented out there, so on a fresh kernel this lookup raises IndexError.
retrieved_ev[4]

In [None]:
import copy
retrieved_ev_ = copy.deepcopy(retrieved_ev)

In [None]:
# NOTE(review): the only earlier `tic` / `toc` assignments are inside the commented-out
# retrieval loop, so these names are undefined on a fresh kernel until that loop is restored.
duration = toc - tic
print("TIME", duration)
retrieved_ev[0]

In [None]:
len(retrieved_ev)

In [None]:
# TODOs: Check Counter, Argument params pre-proccess
# TODOs: Process Argument pairs fully; Constrain at train time

# Fixed: draw the index from the list that is actually indexed below, and use
# randrange (exclusive upper bound) so the index can never equal the length.
# Both counts now refer to the same record instead of a hard-coded entry 70.
sample_idx = random.randrange(len(retrieved_ev))

# NOTE: ADU Opinion Classifier reduces returned argument response. This is ok.
print("Argument", len(retrieved_ev[sample_idx]["argument"]), "Retrieved", len(retrieved_ev[sample_idx]["retrieved"]))

# NOTE: ADU Opinion Classifier reduces returned argument response. This is ok.
print("Counter", len(retrieved_ev[sample_idx]["tgt_counter"]))

In [None]:
# file_name = "cmv_retrieved"
# fout = open(f"./src/data/{file_name}.jsonl", "w")
#
# #with fout:
#     fout.write(json.dumps(retrieved_ev))
#
# logger.info(f"[{len(retrieved_ev)} Data Stored as {file_name}.jsonl]")

In [None]:
### REVIEW ###
# import os
# print(os.getcwd())

# The first line of the file is taken as the whole collection; the `with`
# block closes the handle once it has been read.
with open("./src/data/cmv_retrieved.jsonl", "r") as fin:
    retrieved_ev_ = [json.loads(ln) for ln in fin][0]

# Fixed: `review` was never defined; sample from the list that was just loaded,
# using randrange so the index is always in range.
sample_idx = random.randrange(len(retrieved_ev_))
print(retrieved_ev_[sample_idx]["argument"], "\n")
print(retrieved_ev_[sample_idx]["retrieved"], "\n")
print(retrieved_ev_[sample_idx]["tgt_counter"], "\n")

In [None]:
len(retrieved_ev_)

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch
import time
import copy

# TODOs: Discard equivalent stance, per sentence
model = SentenceTransformer('all-MiniLM-L6-v2')
def cosine_similarity_(sentences):
    """Score every sentence after the first against the first one.

    Args:
        sentences: sequence of strings; ``sentences[0]`` is the query and
            ``sentences[1:]`` are the candidates.

    Returns:
        list of ``(candidate, score)`` tuples in candidate order, where
        ``score`` is the cosine similarity between the candidate's embedding
        and the query's embedding as a Python float. Empty when there is no
        candidate to compare.
    """
    # Nothing to compare: skip encoding and avoid indexing into an empty batch.
    if len(sentences) < 2:
        return []

    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    # Keep the query as a one-row slice so both operands are 2-D and the
    # comparison along dim=1 broadcasts it against every candidate row.
    scores = torch.nn.functional.cosine_similarity(embeddings[:1], embeddings[1:], dim=1)

    # Fixed: .tolist() copies to host memory first, so this also works when the
    # embeddings live on a GPU (Tensor.numpy() raises for CUDA tensors).
    return list(zip(sentences[1:], scores.tolist()))

def rank_passages(ev, k=3):
    """Keep, for each target-counter sentence, its k most similar evidence sentences.

    The passages retrieved for a sentence are split into individual sentences,
    pooled, scored against the target sentence with ``cosine_similarity_`` and
    sorted by descending score.

    Args:
        ev: record with parallel lists ``ev["tgt_counter"]`` (items carry
            ``"sentence"``) and ``ev["retrieved"]`` (items carry ``"passages"``).
        k: number of top-scoring evidence sentences kept per target sentence.

    Returns:
        list with one ``{"ranked_passages", "kp"}`` dict per target sentence:
        the kept sentences joined with ``", "`` and the keyphrases extracted
        from that joined text.
    """
    counter_sentences = [unit["sentence"] for unit in ev["tgt_counter"]]

    # One flat pool of candidate sentences per retrieved entry.
    candidate_pools = [
        [sent for passage in entry["passages"] for sent in sentences_segment(passage)]
        for entry in ev["retrieved"]
    ]

    rank_retrieved = []
    for adu, pool in zip(counter_sentences, candidate_pools):
        # The target sentence goes first: the scorer compares everything else to it.
        scored = cosine_similarity_([adu, *pool])
        top_k = sorted(scored, key=lambda pair: pair[1], reverse=True)[0:k]

        joined = ", ".join(pair[0] for pair in top_k)
        rank_retrieved.append({"ranked_passages": joined, "kp": yake_extract_keyphrase(joined)})

    return rank_retrieved

# TODOs: Join passages and sentence rank
### SCORE COSINE SIMILARITY ###
tic = time.time()
# Deep copy so the ranked records never alias the retrieval output they are built from.
retrieved_ranked = copy.deepcopy(retrieved_ev_)
# Walk source and copy in lock-step. This replaces an index loop whose
# comprehension reused the index name `i`, and drops a counter that was never read.
for source_entry, ranked_entry in tqdm(
    zip(retrieved_ev_, retrieved_ranked),
    total=len(retrieved_ev_),
    position=0,
    leave=True,
):
    ranked_entry["retrieved"] = list(rank_passages(source_entry))

toc = time.time()

In [None]:
retrieved_ranked_ = copy.deepcopy(retrieved_ranked)

In [None]:
subject = retrieved_ranked[2]

# Fixed: the former outer `for i in subject:` iterated over the record's keys and
# re-printed every counter/evidence pair once per key.
for counter_unit, evidence in zip(subject["tgt_counter"], subject["retrieved"]):
    print("COUNTER: ", counter_unit, "\n")
    print("EVIDENCE: ", evidence, "\n")

In [None]:
duration = toc - tic
print(duration)
len(retrieved_ranked)

In [None]:
# NOTE: Zipping retrieved evidence, args
print(len(retrieved_ranked), len(retrieved_ev), len(args))

In [None]:
# Fixed: size the draw on the list being indexed and use randrange, whose upper
# bound is exclusive, so the index is always valid.
_ = random.randrange(len(retrieved_ranked))
retrieved_ranked[_]

In [None]:
print(_)

In [None]:
file_name = "cmv_rr_"

# Deep_copies
rr = copy.deepcopy(retrieved_ranked)

# Open the output only once the data is ready, so a failure above cannot leave
# behind a truncated file.
with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
    for ln in tqdm(rr, total=len(rr)):
        fout.write(json.dumps(ln))
        # Fixed: terminate every record; without the newline all objects ran
        # together on one line and the file was not valid JSON lines.
        fout.write("\n")

logger.info(f"[{len(rr)} Data Stored as {file_name}.jsonl]")

In [None]:
### INSPECT OUTPUT ###
# NOTE(review): this reads "cmv_rr.jsonl", whereas the writer cell above stores
# "cmv_rr_.jsonl" — confirm which file is intended.
# The first line is taken as the whole collection; the `with` block closes the handle.
with open("./src/data/processed/cmv_rr.jsonl", "r") as fin:
    rr_ = [json.loads(ln) for ln in fin][0]

In [None]:
# Fixed: randint(0, len(rr_)) is inclusive and could index one past the end.
subject = rr_[random.randrange(len(rr_))]

print("CLAIM: ", subject["claim"]["sentence"])
print("===========================================\n")
# zip stops at the shortest of the three lists, so only complete triples are shown.
for i, j, k in zip(subject["argument"], subject["tgt_counter"], subject["retrieved"]):
    print("ARG: ", i["sentence"], "\n")
    print("COUNTER: ", j["sentence"], "\n")
    print("EVIDENCE: ", clean(k["ranked_passages"]).lower(), "\n")

In [None]:
# DONE: Keyphrase Selection
# TODOs: Full-run, arguments
import copy

### KEYPHRASE SELECTION OBJECT ###
_rr = copy.deepcopy(rr_)

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import re
def clean(phrase):
    """Replace punctuation runs with a single space.

    Every run of one or more of the characters ``, . ; @ # ? ! & $``, together
    with any spaces that directly follow the run, becomes exactly one space.
    """
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

def cosine_similarity_(sentences):
    """Score every sentence after the first against the first one.

    Args:
        sentences: sequence of strings; ``sentences[0]`` is the query and
            ``sentences[1:]`` are the candidates.

    Returns:
        list of ``(candidate, score)`` tuples in candidate order, where
        ``score`` is the cosine similarity between the candidate's embedding
        and the query's embedding as a Python float. Empty when there is no
        candidate to compare.
    """
    # Nothing to compare: skip encoding and avoid indexing into an empty batch.
    if len(sentences) < 2:
        return []

    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    # Keep the query as a one-row slice so both operands are 2-D and the
    # comparison along dim=1 broadcasts it against every candidate row.
    scores = torch.nn.functional.cosine_similarity(embeddings[:1], embeddings[1:], dim=1)

    # Fixed: .tolist() copies to host memory first, so this also works when the
    # embeddings live on a GPU (Tensor.numpy() raises for CUDA tensors).
    return list(zip(sentences[1:], scores.tolist()))

def selected_keyphrases(arg, threshold=0.2):
    """Attach to each target sentence the evidence keyphrases that resemble it.

    Each target sentence is compared with the keyphrases of the evidence entry
    at the same position; keyphrases scoring above ``threshold`` are stored on
    the target under ``"selected_keyphrases"``.

    Args:
        arg: record with parallel lists ``arg["tgt_counter"]`` (items carry
            ``"sentence"``) and ``arg["retrieved"]`` (items carry ``"kp"``).
            Modified in place.
        threshold: exclusive lower bound on the cosine similarity a keyphrase
            needs to be kept. Defaults to 0.2, the value that was previously
            hard-coded.

    Returns:
        None; results are written into ``arg["tgt_counter"]``.
    """
    selected_kps = []
    for target, evidence in zip(arg["tgt_counter"], arg["retrieved"]):
        # The target sentence goes first: the scorer compares everything else to it.
        scored = cosine_similarity_([target["sentence"], *evidence["kp"]])
        kept = [phrase for phrase, score in scored if score > threshold]

        # dict.fromkeys de-duplicates while preserving order, so repeated runs
        # store the keyphrases in the same order (set() ordering is arbitrary).
        selected_kps.append(list(dict.fromkeys(kept)))

    # Assign only after every sentence has been scored, so a failure part-way
    # through leaves the record untouched.
    for target, kept in zip(arg["tgt_counter"], selected_kps):
        target["selected_keyphrases"] = kept

In [None]:
### SELECTED KEYPHRASES ###
# Fixed: `import tqdm as tqdm` rebound the name `tqdm` from the progress-bar
# class to the module, which breaks earlier cells calling `tqdm(...)` when they
# are re-run. Import the notebook progress bar under its usual name instead.
from tqdm.notebook import tqdm

SAMPLE = _rr
# Fixed: `total` must be a count; the list itself was being passed before.
for arg in tqdm(SAMPLE, total=len(SAMPLE), position=0, leave=True):
    # Mutates each record in place (fills "selected_keyphrases").
    selected_keyphrases(arg)

In [None]:
# Sample among the first ten records, as before, but never past the end of a
# shorter list (randint(0, 9) assumed at least ten records).
_ = random.randrange(min(10, len(_rr)))
_rr[_]["tgt_counter"]

In [None]:
# WORK WITH DEEP COPIES

def overlap_kp(string, sub):
    """Count occurrences of ``sub`` in ``string``, including overlapping ones.

    After each match the search resumes one character past the start of that
    match, so ``overlap_kp("aaaa", "aa")`` counts 3 rather than 2.
    """
    hits = 0
    position = string.find(sub)
    while position != -1:
        hits += 1
        position = string.find(sub, position + 1)
    return hits

## WORKING, YET REPLACED WITH SIMILARITY ###

# DONE: Similarity rank
# DONE: Add Stopwords
# stop = [i.strip() for i in open("./src/data/lexicon/stopwords.txt")]
# def selected_keyphrases(arg):
#     kps = [_["kp"] for _ in arg["retrieved"]]
#     tgt_sentences = [_["sentence"] for _ in arg["tgt_counter"]]
#
#     selected_kps = []
#
#     # Iterate per target sentence
#     for tgt, kp in zip(tgt_sentences, kps):
#         selected = []
#
#         for terms in kp:
#             singletons = terms.split()
#             for single in singletons:
#                 if single in stop:
#                     continue
#                 if overlap_kp(tgt.lower(), single) > 0:
#                     selected.append(terms)
#
#         selected_kps.append(list(set(selected)))
#
#     for _, j in zip(arg["tgt_counter"], selected_kps):
#         _["selected_keyphrases"] = j

In [None]:
#### TOY EXAMPLE: COMPARING SIMILARITY ###

counter = ["i dont see a comment on puberty yet so ill weigh in a bit.there is absolutely no reason to believe that all transwomen athletes have an advantage over their female counterparts as your assertion makes.the defining biological trigger that gives males an advantage over females is the presence of high amounts of testosterone"]

phrases = [
    'assertion makes the defining biological',
    'makes the defining biological trigger',
    'high amounts of testosterone',
    'assertion makes the defining',
    'makes the defining biological',
]

In [None]:
from sentence_transformers import SentenceTransformer

# NOTE: rebinds the global `model` that cosine_similarity_ reads, so that function
# uses this checkpoint until `model` is assigned again.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Fixed: build a new list instead of aliasing `counter`. The previous
# `embedding = counter` followed by `.extend(phrases)` mutated `counter` itself,
# so every re-run of this cell appended the phrases again.
embedding = counter + phrases

embedding


In [None]:
sentence_embeddings = model.encode(embedding)
result = cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)

result

In [None]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two parallel lists: sentences1[i] is compared with sentences2[i].
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Encode each list as a tensor of embeddings.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# All-pairs similarity matrix between the two lists.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Only the diagonal is reported: the score of each sentence with its counterpart.
for i, (left, right) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(left, right, cosine_scores[i][i]))

In [None]:
### EXTRACT LABELLED INSTANCES ###
# def extract_labelled(dict_):
#     res = []
#     count = 0
#     for i in dict_.items():
#         j, k = i
#
#         if k["selectec_keyphrases"] != []:
#             #res.append({"id": i[0], "argument": k["argument"], "label": k["label"]})
#             res.append(count)
#     return res
#
# counts = []
# for i in rr_train_:
#     sents = [_["sentence"] for _ in i["tgt_counter"]]
#     empties = [_ for _ in i["tgt_counter"] if _["selected_keyphrases"] != []]
#     counts.append((sents, empties))

# for i in rr_train_[0]["tgt_counter"]:
#     print(i["selected_keyphrases"])

# Fixed: rr_train_ is used as a list of argument records (the following cells
# iterate over it and index it with an integer), so a record has to be selected
# before reading "tgt_counter"; indexing the list with a string raises TypeError.
# NOTE(review): rr_train_ is not assigned in any cell visible here — presumably it
# should be the `_rr` list processed by selected_keyphrases; confirm before running.
for i in rr_train_[0]["tgt_counter"]:
    print(i)

    # Only the first entry is needed to inspect the structure.
    break
    # print(i["selected_keyphrases"])

In [None]:
for i in rr_train_:
    print(i["tgt_counter"])

In [None]:
import random
# Fixed: removed a stray "}" that made the cell a syntax error, sized the draw on
# `rr_train_` (the list actually indexed; `rr_train` is never defined), and used
# randrange so the index cannot equal the length.
_ = random.randrange(len(rr_train_))
rr_train_[_]["tgt_counter"]

In [None]:
print(_)

In [None]:
file_name = "cmv_rr_selected"

# Deep_copies
import copy
rr_selected = copy.deepcopy(rr_train_)

# Fixed: open the output only after the data exists. Opening in "w" mode first
# truncated any previous file even when the copy above failed.
with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
    # The whole collection is stored as one JSON array on a single line.
    fout.write(json.dumps(rr_selected))
    fout.write("\n")

logger.info(f"[{len(rr_selected)} Data Stored as {file_name}.jsonl]")

In [None]:
### PREVIOUS RETRIEVER ###

# from tqdm.notebook import tqdm
# import multiprocessing
# from src.detection.stance_classifier import sentence_stance, compare_stance
# from src.utils_.word_net_expansion import expand_query
# from src.detection.stance_classifier import sentence_stance
# # from multiprocessing.pool import ThreadPool as Pool
# import time
#
# # Disable Huggingface Logging
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# ### RETRIEVER ###
# db = db
# queries = []
# retrieved_ev = []
#
# topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
# concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]
#
# # TODOs: Argumentative Sentence
# # TODOs: Query Expansion
# def search(mined):
#     id_ = mined["id"]
#     # print("\n", id_)
#
#     topic = arg["argument"][0]["topic"]
#     concept = arg["argument"][0]["concept"]
#
#     retrieve_len = 5
#     retrieved = []
#
#     # for adu in mined["argument"]:
#     for adu in mined["tgt_counter"]:
#
#         sentence = adu["sentence"]
#         if len(tokeniser(sentence)) <= 8:
#             continue
#
#         kp = list(set(adu["kp"]))
#         #print(kp)
#         # topic = adu["topic"]
#         # concept = adu["concept"]
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#         # print(kp)
#
#         query = ", ".join(i for i in kp)
#
#         search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
#         source = [i[0] for i in search]
#         evidence = [i[1] for i in search]
#
#         merged = ", ".join(i for i in evidence)
#         ev_kp = list(set(yake_extract_keyphrase(merged)))
#
#         retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})
#
#     # TODOs: Implement yield without storing list
#     return ({
#         "id": id_,
#         "argument": mined["argument"],
#         "tgt_counter": mined["tgt_counter"],
#         "retrieved": [i for i in retrieved],
#     })
#
# # SINGLE ARGUMENT INSPECT
# # SAMPLE = unique_args[0]
# # results = search(SAMPLE)
#
# tic = time.time()
# SAMPLE = unique_args[0:100]
#
# retrieved_ev = []
# with tqdm(total=(len(SAMPLE)), position=0, leave=True) as pbar:
#     for arg in SAMPLE:
#         retrieved_ev.append(search(arg))
#         pbar.update()
# toc = time.time()

In [None]:
### QUERY EXPANSION ###
# TODOs: ConceptNet, pre-processing
# TODOs: WordNet, faster, also pre-processing
# from src.utils_.concept_net_expansion import ConceptNet
# sample = mined_args[5]
#
# concept_net = ConceptNet(api="http://api.conceptnet.io/", l=5)
# print(sample["claim"]["kp"])
# concept_ = sample["claim"]["kp"][0]

# terms = concept_.split()
# import os
# print(os.getcwd())
# stop = [i.strip() for i in open("./src/data/lexicon/stopwords.txt")]
# #print(stop)
#
# terms_ = [i for i in terms if i not in stop]
# expansion = [concept_net.get_similar(i) for i in terms_]
#
# expansion

In [None]:
# fout = open("../data/cmv_rr.jsonl", "w")
#
# args = [json.loads(ln) for ln in open("../data/cmv_processed.jsonl")]
# sample = args[0:sample]
#
# # for i, j in zip(retrieved_ranked, sample):
# #     # Add counter to the dictionary (implicitly, i)
# #     i["counter"] = j["counter"]
# #     fout.write(json.dumps(i))
# #     fout.write("\n")

In [None]:
# OLD
# kp = adu["kp"]
# topic = adu["topic"]
# concept = adu["concept"]
#
# kp.append(topic) if topic else kp
# kp.append(concept) if concept else kp
#
# query = ", ".join(i for i in adu["kp"])
# print(query)
#
# # TODOs: Add title field for all ES indices to enable multi-field search
# search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
# #evidence = [i[1] for i in search]
# #source = [i[0] for i in search]
#
# evidence = [i[1] for i in search]
# ev_kp = yake_extract_keyphrase(evidence)
#
#         # try:
#         #     ev_kp = yake_extract_keyphrase(evidence)
#         # except:
#         #     ev_kp = [" "]
#
#         #retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})
#         #retrieved.append({"passages": evidence})
#
#
#
#     # TODOs: Implement yield without storing list
#     return ({
#         "id": id_,
#         "argument": mined["argument"],
#         "retrieved": [i for i in retrieved]
#     })
#
# for arg in unique_args[0:10]:
#     retrieved_ev.append(search(arg))
#
# # SAMPLE = unique_args[0:100]
# #
# # step = max(int(len(SAMPLE) / 10), 1)
# # BATCHES = [SAMPLE[i:i + step] for i in range(0, len(SAMPLE), step)]
# #
# # retrieved_ev = []
# # for idx, batch in enumerate(BATCHES):
# #     print('-' * 25 + 'Batch %d/%d' % (idx + 1, len(batch)) + '-' * 25)
# #
# #     with multiprocessing.Pool(8) as pool:
# #         with tqdm(total=(len(batch))) as pbar:
# #             for arg in batch:
# #                 retrieved_ev.append(search(arg))
# #                 pbar.update()

In [None]:
# SINGLE SEARCH FUNCTION
####

# from tqdm.notebook import tqdm
# import multiprocessing
# from src.detection.stance_classifier import sentence_stance, compare_stance
# from src.utils_.word_net_expansion import expand_query
# from src.detection.stance_classifier import sentence_stance
# # from multiprocessing.pool import ThreadPool as Pool
# import time
#
# # Disable Huggingface Logging
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# ### RETRIEVER ###
# db = db
# queries = []
# retrieved_ev = []
#
# topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
# concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]
#
# import random
# _ = random.randint(0, len(unique_args))
# sample = unique_args[_]
#
# # TODOs: Argumentative Sentence
# def search(mined):
#     id_ = mined["id"]
#     print("\n", id_)
#
#     retrieve_len = 5
#     retrieved = []
#     for adu in mined["argument"]:
#
#         sentence = adu["sentence"]
#         if len(tokeniser(sentence)) <= 8:
#             continue
#
#         kp = list(set(adu["kp"][0:5]))
#         topic = adu["topic"]
#         concept = adu["concept"]
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#
#         query = ", ".join(i for i in adu["kp"])
#         print(query)
#         print(" ")
#         search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
#         source = [i[0] for i in search]
#         evidence = [i[1] for i in search]
#
#         merged = ", ".join(i for i in evidence)
#         ev_kp = yake_extract_keyphrase(merged)
#         print(ev_kp)
#
#         retrieved.append({"passages": evidence, "kp": [i for i in ev_kp], "source": source})
#
#     # TODOs: Implement yield without storing list
#     return ({
#         "id": id_,
#         "argument": mined["argument"],
#         "retrieved": [i for i in retrieved]
#     })
#
# result = search(sample)

In [None]:
# ### CHECK BLANKS ###
# args_ = [json.loads(ln)["argument"]["argument"] for ln in open("../data/cmv_processed.jsonl")]
# ids = [json.loads(ln)["id"] for ln in open("../data/cmv_processed.jsonl")]
#
# for j, k in zip(args_, ids):
#     if j == "":
#         print("blanks", j, k)

In [None]:
# from src.detection.stance_classifier import sentence_stance, compare_stance
# from src.utils_.word_net_expansion import expand_query
# from src.detection.stance_classifier import sentence_stance
# import multiprocessing
# import json
# import time
#
# # TODOs: Adu, Counter + KP Extraction as 'Argument Mining' preprocessing module
#
# # Disable Huggingface Logging
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
#
# topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
# concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]
#
# def get_notion(notions_ids, notions_lst, arg_id, label):
#     notion_id = notions_ids.index(arg_id)
#     notion = notions_lst[notion_id][label]
#     return str(notion) if notion else None
#
# def extract_adus(arg_):
#     arg, id_ = arg_
#     print("\n", id_)
#
#     topic = get_notion(topic_ids, topics, id_, "topic_label")
#     concept = get_notion(concept_ids, concepts, id_, "concept_label")
#
#     adu_sents = sentences_segment(arg)
#
#     adus = []
#     for _ in adu_sents:
#         if len(tokeniser(_)) <= 8:
#             continue
#
#         try:
#             kp = extract_keyphrase(_)
#         except:
#             kp = [" "]
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#         print(kp)
#
#         adu = {"sentence": _, "kp": [i for i in kp], "stance": sentence_stance(_, kp[0])}
#
#         adus.append(adu)
#
#     yield ({
#         "id": id_,
#         "argument": [i for i in adus]
#     })
#
# step = max(int(len(unique) / 10), 1)
# batches = [unique[i:i + step] for i in range(0, len(unique), step)]
#
# mined_args = []
# # TODOs: Remove Huggingface Warnings
#
# for idx, batch in enumerate(batches):
#     print('-' * 25 + 'Batch %d/%d' % (idx + 1, len(batches)) + '-' * 25)
#
#     with multiprocessing.Pool(8) as pool:
#         with tqdm(total=(len(batch))) as pbar:
#             for arg in batch:
#                 mined_args.append([i for i in extract_adus(arg)])
#                 pbar.update()

In [None]:
# def get_topic(arg_id):
#     topic_id = topic_ids.index(arg_id)
#     topic = topics[topic_id]["topic_label"]
#     return str(topic) if topic else None
#
# def get_concept(arg_id):
#     concept_id = concept_ids.index(arg_id)
#     concept = concepts[concept_id]["concept_label"]
#     return str(concept) if concept else None

In [None]:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

# TODOs: Fix Vectorizer Issue
# kb = KeyBERT()
# vectorizer = KeyphraseCountVectorizer()
# def extract_keyphrase(doc, n_gram=3, n_kp=3, use_mmr="False", use_maxsum="False"):
#     try:
#         kp = kb.extract_keywords(doc, keyphrase_ngram_range=(0, 3), stop_words="english", diversity=0.3,)
#         kp_ = kb.extract_keywords(doc, vectorizer=vectorizer, stop_words="english", diversity=0.3)
#
#     except:
#         return [" "]
#
#     # Concatenate, remove duplicates
#     kp = kp + kp_
#     kp = [i[0] for i in kp]
#     kp = list(set(kp))
#
#     return kp

In [None]:
# # TODOs: Compute in Batches
# sample = unique[0:100]
# with multiprocessing.Pool(8) as pool:
#     with tqdm(total=(len(unique))) as pbar:
#         for arg in unique:
#             mined_args.append([i for i in extract_adus(arg)])
#             pbar.update()

In [None]:
# def retrieved_evidence(mined, retrieve_len=5):
#     """ Retrieves Evidence from Knowledge base, returning a well-formed Retrieved Evidence Object
#     given an input Argument"""
#
#     id_ = mined["id"]
#     print("\n", id_)
#
#     retrieved = []
#     adus = []
#     for _ in mined["argument"]:
#         if len(tokeniser(_)) <= 8:
#             continue
#
#         kp = extract_keyphrase(_)
#         print(kp)
#         adu = {"sentence": _, "kp": [i for i in kp], "stance": sentence_stance(_, kp[0])}
#
#         kp.append(topic) if topic else kp
#         kp.append(concept) if concept else kp
#
#         query = ", ".join(i for i in kp)
#         print(query)
#
#         # TODOs: Add title field for all ES indices to enable multi-field search
#         search = [(i["_source"]["document"]["source"], i["_source"]["document"]["text"]) for i in db.search(query_=query, k=retrieve_len)]
#
#         evidence = [i[1] for i in search]
#         source = [i[0] for i in search]
#
#         ev_kp = extract_keyphrase(evidence)
#
#         retrieved.append({"passages": evidence, "kp": [i[0] for i in ev_kp], "source": source})
#         #retrieved.append({"passages": evidence, "source": source})
#         adus.append(adu)
#
#     return ({
#         "id": id_,
#         "argument": [i for i in adus],
#         "retrieved": [i for i in retrieved]
#     })

In [None]:
# from sentence_transformers import SentenceTransformer, util
# from sklearn.metrics.pairwise import cosine_similarity
# import torch
# import time
#
# model = SentenceTransformer('all-MiniLM-L6-v2')
# def cosine_similarity_(sentences):
#     embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)
#
#     cos = torch.nn.CosineSimilarity()
#     scores = cos(embeddings[0], embeddings[1:])
#
#     scored = []
#     retrieved_sentences = sentences[1:]
#     for sent, similarity in zip(retrieved_sentences, scores):
#         scored.append((sent, similarity.numpy().item()))
#
#     return scored
#
# def rank_passages(ev, k=3):
#     adus = [i["sentence"] for i in ev["argument"]]
#     retrieved_passages = [i["passages"] for i in ev["retrieved"]]
#
#     # Merge
#     # Output 1 x merged sentences object per ADU sentence, with k=5 collected passages as a list of sentences
#     merged_passages = []
#     for passages in retrieved_passages:
#         merged_sents = []
#         # Iterate n x sentences for each k=5 retrieved passages
#         for passage in passages:
#             # Segment as a list of sentences
#             sents = sentences_segment(passage)
#             # Add sentences to merged_sentences object
#             merged_sents.extend(sents)
#
#         # Store merged sentence object for each ADU
#         merged_passages.append(merged_sents)
#
#     rank_retrieved = []
#     # Rank n x merged sentences for each 1 x ADU
#     for adu, merged in zip(adus, merged_passages):
#         scored = []
#         sentences = [adu]
#         sentences.extend(merged)
#         scored = cosine_similarity_(sentences)
#
#         ranked_sents = sorted(scored, key=lambda x: x[1], reverse=True)
#
#         # Select top-k sentences
#         ranked_sents = ranked_sents[0:k]
#
#         merged = ", ".join(i[0] for i in ranked_sents)
#         merged_kp = extract_keyphrase(merged)
#         rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})
#
#     return rank_retrieved
#
# # TODO: Clean text
# # TODO: Collect unique Keyphrases per Argument
# rank_passages(retrieved_ev[0])
# #print(len(test["ranked_passages"][0]))

In [None]:
# Handle duplicates
# def rank_passages(ev, k=3):
#     """ Handles a Retrieved Evidence Object, returning the top-k passages for each ADU """
#     # Per Argument
#     # Index into Retrieved Evidence Object
#     adus = [i for i in ev[0]["argument"]]
#     retrieved = [i for i in ev[0]["retrieved"]]
#
#     #print(len(retrieved), len(adus))
#
#     # Rank k-returned passages for each ADU
#     r_retrieved = []
#     for adu, passage in zip(adus, retrieved):
#         scored = []
#         ranked_ev = []
#         for _, kp in zip(passage["evidence"], passage["kp"]):
#             scored.append((_, kp, cosine_similarity(str(adu), str(_))))
#
#         scored = sorted(scored, key=lambda x: x[2], reverse=True)[0:3]
#         for i, j, k in scored:
#             ranked_ev.append({"evidence": i, "kp": j, "similarity": k})
#
#         r_retrieved.append(ranked_ev)
#
#     return r_retrieved

# 1 Argument x 4 ADUs x 5 Retrieved Passages
# ranked = [i for i in rank_passages(retrieved_ev[3])]
# print(len(ranked))
# print(ranked)

In [None]:
# from sentence_transformers import SentenceTransformer, util
# import torch
# import time
#
# model = SentenceTransformer('all-MiniLM-L6-v2')
#
# # TODOs: Join passages and sentence rank
# ### SCORE COSINE SIMILARITY ###
# def cosine_similarity(sent_1, sent_2):
#     sentences = [sent_1, sent_2]
#     embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)
#
#     cos = torch.nn.CosineSimilarity(dim=0)
#     score = cos(embeddings[0], embeddings[1])
#
#     return score.numpy().item()
#
# ### SCORE TF-KEYWORD OVERLAP ###
# def overlap_score(evidence_kp, adu_kp):
#     score = 0
#     # Split Keyphrase into components, scoring partial units as overlap
#     for i in evidence_kp:
#         for j in i.split():
#             # Ensure string value, to enact .find
#             if ", ".join([i for i in adu_kp]).find(j) != -1: score += 1
#
#             else: continue
#     return score
#
# ### RANK PASSAGES ###
# def score_passages(ev):
#     for _ in range(0, len(ev["argument"])):
#         print(_)
#
# from collections import defaultdict
# def rank_passages(ev, k=2):
#     adus = [i for i in ev["argument"]]
#     retrieved = [i for i in ev["retrieved"]]
#
#     rank_retrieved = []
#     count = 0
#
#     for adu, passages in zip(adus, retrieved):
#         count += 1
#         scored = []
#
#         # 5 passages
#         for passage in passages["passages"]:
#             score = cosine_similarity(str(adu), str(passage))
#             scored.append((passage, score))
#
#         ranked_passages = sorted(scored, key=lambda x: x[1], reverse=True)
#         ranked_passages = ranked_passages[0:k]
#
#         merged = ", ".join(i[0] for i in ranked_passages)
#         merged_kp = extract_keyphrase(merged)
#         rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})
#
#     return rank_retrieved
#
# import copy
# ### UPDATE RETRIEVED OBJECT ###
# # for i in range(0, len(retrieved_ev)):
# #     retrieved_ranked[i]["retrieved"] = [i for i in rank_passages(retrieved_ev[i])]
#
# tic = time.time()
# retrieved_ranked = copy.deepcopy(retrieved_ev)
# with tqdm(total=(len(retrieved_ev)), position=0, leave=True) as pbar:
#     for i in range(0, len(retrieved_ev)):
#         retrieved_ranked[i]["retrieved"] = [i for i in rank_passages(retrieved_ev[i])]
#     pbar.update()
#
# toc = time.time()
# # duration = toc - tic
#
# retrieved_ranked

In [None]:
# Fix KW extraction
# Fix Duplicates
# def rank_passages(ev, k=2):
#     adus = [i for i in ev["argument"]]
#     retrieved = [i for i in ev["retrieved"]]
#
#     rank_retrieved = []
#     count = 0
#
#     for adu, passages in zip(adus, retrieved):
#         count += 1
#         scored = []
#
#         # 5 passages
#         for passage in passages["passages"]:
#             score = cosine_similarity(str(adu), str(passage))
#             scored.append((passage, score))
#
#         ranked_passages = sorted(scored, key=lambda x: x[1], reverse=True)
#         ranked_passages = ranked_passages[0:k]
#
#         merged = ", ".join(i[0] for i in ranked_passages)
#         merged_kp = extract_keyphrase(merged)
#         rank_retrieved.append({"ranked_passages": merged, "kp": merged_kp})
#
#     return rank_retrieved
#
# ev = retrieved_ev[0]
# rank_passages(ev)

In [None]:
# def fuck_you():
#     print("fuck you")
#
# fuck_you()
#
# def rank_(ev):
#     # Index into Retrieved Evidence Object
#     ev = ev[0]
#     adus = [i for i in ev[0]["argument"]]
#     retrieved = [i for i in ev[0]["retrieved"]]
#     k = 3
#     print("hello")
#     # # Rank k-returned passages for each ADU
#     # count = 0
#     # r_retrieved = []
#     # for adu, passage in zip(adus, retrieved):
#     #     count += 1
#     #     ranked_passages = []
#     #     for _ in passage["evidence"]:
#     #         print(_)
#     #         ranked_passages.append((_, cosine_similarity(adu, _)))
#     #         r_retrieved.append({"evidence": i, "similarity": k} for i, k in sorted(ranked_passages, key=lambda x: x[1], reverse=True)[0:k])
#     #         print(r_retrieved)
#
#     # return {
#     #     "r_retrieved": r_retrieved
#     # }
#
# #print(rank_(retrieved_ev[0:1]))

In [None]:
# from multiprocessing.pool import ThreadPool as Pool
# from yake import KeywordExtractor
# import tqdm.notebook as tqdm
# import time
# from summa import keywords
# from tqdm import tqdm
#
# ### PASSAGE RANKING; KEYWORD OVERLAP ###
# kw_extractor = KeywordExtractor(lan="en", n=3, top=5)
#
# # TODOs: For each ADU, Rank Merged Evidence using Keyword Overlap and Filter for Contrasting Stance
# # TODOs: Handle Multiple Keywords
#
# def overlap_score(evidence_kp, adu_kp):
#     score = 0
#     # TODOs: Robust 'None' handling
#     if adu_kp == None:
#         return score
#     # Split Keyphrase into components, scoring partial units as overlap
#     else:
#         for i in evidence_kp:
#             for j in i.split():
#                 # Ensure string value, to enact .find
#                 if ", ".join([i for i in adu_kp]).find(j) != -1: score += 1
#
#                 else: continue
#
#     return score
#
# def calculate_overlap(merged_ev, adu_kp):
#
#     for ev_unit in sentences_segment(merged_ev):
#         toks = tokeniser(ev_unit)
#         kp_overlap = 0
#
#         if len(toks) <= 8: continue
#
#         #ev_unit_kp = [i for i in keywords.keywords(ev_unit).split("\n")]
#         ev_unit_kp = [i[0] for i in kw_extractor.extract_keywords(ev_unit)]
#
#         if ev_unit_kp:
#             kp_overlap = overlap_score(evidence_kp=ev_unit_kp, adu_kp=adu_kp)
#
#         else: ev_unit_kp = None
#         yield ev_unit, ev_unit_kp, kp_overlap
#
# # pool = Pool(8)
# ### RANK PASSAGES ###
# def score_passages(ev_):
#     adu = ev_[0]["argument_discourse_unit"]
#     adu_stance = ev_[0]["adu_stance"]
#     merged_ev = ev_[0]["merged_evidence"]
#     adu_kp = ev_[0]["adu_keyphrases"]
#
#     ### CALCULATE OVERLAP ###
#     for ev_unit, ev_unit_kp, kp_overlap in calculate_overlap(merged_ev, adu_kp):
#         target = adu_kp[0]
#
#         compared_stace = compare_stance(ev_unit, target)
#         if compared_stace != adu_stance:
#             yield {
#                 "adu": adu,
#                 "adu_kp": adu_kp,
#                 "evidence_unit": ev_unit,
#                 "evidence_kps": ev_unit_kp,
#                 "overlap": kp_overlap,
#                 "evidence_stance": compare_stance(ev_unit, target),
#                 "adu_stance": adu_stance
#             }
#
#         else: continue
#
# ### SCORED EVIDENCE ###
# def score_evidence(retrieved_evidence):
#     for ev_ in retrieved_ev:
#         yield [i for i in score_passages(ev_)]
#
# ### RANKED EVIDENCE ###
# def rank_filter_counter_evidence(retireved_evidence, k=3):
#     with tqdm(total=(len(retrieved_ev))) as pbar:
#         for i in score_evidence(retrieved_ev):
#             yield sorted(i, key=lambda y: y["overlap"], reverse=True)[0:k]
#
#             pbar.update()
#
#
# ### SELECT TOP-K COUNTER-EVIDENCE ###
# tic = time.time()
# ranked_sorted_evidence = [i for i in rank_filter_counter_evidence(retrieved_ev)]
# ranked_sorted_evidence
# toc = time.time()
#
# print(toc - tic)
# # TIME 1:20M

In [None]:
# idx = 2
# for ln in retrieved_ev:
#     r = ln[0]
#     for _ in range(0, len(r["argument"])):
#         print(r["argument"][_]["sentence"])
#         print(r["argument"][_]["kp"])
#         print("")
#         print(r["retrieved"][_]["evidence"])
#         print(r["retrieved"][_]["kp"])

#"counter": {"counter": arg["counter"]["counter"], "counter_kp": arg["counter"]["counter_keyphrases"]}
# "argument_discourse_unit": adu,
# "query": query,
# "adu_keyphrases": [i for i in kp],
# "adu_stance": sentence_stance(adu, kp),
# "merged_evidence": ", ".join(ln for ln in evidence)
# "retrieved_documents_titles": titles,
# "retrieved_evidence": evidence,

In [None]:
# # TODOs: Speed-up, Parallelise, Yield
# def overlap_score(evidence_kp, adu_kp):
#     score = 0

#     # Split Keyphrase into components, scoring partial units as overlap
#     for i in evidence_kp:
#         for j in i.split():
#             # Ensure string value, to enact .find
#             if " ".join(adu_kp).find(j) != -1: score += 1

#             else: continue

#     return score

# ev_units = evidence
# adu_kp = extract_keyphrase(adu)

# adu_ev_overlap = []

# kp_1 = ['sex', 'relationship', 'opportunity']
# kp_2 = ['better sex']

# overlap_score(kp_2, kp_1)

# for ev_unit in evidence:
#     #print(ev_unit)
#     toks = tokeniser(ev_unit)

#     # Experimental Value
#     if len(toks) <= 8:
#         continue

#     ev_unit_kp = extract_keyphrase(ev_unit)
#     kp_overlap = overlap_score(evidence_kp=ev_unit_kp, adu_kp=adu_kp)

#     adu_ev_overlap.append({
#         "adu": adu,
#         "adu_kp": adu_kp,
#         "ev_unit": ev_unit,
#         "ev_unit_kp": ev_unit_kp,
#         "kp_overlap": kp_overlap

#         })

# adu_ev_overlap

In [None]:
# ### OVERLAP RANKED EVIDENCE ###

# adu_ev_overlap.sort(key=lambda y: y["kp_overlap"], reverse=True)
# adu_ev_overlap

# ### FILTER IRRELEVANT EVIDENCE ###
# overlapping = [i for i in adu_ev_overlap if i["kp_overlap"] !=0]

# len(adu_ev_overlap), len(overlapping)
# overlapping


In [None]:
# Stance Test
# adu = 'I cant remember the topic that spurred this discussion but a friend and I were debating whether manmade things were natural.'
# ev_unit = 'In this essay, Mill argues the idea that the morality of an action can be judged by whether it is natural or unnatural.'
# target = 'natural things'
#
# stance = compare_stance(ev_unit, target)
# stance

In [None]:
# ### ASSERT SAME STANCE ###
# from detection.stance_classifier import sentence_stance, compare_stance
#
# # TODOs: Ensure KPs Extracts are constrained to 1 unit
# opposing_stance = []
# for i in overlapping:
#     adu = i["adu"]
#     target = " ".join(i for i in i["adu_kp"])
#     ev_unit = i["ev_unit"]
#
#     ev_stance = compare_stance(ev_unit, ev_unit, target)
#     adu_stance = sentence_stance(adu, target)
#
#     if ev_stance != adu_stance:
#         opposing_stance.append((ev_unit, ev_stance, adu_stance))
#
#     else: continue
#
# opposing_stance

In [None]:
### RANKING ###

# TODOs: Speed-up, Parallelise, Yield
# ev_units = evidence
# adu_kp = extract_keyphrase(adu)

# adu_ev_overlap = []

# kp_1 = ['sex', 'relationship', 'opportunity'] 
# kp_2 = ['better sex']

# overlap_score(kp_2, kp_1)

# for ev_unit in evidence:
#     #print(ev_unit)
#     toks = tokeniser(ev_unit)

#     # Experimental Value
#     if len(toks) <= 8:
#         continue
    
#     ev_unit_kp = extract_keyphrase(ev_unit)
#     kp_overlap = overlap_score(evidence_kp=ev_unit_kp, adu_kp=adu_kp)
    
#     adu_ev_overlap.append({
#         "adu": adu, 
#         "adu_kp": adu_kp,
#         "ev_unit": ev_unit,
#         "ev_unit_kp": ev_unit_kp, 
#         "kp_overlap": kp_overlap
        
#         })
        
# adu_ev_overlap


#rank_passages(retrieved_ev)

In [None]:
# import spacy
# from spacy.matcher import PhraseMatcher
# from fuzzywuzzy import fuzz, process

# # TODOs: Package as a Module
# # TODOs: Handle Negation (Polarity shifters)
# # TODOs: Review Unsupervised Approach; Consider advanced patterns and common-sense knowledge

# nlp = spacy.load("en_core_web_sm")

# sentence = "I hate abortion rights. Abortions should be banned."
# sentence_2 = "I like abortion rights. I belive we should keep them."
# sentence_3 = "I hate tennis. People should play tennis more often"

# ### STANCE SCORING ###

# # TODOs: https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf 
# # TODOs: Pattern based Negation
# # TODOs: Semantic Orientation of an opinion (Claim)
# # TODOs: Group synonyms of 'features', 'targets'

# phrase_matcher = PhraseMatcher(nlp.vocab)

# ### SENTIMENT LEXICONS ###
# pos = [w.replace("\n", "") for w in open("../../data/lexicon/positive_lex.txt")]
# neg = [w.replace("\n", "") for w in open("../../data/lexicon/negative_lex.txt")]
# polarity_shifters = [w.replace("\n", "") for w in open("../../data/lexicon/shifter_lexicon.txt")]

# ### STANCE: ASPECT-SEMANTIC ORIENTATION ###
# def extract_aspect(sentence, n_gram):
#     aspects = extract_keyphrase(str(sentence))[0]

#     return nlp(aspects)

# def index_aspect(phrase, aspect, sentence):    
#     patterns = [nlp(aspect)]
#     phrase_matcher.add(phrase, None, *patterns)

#     start = 0
#     stop = 0

#     matched_phrases = phrase_matcher(sentence)
#     for i in matched_phrases:
#         _, start, stop = i
        
#     return start, stop

# # TODOs: Implement Polarity Shifters, Simple
# # TODOs: Implement Polarity Shifters, Complex, Verb Patterns
# def stance_score(start, stop, sentence):
#     pos_score = 0.0
#     neg_score = 0.0

#     score = 0
#     for idx, tok in enumerate(sentence):
#         if idx == start or idx == stop:
#             continue

#         # TODOs: Implement Polarity Shift
#         # TODOs: Experiment with descriptive term + keyphrase aspects
#         # TODOs: ABSA https://www.kaggle.com/code/phiitm/aspect-based-sentiment-analysis
#         # Use external library: TextBlob
        
#         k = 8
#         # Negation Rules
#         shifted_tok = None
#         shifted_toks = []

#         if (tok.dep_ == "neg") or (tok.dep_ in polarity_shifters):
#             #Shift to Negative
#             if idx <= k:
#                 if idx < start: neg_score += 1/(start - idx)
#                 else: neg_score += 1/(idx - stop)**0.5

#             if shifted_tok != None and shifted_tok in neg:
#                 print(shifted_tok.text)
#                 # Shift to Positive
#                 if idx < start: pos_score += 1/(start - idx)
#                 elif idx > start: pos_score += 1/(idx - stop)**0.5
#                 else: continue

#         # Aspect Sentiment Orientation
#         if tok.text in pos:
#             if tok in shifted_toks:
#                 continue
            
#             if idx < start: pos_score += 1/(start - idx)
#             else: pos_score += 1/(idx - stop)**0.5

#         if tok.text in neg:
#             if tok in shifted_toks:
#                 continue

#             if idx <= start: neg_score += 1/(start - idx)
#             else: neg_score += 1/(idx - stop)**0.5
    
#     score = pos_score - neg_score /(pos_score + neg_score + 1)

#     return score

# def overlap_score(evidence_kp, adu_kp):
#     score = 0
    
#     # Split Keyphrase into components, scoring partial units as overlap
#     for i in evidence_kp:
#         for j in i.split():
#             # Ensure string value, to enact .find
#             if " ".join(adu_kp).find(j) != -1: 
#                 score += 1
#                 token = j
            
#             else: continue
    
#     return score

# def get_overlapping_token(evidence_kp, adu_kp):
#     for i in evidence_kp:
#         overlap_tokens = []
#         for j in i.split():
#             if " ".join(adu_kp).find(j) != -1: 
#                 overlap_tokens.append(j) 
            
#         return " ".join(i for i in overlap_tokens)

# def sentence_stance(sentence, aspect):
#     sentence = nlp(sentence)

#     start, stop = index_aspect("aspects", aspect, sentence)
#     score = stance_score(start, stop, sentence)

#     # Add Neutral
#     #stance = {"claim": sentence, "stance": "PRO", "aspect": aspect} if score > 0 else {"claim": sentence, "stance": "CON", "aspect": aspect}
    
#     return "PRO" if score > 0 else "CON"

# def fuzzy_match(target, evidence_unit):

#     overlapping_aspect = process.extractOne(target, ev.split())[0]
#     score = overlapping_aspect[1]

#     overlapping_aspect = nlp(re.sub(r'[^\w]', ' ', overlapping_aspect))

#     return overlapping_aspect, score

# def compare_stance(ev_unit, evidence_aspect, adu_target):
#     # Note: Already identified matching or partially matching Aspects.

#     # Get the overlapping evidence aspect-target.
#     overlapping_target, score = fuzzy_match(target=adu_aspect, evidence_unit=ev)
    
#     # Get position of the overlapping_target
#     start, stop = index_aspect("OVERLAP", nlp(overlapping_target), nlp(ev_unit))

#     # Assert Stance towards evidence aspect
#     score = stance_score(start, stop, nlp(ev_unit))
    
#     return "PRO" if score > 0 else "CON"

# ev = "These simple ideas and techniques could help both you and your lover enjoy sex. 1 / 10 Getty Images/Caiaimage Think beyond the thrust."
# ev_aspect = "sex", "relationship", "opportunity"

# adu = 'Hello! Let me preface by saying I dont believe there is a better sex.'
# adu_aspect = "better sex"

# print(sentence_stance("The mutual trust and understanding you share with your partner will lead to better sex, but that's not the only reason sex can be better when you're not in a relationship.", adu_aspect))
# print(compare_stance(ev, ev_aspect, adu_aspect))


In [None]:
# from spacy.matcher import DependencyMatcher, Matcher
# matcher = Matcher(vocab=nlp.vocab)
# matcher

# # Matching Rule: Pronouns with Verbs that follow them
# aspect = "better sex"
# patterns = [
#     [{"DEP": "neg"}, {"LOWER": aspect}],
#     [{"DEP": "neg"}, {"POS": "ADJ"}, {"LOWER": aspect}],
#     [{"POS": "VERB"}, {"POS": "ADJ"}, {"LOWER": aspect}],
#     [{"LOWER": aspect.lower()}]
# ]

# test = nlp("Hello! Let me preface by saying I dont believe there is a not better sex.")
# test_2 = nlp("These simple ideas and techniques could help both you and your lover enjoy better sex.")

# matcher.add("test", patterns=patterns)
# result = matcher(test_2, as_spans=True)

# result

# # for tok in test:
# #     print(tok.i, tok, tok.pos_, tok.dep_, tok.head.i, sep="\t")

In [None]:
### TARGETED RETRIEVAL: ATTACKING PREMISES ###

# from BERT_adu_classifier import predict

# premises = []
# for sent in sentences:
#     prediction = predict(sent)
    
#     if prediction == "premise":
#         premises.append(sent)