In [None]:
### TODOs ###
# DONE: Implement Semantic Ranking
# TODOs: Commonsense Query and Concept Expansion: Topics, Concepts, Synonyms
# TODOs: Targeted Retrieval with NLI over ADUs, Premises, Claims; discard non-ADUs

# DONE: News Data
# DONE: Add Concepts
# DONE: Cosine Semantic Search
# DONE: Prior Pre-processing, tokenization and sentence segmentation to speed processing
# TODOs: Domain Restrict. Polarising social and political debate (Class labelling) only for higher-quality argument-knowledge set.
# TODOs: News, Political, Sociology and 'Good', 'Positive' counter-evidence Knowledge Base.
# TODOs: Bag of Topics Modelling
# TODOs: Implement as a Class

# TODOs: Keyphrase Selection
# DONE: Manage Duplicate Keywords
# DONE: Sentential Ranking
# DONE: Include Topic Label
# DONE: Include Concept Label
# DONE: Add News
# TODOs: Targeted Retrieval with Semantic Graphs
# TODOs: Target Argumentative Content Only
# TODOs: Targeted Argument Content: Adus + Extractive Summary
# TODOs: Query Expansion
# TODOs: Multi-Field Search
# TODOs: Additional News and Knowledge Sources

In [1]:
### LOGGING ###
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ARGUMENT-EXTRACTOR")

### NLP FUNCTIONS ###
from src.utils.utils import tokeniser, sentences_segment

### KEYPHRASE EXTRACTORS ###
from src.utils.keyphrase_extraction import yake_extract_keyphrase, summa_extract_keyphrase
import keybert

INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
INFO:KEYPHRASE_EXTRACTOR:[Test Keyphrase: ] 
 ['heathrow airport', 'environmental impact', 'aviation']


In [2]:
### CONNECT TO KNOWLEDGEBASE ###
from src.utils.elastic_db import ElasticDB

# Despite its name, PORT holds the full base URL of the Elasticsearch node, not just a port number.
PORT = "http://localhost:9200"
# Open the knowledge-base connection; `db` is the client that search() queries later in the notebook.
db = ElasticDB(elastic_port=PORT)

INFO:src.utils.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


In [None]:
### ADU CLASSIFIER ###
# import os
# path = "/Users/joshua.sheppard/PycharmProjects/countaBot/"
# os.chdir(path)

from src.detection.adu_classifier import predict

In [None]:
### LOAD DATASETS ###
import json
import os

# NOTE(review): absolute, machine-specific path; the relative paths below only resolve after this chdir.
root = "/Users/joshua.sheppard/PycharmProjects/countaBot"
os.chdir(root)


def _read_jsonl(path):
    """Parse a JSON-lines file into a list of objects, closing the file handle afterwards."""
    with open(path) as fin:
        return [json.loads(ln) for ln in fin]


args = _read_jsonl("./src/data/processed/cmv_processed.jsonl")
mined_args = _read_jsonl("./src/data/processed/cmv_argument_extraction.jsonl")
topics = _read_jsonl("./src/data/processed/argument_topic_concept.jsonl")
concepts = _read_jsonl("./src/data/processed/argument_concept.jsonl")

In [None]:
print("ARGS:", len(args), " MINED-ARGS:",  len(mined_args), " TOPICS:", len(topics), " CONCEPTS: ", len(concepts))

In [None]:
### INSPECT SUBJECT ARG ###
import random
# randrange excludes the upper bound; randint(0, len(x)) is inclusive and could
# return len(x), which raises IndexError on the lookups below.
_ = random.randrange(len(mined_args))

# Rebuild the full argument text from its segmented sentences.
arg = " ".join(i["sentence"] for i in mined_args[_]["argument"])
claim = mined_args[_]["claim"]["sentence"]

#print(mined_args[_])
print(_, "\n")
print("CLAIM: ", claim, "\n")
print("ARG: ", arg, "\n")

In [None]:
from tqdm.notebook import tqdm
import multiprocessing
from src.detection.stance_classifier import sentence_stance, compare_stance
from src.detection.stance_classifier import sentence_stance
import time
import re

# Disable Huggingface Logging
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Collect the argument id of every topic / concept record. The files are read
# inside `with` blocks so their handles are closed as soon as parsing finishes.
with open("./src/data/processed/argument_topic_concept.jsonl") as fin:
    topic_ids = [json.loads(ln)["id"] for ln in fin]
with open("./src/data/processed/argument_concept.jsonl") as fin:
    concept_ids = [json.loads(ln)["id"] for ln in fin]

def clean(phrase):
    """Replace each run of the marks ``,.;@#?!&$`` (and any spaces right after it) with one space."""
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

def get_notion(notions_ids, notions_lst, arg_id, label):
    """Look up the ``label`` field of the record that belongs to ``arg_id``.

    ``notions_ids`` and ``notions_lst`` are parallel sequences: the position of
    ``arg_id`` in the former selects the record in the latter.

    Returns the value converted to ``str``, or ``None`` when the value is falsy.
    Raises ``ValueError`` if ``arg_id`` is not present in ``notions_ids``.
    """
    position = notions_ids.index(arg_id)
    value = notions_lst[position][label]
    if not value:
        return None
    return str(value)

### RETRIEVER ###
# Self-assignment is a no-op; its only effect is an early NameError if `db` has not been created yet.
db = db
# Module-level list; nothing in the visible cells appends to it.
queries = []
# Receives one search() result per argument when the (currently commented-out) batch loop is run.
retrieved_ev = []

# TODOs: Argumentative Sentence
# TODOs: Query Expansion
def search(mined, type="tgt_counter", l=10):
    """Retrieve evidence passages for every sentence of one mined argument.

    For each entry of ``mined[type]`` a query string is built from that entry's
    keyphrases plus the argument's topic and concept labels (when present) and
    sent to the knowledge base through ``db.search``.

    Args:
        mined: dict with the keys "id", "claim", "argument" and ``type``; every
            entry of ``mined[type]`` must provide "sentence" and "kp".
        type: key of ``mined`` that holds the sentences to query for.
        l: number of hits requested per query (forwarded as ``k``).

    Returns:
        dict with the keys "id", "claim", "argument", "tgt_counter" (one
        ``{"sentence", "selected_keyphrases"}`` entry per queried sentence),
        "retrieved" (one ``{"passages", "kp", "source"}`` entry per queried
        sentence) and "adu_count".
    """
    id_ = mined["id"]
    # Read the claim from the argument being processed, not from whatever the
    # notebook-level variable `arg` happens to hold.
    claim = mined["claim"]

    topic = get_notion(topic_ids, topics, id_, "topic_label")
    concept = get_notion(concept_ids, concepts, id_, "concept_label")

    retrieved = []
    targeted_response = []
    adu_count = 0

    for adu in mined[type]:
        # ADU filtering with predict() is currently disabled, so every entry is counted.
        adu_count += 1

        # TODOs: Common-sense Query Expansion
        terms = list(adu["kp"])
        # Labels are appended as single items; extending with a bare string
        # would split it into characters.
        if topic:
            terms.append(topic)
        if concept:
            terms.append(concept)

        # dict.fromkeys drops duplicates while keeping a stable term order, so
        # the same argument always produces the same query string.
        query = ", ".join(dict.fromkeys(terms))

        hits = db.search(query_=query, k=l)
        source = [hit["_source"]["document"]["source"] for hit in hits]
        evidence = [hit["_source"]["document"]["text"] for hit in hits]

        # Keyphrases are extracted once over all passages returned for this sentence.
        merged = ", ".join(evidence)
        ev_kp = list(dict.fromkeys(yake_extract_keyphrase(merged)))

        retrieved.append({"passages": evidence, "kp": [clean(i) for i in ev_kp], "source": source})
        targeted_response.append({"sentence": adu["sentence"], "selected_keyphrases": []})

    # TODOs: Implement yield without storing list
    return {
        "id": id_,
        "claim": claim,
        "argument": mined["argument"],
        "tgt_counter": targeted_response,
        "retrieved": retrieved,
        "adu_count": adu_count,
    }

# SINGLE ARGUMENT INSPECT
# SAMPLE = unique_args[0]
# results = search(SAMPLE)

# tic = time.time()
# SAMPLE = mined_args

# retrieved_ev = []
# with multiprocessing.Pool(8) as pool:
#     with tqdm(total=(len(SAMPLE)), position=0, leave=True) as pbar:
#         for arg in SAMPLE:
#             retrieved_ev.append(search(arg))
#             pbar.update()
#     toc = time.time()

In [None]:
retrieved_ev[4]

In [None]:
import copy
retrieved_ev_ = copy.deepcopy(retrieved_ev)

In [None]:
duration = toc - tic
print("TIME", duration)
retrieved_ev[0]

In [None]:
len(retrieved_ev)

In [None]:
# TODOs: Check Counter, Argument params pre-process
# TODOs: Process Argument pairs fully; Constrain at train time

# Draw the index from the list that is actually indexed below. randrange
# excludes the upper bound, whereas randint(0, len(x)) could return len(x).
_ = random.randrange(len(retrieved_ev))

# NOTE: ADU Opinion Classifier reduces returned argument response. This is ok.
# Both counts refer to the same sampled record.
print("Argument", len(retrieved_ev[_]["argument"]), "Retrieved", len(retrieved_ev[_]["retrieved"]))

# NOTE: ADU Opinion Classifier reduces returned argument response. This is ok.
print("Counter", len(retrieved_ev[_]["tgt_counter"]))

In [None]:
# file_name = "cmv_retrieved"
# fout = open(f"./src/data/{file_name}.jsonl", "w")
#
# #with fout:
#     fout.write(json.dumps(retrieved_ev))
#
# logger.info(f"[{len(retrieved_ev)} Data Stored as {file_name}.jsonl]")

In [None]:
### REVIEW ###
# import os
# print(os.getcwd())

# Only the first parsed line is kept and then indexed like a list.
# NOTE(review): presumably the file stores the whole result list as one JSON line — confirm.
with open("./src/data/cmv_retrieved.jsonl", "r") as fin:
    retrieved_ev_ = [json.loads(ln) for ln in fin][0]

# Sample a valid index into the loaded records (randrange excludes the upper bound).
_ = random.randrange(len(retrieved_ev_))
print(retrieved_ev_[_]["argument"], "\n")
print(retrieved_ev_[_]["retrieved"], "\n")
print(retrieved_ev_[_]["tgt_counter"], "\n")

In [None]:
len(retrieved_ev_)

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch
import time
import copy

# TODOs: Discard equivalent stance, per sentence
model = SentenceTransformer('all-MiniLM-L6-v2')
def cosine_similarity_(sentences):
    """Score every sentence after the first against the first one.

    Args:
        sentences: list of strings; ``sentences[0]`` is the reference and the
            remaining items are the candidates.

    Returns:
        A list of ``(sentence, score)`` tuples, one per candidate and in input
        order, where ``score`` is the cosine similarity as a Python float.
    """
    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    cos = torch.nn.CosineSimilarity()
    # embeddings[:1] keeps the reference as a 2-D (1, dim) tensor so that the
    # default dim=1 reduction is well defined and broadcasts over the candidates.
    scores = cos(embeddings[:1], embeddings[1:])

    # tolist() yields Python floats on any device; .numpy() raises for tensors
    # that do not live on the CPU.
    return list(zip(sentences[1:], scores.tolist()))

def rank_passages(ev, k=3):
    """Keep, for every target sentence, the k retrieved sentences most similar to it.

    The passages of each ``ev["retrieved"]`` entry are split into sentences and
    pooled; the pool is scored against the matching ``ev["tgt_counter"]``
    sentence with ``cosine_similarity_`` and the k best sentences are joined
    into one string, from which keyphrases are extracted.

    Returns a list with one ``{"ranked_passages", "kp"}`` dict per target sentence.
    """
    # Compare TGT with RETRIEVED
    targets = [entry["sentence"] for entry in ev["tgt_counter"]]

    # One flat pool of candidate sentences per retrieval entry.
    candidate_pools = [
        [sent for passage in entry["passages"] for sent in sentences_segment(passage)]
        for entry in ev["retrieved"]
    ]

    ranked = []
    for target, pool in zip(targets, candidate_pools):
        # The target goes first: it is the reference the pool is scored against.
        scored = cosine_similarity_([target, *pool])
        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

        text = ", ".join(pair[0] for pair in best)
        ranked.append({"ranked_passages": text, "kp": yake_extract_keyphrase(text)})

    return ranked

# TODOs: Join passages and sentence rank
### SCORE COSINE SIMILARITY ###
# Time the full ranking pass; `tic` / `toc` are read by a later cell.
tic = time.time()
# Rank into a copy so the unranked records in retrieved_ev_ stay untouched.
retrieved_ranked = copy.deepcopy(retrieved_ev_)
counta = 0
with tqdm(total=len(retrieved_ev_), position=0, leave=True) as pbar:
    for idx, evidence in enumerate(retrieved_ev_):
        counta += 1
        # Replace the raw retrieval entries with their ranked counterparts.
        retrieved_ranked[idx]["retrieved"] = list(rank_passages(evidence))
        pbar.update()

toc = time.time()

In [None]:
retrieved_ranked_ = copy.deepcopy(retrieved_ranked)

In [None]:
subject = retrieved_ranked[2]

# Pair each counter sentence with its ranked evidence exactly once. Iterating
# over `subject` itself walks the dict's keys and would repeat the whole
# listing once per key.
for counter, evidence in zip(subject["tgt_counter"], subject["retrieved"]):
    print("COUNTER: ", counter, "\n")
    print("EVIDENCE: ", evidence, "\n")

In [None]:
duration = toc - tic
print(duration)
len(retrieved_ranked)

In [None]:
# NOTE: Zipping retrieved evidence, args
print(len(retrieved_ranked), len(retrieved_ev), len(args))

In [None]:
# Draw the index from the list being indexed; randrange excludes the upper
# bound, whereas randint(0, len(x)) could return len(x) and raise IndexError.
_ = random.randrange(len(retrieved_ranked))
retrieved_ranked[_]

In [None]:
print(_)

In [None]:
file_name = "cmv_rr_"

# Deep_copies
rr = copy.deepcopy(retrieved_ranked)

# Write one JSON record per line: without the trailing newline every record is
# concatenated onto a single line that cannot be parsed back.
# The file is opened in a `with` block so it is closed even if serialisation fails.
# NOTE(review): the INSPECT OUTPUT cell reads "cmv_rr.jsonl" (no trailing
# underscore) and keeps only its first line — confirm which file and layout it expects.
with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
    with tqdm(total=len(rr)) as pbar:
        for ln in rr:
            fout.write(json.dumps(ln) + "\n")
            pbar.update()

logger.info(f"[{len(rr)} Data Stored as {file_name}.jsonl]")

In [None]:
### INSPECT OUTPUT ###
# Only the first parsed line is kept.
# NOTE(review): presumably the file holds the whole record list as one JSON line — confirm.
# Reading inside `with` closes the file handle once parsing is done.
with open("./src/data/processed/cmv_rr.jsonl", "r") as fin:
    rr_ = [json.loads(ln) for ln in fin][0]

In [None]:
# randrange excludes the upper bound; randint(0, len(rr_)) could return
# len(rr_) and raise IndexError.
subject = rr_[random.randrange(len(rr_))]

print("CLAIM: ", subject["claim"]["sentence"])
print("===========================================\n")
# zip stops at the shortest of the three lists, so only aligned triples are shown.
for i, j, k in zip(subject["argument"], subject["tgt_counter"], subject["retrieved"]):
    print("ARG: ", i["sentence"], "\n")
    print("COUNTER: ", j["sentence"], "\n")
    print("EVIDENCE: ", clean(k["ranked_passages"]).lower(), "\n")

In [None]:
# DONE: Keyphrase Selection
# TODOs: Full-run, arguments
import copy

### KEYPHRASE SELECTION OBJECT ###
_rr = copy.deepcopy(rr_)

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import re
def clean(phrase):
    """Replace each run of the marks ``,.;@#?!&$`` (and any spaces right after it) with one space."""
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

def cosine_similarity_(sentences):
    """Score every sentence after the first against the first one.

    Args:
        sentences: list of strings; ``sentences[0]`` is the reference and the
            remaining items are the candidates.

    Returns:
        A list of ``(sentence, score)`` tuples, one per candidate and in input
        order, where ``score`` is the cosine similarity as a Python float.
    """
    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    cos = torch.nn.CosineSimilarity()
    # embeddings[:1] keeps the reference as a 2-D (1, dim) tensor so that the
    # default dim=1 reduction is well defined and broadcasts over the candidates.
    scores = cos(embeddings[:1], embeddings[1:])

    # tolist() yields Python floats on any device; .numpy() raises for tensors
    # that do not live on the CPU.
    return list(zip(sentences[1:], scores.tolist()))

def selected_keyphrases(arg, threshold=0.2):
    """Attach to each target sentence the retrieved keyphrases that are similar to it.

    For every aligned pair of ``arg["tgt_counter"]`` / ``arg["retrieved"]``
    entries, the entry's "kp" phrases are scored against the target sentence
    with ``cosine_similarity_``; phrases scoring strictly above ``threshold``
    are stored, de-duplicated, under the target's "selected_keyphrases" key.

    Args:
        arg: dict with "tgt_counter" (entries providing "sentence") and
            "retrieved" (entries providing "kp"). It is modified in place.
        threshold: minimum (exclusive) similarity for a phrase to be kept;
            the default of 0.2 matches the previously hard-coded cut-off.

    Returns:
        None.
    """
    kps = [entry["kp"] for entry in arg["retrieved"]]
    tgt_sentences = [entry["sentence"] for entry in arg["tgt_counter"]]

    selected_kps = []
    for tgt, kp in zip(tgt_sentences, kps):
        # The target sentence goes first: it is the reference the phrases are scored against.
        scored = cosine_similarity_([tgt, *kp])
        kept = [phrase for phrase, score in scored if score > threshold]
        # dict.fromkeys removes duplicates while keeping a stable order.
        selected_kps.append(list(dict.fromkeys(kept)))

    # Assign only after every sentence was scored, so a failure above leaves `arg` untouched.
    for target, chosen in zip(arg["tgt_counter"], selected_kps):
        target["selected_keyphrases"] = chosen

In [None]:
### SELECTED KEYPHRASES ###
# Import the progress-bar class itself: `import tqdm as tqdm` rebinds the name
# `tqdm` to the package, which breaks the `tqdm(total=...)` calls of earlier
# cells when they are re-run.
from tqdm.notebook import tqdm

SAMPLE = _rr
# `total` must be the number of items, not the list itself.
with tqdm(total=len(SAMPLE), position=0, leave=True) as pbar:
    for arg in SAMPLE:
        # Fills each record's "selected_keyphrases" in place.
        selected_keyphrases(arg)
        pbar.update()

In [None]:
_ = random.randint(0, 9)
_rr[_]["tgt_counter"]

In [None]:
# WORK WITH DEEP COPIES

def overlap_kp(string, sub):
    """Count the occurrences of ``sub`` in ``string``, overlapping ones included."""
    hits = 0
    position = string.find(sub)
    while position != -1:
        hits += 1
        # Resume one character after the last match so overlapping matches are counted too.
        position = string.find(sub, position + 1)
    return hits

## WORKING, YET REPLACED WITH SIMILARITY ###

# DONE: Similarity rank
# DONE: Add Stopwords
# stop = [i.strip() for i in open("./src/data/lexicon/stopwords.txt")]
# def selected_keyphrases(arg):
#     kps = [_["kp"] for _ in arg["retrieved"]]
#     tgt_sentences = [_["sentence"] for _ in arg["tgt_counter"]]
#
#     selected_kps = []
#
#     # Iterate per target sentence
#     for tgt, kp in zip(tgt_sentences, kps):
#         selected = []
#
#         for terms in kp:
#             singletons = terms.split()
#             for single in singletons:
#                 if single in stop:
#                     continue
#                 if overlap_kp(tgt.lower(), single) > 0:
#                     selected.append(terms)
#
#         selected_kps.append(list(set(selected)))
#
#     for _, j in zip(arg["tgt_counter"], selected_kps):
#         _["selected_keyphrases"] = j