In [None]:
### ZERO-SHOT CLASSIFICATION, TOPIC LABELLING ###

# TODOs: Discovering Interpretable Topics by Leveraging Common Sense Knowledge (Facebook)
# TODOs: https://towardsdatascience.com/zero-shot-text-classification-with-hugging-face-7f533ba83cd6
# DONE: Extract args, ids for labelling
# DONE: Get Labels
# TODOs: NLI vs Semantic
# TODOs: Zero-shot Classification ConceptNet (semantic extension)
# TODOs: Expand: Concept Net synonyms

In [None]:
### ASSERT WORKING DIRECTORY FOR IMPORTS ###
import os
# Print the current working directory; the relative "../data/..." paths opened
# in later cells are resolved against it. Nothing is actually asserted here.
print(os.getcwd())

In [None]:
import pandas as pd
import json
import random
from tqdm.notebook import tqdm

In [None]:
### LOAD ###
def _read_jsonl(path):
    """Parse a JSON-lines file into a list holding one decoded object per line."""
    # The context manager closes the handle; the original opened the files
    # five times in total and never closed any of the handles.
    with open(path) as handle:
        return [json.loads(ln) for ln in handle]


# Each file is parsed once; the parallel lists are projected from the decoded
# records, so they stay index-aligned with each other.
_cmv_records = _read_jsonl("../data/cmv_processed.jsonl")
arg_ids = [record["id"] for record in _cmv_records]
args = [record["argument"]["argument"] for record in _cmv_records]
claims = [record["claim"] for record in _cmv_records]

_extract_records = _read_jsonl("../data/argument_extracts.jsonl")
extracts = [record["extract"] for record in _extract_records]
extract_ids = [record["id"] for record in _extract_records]

In [None]:
### SAMPLE ###
# random.randint includes both end points, so randint(0, len(args)) could
# return len(args) and raise IndexError. randrange excludes the upper bound.
# A named variable is used instead of `_`, which IPython reserves for the
# previous cell output.
sample_idx = random.randrange(len(args))
args[sample_idx]

In [None]:
### UNIQUE ARGUMENTS ###
idx = set()

# De-duplicate the (argument, id) pairs through a set, then keep them as a list.
corpus = list(set(zip(args, arg_ids)))
type(corpus), len(corpus)

In [None]:
### UNIQUE EXTRACTS LIST ###
idx = set()

# Unique (extract, id) pairs; the collection is left as a set, not a list.
extract_corpus = set(zip(extracts, extract_ids))

extract_corpus

In [None]:
### FORM NEW ARGUMENT-SET ###

# TODO: Return Domain Restricted
# Parse each file once inside a context manager so the handles get closed
# (the original opened three handles here and closed none of them).
with open("../data/cmv_processed.jsonl") as cmv_file:
    args = [json.loads(ln) for ln in cmv_file]
with open("../data/argument_extracts.jsonl") as extracts_file:
    extracts = [json.loads(ln) for ln in extracts_file]
# Ids are projected from the records just loaded, so `arg_ids[n]` belongs to `args[n]`.
arg_ids = [record["id"] for record in args]

def get_arg(id_):
    """Look up the record in `args` that shares its position with `id_` in `arg_ids`.

    The first occurrence of `id_` in `arg_ids` decides the position.
    Returns None when `id_` does not occur in `arg_ids`.
    """
    try:
        position = arg_ids.index(id_)
    except ValueError:
        # list.index reports a missing value with ValueError.
        return None
    return args[position]

# Attach every extract to the argument record that carries the same id.
args_w_extract = []
for extract_record in extracts:
    arg = get_arg(extract_record["id"])
    if arg is None:
        # get_arg returns None for ids that are absent from arg_ids; the
        # original then failed with TypeError on `arg["extract"] = ...`.
        # Extracts without a matching argument are skipped instead.
        continue

    # NOTE: this writes into the record object held in `args` (no copy is made).
    arg["extract"] = extract_record["extract"]
    args_w_extract.append(arg)

In [None]:
# Re-read the extract records; the context manager closes the file handle.
with open("../data/argument_extracts.jsonl") as extracts_file:
    extracts = [json.loads(ln) for ln in extracts_file]

extract_w_claims = []
for extract_record in extracts:
    source_arg = get_arg(extract_record["id"])

    # Store the claim field of the matching argument record (None when no
    # argument shares the id). The original stored the whole record here.
    extract_record["claim"] = source_arg["claim"] if source_arg is not None else None

    # Append the enriched record. The original appended only the bare extract
    # string, which discarded the claim that had just been attached.
    extract_w_claims.append(extract_record)

In [None]:
### UNIQUE CLAIMS ###
# claim_corpus = set()
# idx = set()
#
# for j, k in zip(claims, ids):
#     claim_corpus.add((j, k))
#
# claim_corpus = list(claim_corpus)
# type(claim_corpus), len(claim_corpus)

In [None]:
### COMMON-SENSE POLARISED TOPIC LABELS ###
import re
def clean(clean):
    """Normalise a line of text into a lowercase label string.

    Steps, in order: remove newline characters; remove apostrophes that sit
    between two lowercase letters; remove every character that is not an
    ASCII letter, whitespace, or one of ``. ! ?``; remove tokens starting
    with ``www``; collapse whitespace runs to one space; strip and lowercase.

    Args:
        clean: Raw text. The parameter keeps its original name so keyword
            callers keep working, even though it shadows the function inside
            the body.

    Returns:
        The cleaned, stripped, lowercased string.
    """
    text = clean
    text = re.sub(r"\n", "", text)
    text = re.sub(r"(?<=[a-z])'(?=[a-z])", "", text)
    # Raw strings: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r"[^a-zA-Z\s.!?]", "", text)

    # URL-like tokens are removed BEFORE whitespace is collapsed; the original
    # did it afterwards, which left a double space where the token had been.
    text = re.sub(r"www\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

#TODOs: Prune and Add CauseNet Topics
def _load_cleaned_lines(path):
    """Return `clean(line)` for every line of the text file at `path`."""
    # The context manager closes the handle; the original left seven handles open.
    with open(path) as handle:
        return [clean(ln) for ln in handle]


controversial_topics = _load_cleaned_lines("../data/concepts/wiki_controversial_topics.txt")
debate_topics = _load_cleaned_lines("../data/concepts/IBM_debate_topics_I.txt")
debate_topics_ = _load_cleaned_lines("../data/concepts/IBM_debate_topics_II.txt")
arg_kb_20 = _load_cleaned_lines("../data/concepts/argkb_20_topics.txt")
wiki_race = _load_cleaned_lines("../data/concepts/wiki_racism_related.txt")
wiki_ideologies = _load_cleaned_lines("../data/concepts/wiki_political_ideologies.txt")

concepts = _load_cleaned_lines("../data/concepts/cause_concepts.txt")

# debate_topics_ and arg_kb_20 are loaded above but left out of the label set.
topics = [
    *controversial_topics,
    *debate_topics,
    # *debate_topics_,
    *wiki_ideologies,
    # *arg_kb_20,
    *wiki_race,
]

len(topics)

In [None]:
# Display the combined topic label list as this cell's output.
topics

In [None]:
# Semantic-Search, Cosine Similarity
from sentence_transformers import SentenceTransformer, util
import torch

# Sentence-embedding model instance; semantic_search calls its .encode() method
# for both the corpus texts and each query.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
def semantic_search(corpus, query, threshold=0.30):
    """Label corpus entries with the queries they are the best match for.

    Each query string is embedded and compared, by cosine similarity, with
    every corpus text. The single best-scoring corpus entry for a query gets
    that query (lowercased) as its label, provided the score reaches
    `threshold`.

    Args:
        corpus: Collection (e.g. list or set) of ``(text, id)`` pairs.
        query: Iterable of label strings. A bare string is treated as one
            label rather than being split into characters.
        threshold: Minimum cosine similarity for a label to be assigned.

    Returns:
        Dict mapping every id to ``{"argument": text, "label": label}``.
        ``label`` is ``[]`` for entries no query selected, otherwise a
        lowercase string. When several queries select the same entry, the
        one processed last wins. An empty corpus yields ``{}``.
    """
    pairs = list(corpus)
    if not pairs:
        # zip(*[]) cannot be unpacked into two names, so the original raised
        # ValueError on an empty corpus. There is nothing to label here.
        return {}

    # Split the pairs into parallel text / id lists.
    texts, ids = (list(column) for column in zip(*pairs))

    # Embed the corpus once, up front.
    corpus_embeddings = embedder.encode(texts, convert_to_tensor=True)

    if isinstance(query, str):
        query = [query]
    # De-duplicate in first-seen order. The original used set(query), whose
    # iteration order for strings can change between interpreter runs; because
    # a later label overwrites an earlier one, results were not reproducible.
    labels = list(dict.fromkeys(query))

    mapped_dict = {
        entry_id: {"argument": text, "label": []}
        for text, entry_id in zip(texts, ids)
    }

    # Only the single best corpus match per label is considered.
    top_k = min(1, len(texts))

    with tqdm(total=len(labels)) as pbar:
        for label in labels:
            label_embedding = embedder.encode(label, convert_to_tensor=True)

            cos_scores = util.cos_sim(label_embedding, corpus_embeddings)[0]
            top_scores, top_positions = torch.topk(cos_scores, k=top_k)

            for score, position in zip(top_scores, top_positions):
                # 'Empirical' threshold
                if score >= threshold:
                    # `position` indexes texts and ids alike.
                    mapped_dict[ids[int(position)]]["label"] = label.lower()

            pbar.update()

    return mapped_dict

In [None]:
### EXTRACT LABELLED INSTANCES ###
def extract_labelled(dict_):
    """Collect the entries of a label mapping that carry a label.

    An entry counts as unlabelled when its "label" value equals the empty
    list. The result is a list of {"id", "argument", "label"} dicts in the
    mapping's iteration order.
    """
    return [
        {"id": key, "argument": entry["argument"], "label": entry["label"]}
        for key, entry in dict_.items()
        if entry["label"] != []
    ]

In [None]:
# TODOs: Reverse the Query
#sample = corpus[0:100]

### LABELLING: CONTROVERSIAL TOPICS ###
# threshold=0 accepts any best match whose cosine score is not negative.
extract_topics = semantic_search(
    corpus=extract_corpus,
    query=topics,
    threshold=0,
)

In [None]:
# Number of entries in the mapping vs. number of entries that received a label.
n_extract_entries = len(extract_topics)
n_extract_labelled = len(extract_labelled(extract_topics))
print(n_extract_entries, n_extract_labelled)
extract_topics

In [None]:
# print(len(arg_topics), len(corpus))
# print(len(arg_topics), len(extract_labelled(arg_topics)))
# arg_topics

In [None]:
# TODOs: Reverse the Query
#sample = corpus[0:100]

### LABELLING: CONTROVERSIAL TOPICS ###
# `claim_corpus` is not assigned by any visible earlier cell (its construction
# exists only as commented-out code), so this cell raised NameError. Build the
# unique (claim, id) pairs here from the parallel `claims` / `arg_ids` lists.
claim_corpus = list(set(zip(claims, arg_ids)))
claim_topics = semantic_search(corpus=claim_corpus, query=topics, threshold=0)

In [None]:
# Size of the label mapping vs. size of the claim corpus.
print(len(claim_topics), len(claim_corpus))
# The original referenced `arg_topics` on the next two lines; that name is not
# assigned in any visible cell, and this cell reports on `claim_topics`.
print(len(claim_topics), len(extract_labelled(claim_topics)))
claim_topics

In [None]:
### LABELLING: CONCEPTS ###
# No threshold is passed, so semantic_search's default applies.
arg_concepts = semantic_search(
    corpus=corpus,
    query=concepts,
)

In [None]:
# Number of entries in the mapping vs. number of entries that received a label.
n_concept_entries = len(arg_concepts)
n_concept_labelled = len(extract_labelled(arg_concepts))
n_concept_entries, n_concept_labelled

In [None]:
### STORE TOPIC LABELS CONCEPTS AND LABELS ###
# NOTE(review): `arg_topics` is not assigned in any visible cell of this
# notebook; confirm which labelling result is meant to be stored here.
# The context manager flushes and closes the file; the original never closed
# `fout`, so buffered lines could be missing from the file on disk.
with open("../data/argument_topic_concept.jsonl", "w") as fout:
    for arg_id, entry in arg_topics.items():
        fout.write(json.dumps({
            "id": arg_id,
            "argument": entry["argument"],
            "topic_label": entry["label"],
            #"concept_label": k["label"]
        }))
        fout.write("\n")

In [None]:
### STORE CONCEPT LABELS CONCEPTS AND LABELS ###
# One JSON object per line: id, argument text and the concept label.
# The context manager flushes and closes the file; the original never closed
# `fout`, so buffered lines could be missing from the file on disk.
with open("../data/argument_concepts.jsonl", "w") as fout:
    for arg_id, entry in arg_concepts.items():
        fout.write(json.dumps({
            "id": arg_id,
            "argument": entry["argument"],
            "concept_label": entry["label"],
        }))
        fout.write("\n")

In [None]:
# Report the id of every entry whose argument text is the empty string.
for arg_id, entry in arg_topics.items():
    is_blank = entry["argument"] == ""
    if is_blank:
        print("blanks", arg_id)

In [None]:
# Display the full concept-label mapping as this cell's output.
arg_concepts

In [None]:
### DOMAIN RESTRICTED ###



In [None]:
# ### QUERY EXPANSION ###
#
# # TODOs: Query Expansions [TypeOf, SimilarTerms, CanBe]
# # https://github.com/fitosegrera/python-conceptnet/blob/master/ConceptNet.py
# import json
# import urllib
#
# URL = "http://api.conceptnet.io/"
#
# # TODOs: Review. Similarity.
# class ConceptNet:
#
#     def __init__(self, api, l):
#         self.api = api
#         self.l = l
#
#     def search(self, lang, term):
#         url_to_search = self.api + "c/" + lang + "/" + term
#         data = urllib.request.urlopen(url_to_search)
#         json_data = json.load(data)
#         for i in json_data["edges"]:
#             print("----------------")
#             print(i["end"])
#             print("relation:", i["rel"])
#             print(i["surfaceEnd"])
#             print(i["surfaceStart"])
#             print("weight:", i["weight"])
#
#         return json_data
#
#     def get_relation(self, rel, concept):
#         url_to_search = self.api + f"search?node=/c/en/{concept}&rel=/r/{rel}"
#         data = urllib.request.urlopen(url_to_search)
#         obj_ = json.load(data)
#
#         labels = set()
#         for _ in obj_["edges"]:
#             labels.add((_["end"]["label"], _["weight"]))
#
#         return labels
#
#     def get_similar(self, concept):
#         res = []
#         rels = ["Synonym", "SimilarTo"]
#         for _ in rels:
#             res.extend(self.get_relation(_, concept))
#
#         return sorted(res, key=lambda x: x[1], reverse=True)[:self.l]

In [None]:
# concept_net = ConceptNet(api=URL, l=3)

In [None]:
### CONCEPT NET ###
# from src.utils_.concept_net_expansion import ConceptNet
# URL = "http://api.conceptnet.io/"
#
# concept_net = ConceptNet(URL, l=3)
# expansion_term = "carriage"
#
# test_concept = concept_net.get_similar(expansion_term)
# test_concept

In [None]:
### WORDNET ###
# from src.utils_.word_net_expansion import expand_query
#
# test_wn = expand_query(expansion_term)
# test_wn

In [None]:
### EXTRACT LABELLED INSTANCES ###
# def extract_labelled(dict_):
#     res = []
#     for i in dict_.items():
#         j, k = i
#
#         if k["label"] != "None":
#             res.append({"id": i[0], "argument": k["argument"], "label": k["label"]})
#     return res

# args_topic_labelled = extract_labelled(arg_topics)
# print(len(args_topic_labelled))
#
# args_topic_labelled

In [None]:
# args_topic_labelled = extract_labelled(arg_topics)
# print(len(args_topic_labelled))
#
# args_topic_labelled