In [1]:
### ZERO-SHOT CLASSIFICATION, TOPIC LABELLING ###

# TODOs: Discovering Interpretable Topics by Leveraging Common Sense Knowledge (Facebook)
# TODOs: https://towardsdatascience.com/zero-shot-text-classification-with-hugging-face-7f533ba83cd6
# DONE: Extract args, ids for labelling
# DONE: Get Labels
# TODOs: NLI vs Semantic
# TODOs: Zero-shot Classification ConceptNet (semantic extension)
# TODOs: Expand: Concept Net synonyms

In [2]:
### ASSERT WORKING DIRECTORY FOR IMPORTS ###
from src.utils.utils import clean
import os
print(os.getcwd())

/Users/joshua.sheppard/PycharmProjects/countaBot/src/detection


In [3]:
import pandas as pd
import json
import random
from tqdm.notebook import tqdm

In [10]:
### LOAD ###
def _read_jsonl(path):
    """Parse the JSON-lines file at `path` into a list of records and close the file."""
    with open(path) as fin:
        return [json.loads(ln) for ln in fin]


# Parse each file once and derive the parallel, line-aligned lists from the records.
cmv_records = _read_jsonl("../data/processed/cmv_processed.jsonl")
arg_ids = [rec["id"] for rec in cmv_records]
args = [rec["argument"]["argument"] for rec in cmv_records]
claims = [rec["claim"] for rec in cmv_records]

hate_records = _read_jsonl("../data/processed/hate_cleaned.jsonl")
hate_speech = [rec["hate"] for rec in hate_records]
hate_ids = [rec["id"] for rec in hate_records]

In [11]:
# Sanity check: number of hate-speech records loaded
len(hate_speech)

8867

In [6]:
### UNIQUE ARGUMENTS ###
# De-duplicate (argument, id) pairs. dict.fromkeys keeps first-seen order, so the
# resulting list is the same on every run (iteration order of a set of strings is not).
idx = set()  # NOTE(review): not read by any visible cell; kept in case another cell expects it
corpus = list(dict.fromkeys(zip(args, arg_ids)))

type(corpus), len(corpus)

(list, 5990)

In [7]:
### COMMON-SENSE POLARISED TOPIC LABELS ###

#TODOs: Prune and Add CauseNet Topics
def _load_terms(path):
    """Return one `clean`-ed entry per line of the text file at `path`, closing the file."""
    with open(path) as fin:
        return [clean(ln) for ln in fin]


controversial_topics = _load_terms("../data/concepts/wiki_controversial_topics.txt")
debate_topics = _load_terms("../data/concepts/IBM_debate_topics_I.txt")
debate_topics_ = _load_terms("../data/concepts/IBM_debate_topics_II.txt")
arg_kb_20 = _load_terms("../data/concepts/argkb_20_topics.txt")
wiki_race = _load_terms("../data/concepts/wiki_racism_related.txt")
wiki_ideologies = _load_terms("../data/concepts/wiki_political_ideologies.txt")

concepts = _load_terms("../data/concepts/cause_concepts.txt")

# Candidate topic labels, concatenated in a fixed order; entries that occur in
# more than one source list are kept as duplicates.
topics = [
    *controversial_topics,
    *debate_topics,
    # *debate_topics_,  # loaded above but currently left out of the label set
    *wiki_ideologies,
    *arg_kb_20,
    *wiki_race,
]

len(topics)

4780

In [8]:
# Display the assembled topic labels
topics

['invasion of iraq',
 'abortion',
 'affirmative action',
 'african national congress',
 'alberta separatism',
 'american civil liberties union',
 'american hunters and shooters association',
 'history of the jews in the united states',
 'american jews',
 'anarchism',
 'anarchocapitalism',
 'antiamericanism',
 'anticlericalism',
 'antiirish racism',
 'antisemitism',
 'asian american',
 'atheism',
 'austrian school',
 'bashar assad',
 'syrian civil war',
 'bidoon social class',
 'black lives matter',
 'black supremacy',
 'boricua popular army',
 'brexit',
 'british national party',
 'capitalism',
 'capital punishment',
 'catalonia',
 'cbc news',
 'cherokee',
 'china',
 'east turkestan independence movement',
 'tibetan independence movement',
 'human rights in china',
 'chinese intelligence activity abroad',
 'christian right',
 'plame affair',
 'plame affair timeline',
 'communism',
 'communist party of china',
 'communist government',
 'communist state',
 'cnn',
 'conservatism in the un

In [16]:
# Semantic-Search, Cosine Similarity
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')
def semantic_search(corpus, ids, query, threshold=0.10):
    """Tag documents with the queries they match best, using cosine similarity.

    Every distinct query is embedded with the module-level `embedder` and
    compared against every document of `corpus`. For each query, the single
    most similar document receives that query (lower-cased) as its label,
    provided the similarity is at least `threshold`.

    Args:
        corpus: Iterable of documents to embed; aligned element-wise with `ids`.
        ids: Iterable of hashable identifiers, one per document (used as dict keys).
        query: Iterable of candidate label strings; duplicates are ignored.
        threshold: Minimum cosine similarity required to assign a label.

    Returns:
        dict mapping each id to {"argument": <document>, "label": <label>}.
        "label" is [] for a document that was never the top match of any
        query, otherwise a lower-cased query string. When several queries
        select the same document, a later query overwrites an earlier one.

    Raises:
        ValueError: If `corpus` and `ids` differ in length.
    """
    corpus_ = list(corpus)
    id_ = list(ids)

    # A length mismatch would pair documents with the wrong ids, so fail early.
    if len(corpus_) != len(id_):
        raise ValueError(
            f"corpus and ids must have the same length (got {len(corpus_)} and {len(id_)})"
        )

    mapped_dict = {
        doc_id: {"argument": doc, "label": []}
        for doc_id, doc in zip(id_, corpus_)
    }

    # De-duplicate in first-seen order: the processing order decides which label
    # wins when two queries pick the same document, so it has to be reproducible.
    queries = list(dict.fromkeys(query))

    # Nothing to embed or nothing to match against: every label stays empty.
    if not corpus_ or not queries:
        return mapped_dict

    # Embed both sides in batches rather than one encode call per query.
    corpus_embeddings = embedder.encode(corpus_, convert_to_tensor=True)
    query_embeddings = embedder.encode(queries, convert_to_tensor=True)

    with tqdm(total=len(queries)) as pbar:
        for label, query_embedding in zip(queries, query_embeddings):
            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

            # Only the single best document per query is considered (k=1).
            top_scores, top_indices = torch.topk(cos_scores, k=1)
            best_score = top_scores[0].item()
            best_position = int(top_indices[0])

            # 'Empirical' threshold
            if best_score >= threshold:
                # Positions in the corpus and in the id list coincide.
                mapped_dict[id_[best_position]]["label"] = label.lower()

            pbar.update()

    return mapped_dict

In [15]:
### LABELLING: ARGUMENT SPEECH ###
# `corpus` holds de-duplicated (argument, id) pairs; split them so that each text
# is embedded on its own and stored under its own id.
unique_args = [arg for arg, _ in corpus]
unique_arg_ids = [arg_id for _, arg_id in corpus]

# Bind the result to `arg_topics` (the name the storage cells read) and leave
# `topics` untouched, because the other labelling cells use it as the query list.
arg_topics = semantic_search(corpus=unique_args, ids=unique_arg_ids, query=topics, threshold=0)


KeyboardInterrupt



In [17]:
### LABELLING: HATEFUL SPEECH ###
# For each topic, tag the most similar hate-speech text with that topic;
# a threshold of 0 only rejects matches with negative similarity.
hate_speech_topics = semantic_search(
    corpus=hate_speech,
    ids=hate_ids,
    query=topics,
    threshold=0,
)

  0%|          | 0/4499 [00:00<?, ?it/s]

In [None]:
### LABELLING: CONTROVERSIAL TOPICS ###
# `claims` and `arg_ids` are read line by line from the same JSONL file, so they
# align element-wise.
# NOTE(review): assumes the claims loaded alongside the arguments are the intended corpus — confirm.
claim_topics = semantic_search(corpus=claims, ids=arg_ids, query=topics, threshold=0)

In [None]:
### EXTRACT LABELLED INSTANCES ###
def extract_labelled(dict_):
    """Collect the entries of `dict_` whose "label" is not the empty list.

    Args:
        dict_: Mapping of id -> {"argument": ..., "label": ...}.

    Returns:
        list of {"id", "argument", "label"} dicts, in the mapping's iteration order.
    """
    return [
        {"id": key, "argument": entry["argument"], "label": entry["label"]}
        for key, entry in dict_.items()
        if entry["label"] != []
    ]

In [None]:
# Total arguments vs. arguments that received a topic label, read from `arg_topics`
# (the labelled-argument mapping that the storage cells also use).
print(len(arg_topics), len(extract_labelled(arg_topics)))

# Preview a few labelled records instead of dumping the whole mapping.
extract_labelled(arg_topics)[:5]

In [None]:
### LABELLING: CONCEPTS ###
# `corpus` holds (argument, id) pairs; unzip them so each text is labelled under
# its own id. The default similarity threshold of semantic_search applies.
arg_concepts = semantic_search(
    corpus=[arg for arg, _ in corpus],
    ids=[arg_id for _, arg_id in corpus],
    query=concepts,
)

In [None]:
# Total entries vs. entries whose concept label is non-empty
len(arg_concepts), len(extract_labelled(arg_concepts))

In [None]:
### STORE TOPIC LABELS CONCEPTS AND LABELS ###
# One JSON object per line; the context manager flushes and closes the file
# even if serialising a record fails.
with open("../data/argument_topic_concept.jsonl", "w") as fout:
    for arg_id, record in arg_topics.items():
        fout.write(json.dumps({
            "id": arg_id,
            "argument": record["argument"],
            "topic_label": record["label"],
            #"concept_label": k["label"]
        }))
        fout.write("\n")

In [None]:
### STORE CONCEPT LABELS CONCEPTS AND LABELS ###
# One JSON object per line; the context manager flushes and closes the file
# even if serialising a record fails.
with open("../data/argument_concepts.jsonl", "w") as fout:
    for arg_id, record in arg_concepts.items():
        fout.write(json.dumps({
            "id": arg_id,
            "argument": record["argument"],
            "concept_label": record["label"],
        }))
        fout.write("\n")

In [None]:
# Report the id of every record whose argument text is the empty string
blank_ids = [arg_id for arg_id, record in arg_topics.items() if record["argument"] == ""]
for blank_id in blank_ids:
    print("blanks", blank_id)

In [None]:
# ### FORM NEW ARGUMENT-SET ###
#
# # TODO: Return Domain Restricted
# args = [json.loads(ln) for ln in open("../data/cmv_processed.jsonl")]
# extracts = [json.loads(ln) for ln in open("../data/argument_extracts.jsonl")]
# arg_ids = [json.loads(ln)["id"] for ln in open("../data/cmv_processed.jsonl")]
#
# def get_arg(id_):
#     if id_ in arg_ids:
#         arg_location = arg_ids.index(id_)
#         arg = args[arg_location]
#         return arg
#
#     else: return None
#
# args_w_extract = []
# for i in extracts:
#     extract_id = i["id"]
#
#     arg = get_arg(extract_id)
#     arg["extract"] = i["extract"]
#
#     args_w_extract.append(arg)