In [1]:
### ZERO-SHOT CLASSIFICATION, TOPIC LABELLING ###

# TODOs: https://towardsdatascience.com/zero-shot-text-classification-with-hugging-face-7f533ba83cd6
# DONE: Extract args, ids for labelling
# DONE: Get Labels
# TODOs: NLI vs Semantic
# TODOs: Zero-shot Classification ConceptNet (semantic extension)
# TODOs: Expand: Concept Net synonyms

In [2]:
### ASSERT WORKING DIRECTORY FOR IMPORTS ###
import os
print(os.getcwd())

/Users/joshua.sheppard/PycharmProjects/countaBot/src/detection


In [3]:
import pandas as pd
import json
import random
from tqdm.notebook import tqdm

In [4]:
### LOAD ###
#args = [json.loads(ln) for ln in open("../data/cmv_processed.jsonl")]
ids = [json.loads(ln)["id"] for ln in open("../data/cmv_processed.jsonl")]
args = [json.loads(ln)["argument"]["argument"] for ln in open("../data/cmv_processed.jsonl")]

len(args), len(ids)

(10303, 10303)

In [5]:
### SAMPLE ###
_ = random.randint(0, len(args))
args[_]

'I believe that live versions of musical albums are nothing more than a cash grab by the artist or their labelmanageretc. to make more money off the songs they have already written. They also serve little to no point for anyone who likes clean audio since there is always someone screaming clapping on the wrong beats and trying to sing along which can make the song sound offkey if enough people arent on the right note. This artist has already made their money off the song the radiopandoraetc. replays the merchandise and the concert why do they feel the need to say well we have all this great audio from our hours in the recording studio but lets release this unpolished recording from a bunch of roadweary musicians!.I will grant some exceptions Some Jazz bands that do improvisations where no recording sounds the same Bands that dont sound right in a studio and need a raw edge like KISS Maybe if an artist has a new take on an old song but I still feel thats a little bit of a cash grabP.S. 

In [6]:
# Combined Label Set - CauseNet
# Experiment Title vs Argument
# Eval 1: Arguments

#corpus = sorted(set(args))
corpus = set()
idx = set()

for j, k in zip(args, ids):
    corpus.add((j, k))

len(corpus)

5990

In [290]:
### COMMON SENSE TOPIC LABELS ###
import re

#TODOs: Prune and Add CauseNet Topics
controversial_topics = [ln.strip() for ln in open("../data/concepts/wiki_controversial_topics.txt")]
debate_topics = [ln.strip() for ln in open("../data/concepts/IBM_debate_topics_I.txt")]
debate_topics_ = [ln.strip() for ln in open("../data/concepts/IBM_debate_topics_II.txt")]
concepts = [ln.strip() for ln in open("../data/concepts/cause_concepts.txt")]
arg_kb_20 = [ln.strip() for ln in open("../data/concepts/argkb_20_topics.txt")]

topics = []
topics.extend(controversial_topics)
topics.extend(debate_topics)
topics.extend(debate_topics_)
topics.extend(arg_kb_20)

len(topics)

6260

In [8]:
# Semantic-Search, Cosine Similarity
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_search(corpus, query, threshold=0.35):
    # Construct Corpus set
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    # Construct Query Set
    #queries = sorted(set(query))
    queries = set(query)

    mapped_dict = dict((k, []) for k in queries)
    #arg_topics = dict((k, []) for k in idx)

    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(1, len(corpus))

    with tqdm(total=len(queries)) as pbar:
        for query in queries:
            query_embedding = embedder.encode(query, convert_to_tensor=True)

            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
            top_results = torch.topk(cos_scores, k=top_k)

            for score, idx in zip(top_results[0], top_results[1]):
                #print(corpus[idx], "(Score: {:.4f})".format(score))
                if score >= threshold:
                    mapped_dict[query].append(corpus[idx])
            pbar.update()

    return mapped_dict

In [9]:
### LABELLING: CONTROVERSIAL TOPICS ###
arg_topics = semantic_search(corpus=args, query=topics)

  0%|          | 0/5598 [00:00<?, ?it/s]

In [291]:
### EXTRACT LABELLED INSTANCES ###
def extract_labelled(dict_):
    res = []

    for i, j in dict_.items():
        if not j: continue
        else: res.append((i,j))

    return res

In [292]:
topic_labels = extract_labelled(arg_topics)
print(len(topic_labels))
topic_labels

4391


[('automotive companies',
  ['Im a big racing fan. Particularly Formula . In F teams are constantly trying to bend the rules and sometimes attempt to break them without getting caught. This is mostly looked on neutrally or favorably with the occasional rulebreak drawing the negative attention and a gate suffix. The Volkswagen Automotive Group with their legendary auto racing history Audi and Porsche are at the top of my mind here did not lose consumer trust by attempting to give their customers more performance for their dollar.']),
 ('Machine learning',
  ['Ill start off by saying that I graduated with a computer science degree in June and I work as a software developer. I have a solid understanding of some undergrad level machine learning algorithms and Ive worked withgot an overview of some more sophisticated stuff through my job. Im very impressed that e.g. Siri can act like shes talking to me but I have a pretty good idea of what the man behind the curtain looks like and I know th

In [293]:
### LABELLING: CONCEPTS ###
arg_concepts = semantic_search(corpus=args, query=concepts)

  0%|          | 0/51863 [00:00<?, ?it/s]

In [294]:
concept_labels = extract_labelled(arg_concepts)
concept_labels

[('"research funding"',
  ['I am speaking purely from an American perspective but I do genuinely feel that this issue should extend to all countries.My argument is as follows . Acceptance of funds from any single person or group makes that entity an investor in that research. . An investor in research should have access to the end results of that research. . If a researcher accepts government grants to fund their research then that government and by extension its citizens have become an investor and should have free access to the published work.To clarify I am indeed saying that acceptance of any amount of government funding should lead to free access to the published work regardless of if it is or . Change my view what is wrong about this line of thought?']),
 ('"delay in the diagnosis"',
  ['Over the last few years Ive been dealing with chronic health issues and Ive had the misfortune of becoming familiar with outrageous wait times in doctors offices.My primary care physician visits 

In [392]:
### QUERY EXPANSION ###
import json
import urllib

URL = "http://api.conceptnet.io/"

# TODOs: Review. Similarity.
class ConceptNet:

    def __init__(self, api, l):
        self.api = api
        self.l = l

    def search(self, lang, term):
        url_to_search = self.api + "c/" + lang + "/" + term
        data = urllib.request.urlopen(url_to_search)
        json_data = json.load(data)
        for i in json_data["edges"]:
            print("----------------")
            print(i["end"])
            print("relation:", i["rel"])
            print(i["surfaceEnd"])
            print(i["surfaceStart"])
            print("weight:", i["weight"])

        return json_data

    def get_relation(self, rel, concept):
        url_to_search = self.api + f"search?node=/c/en/{concept}&rel=/r/{rel}"
        data = urllib.request.urlopen(url_to_search)
        obj_ = json.load(data)

        labels = set()
        for _ in obj_["edges"]:
            labels.add((_["end"]["label"], _["weight"]))

        return labels

    def get_similar(self, concept):
        res = []
        rels = ["Synonym", "SimilarTo"]
        for _ in rels:
            res.extend(self.get_relation(_, concept))

        return sorted(res, key=lambda x: x[1], reverse=True)[:self.l]

In [393]:
concept_net = ConceptNet(api=URL, l=3)

In [403]:
### CONCEPT NET ###
expansion_term = "fabulous"
test_concept = concept_net.get_similar(expansion_term)
test_concept

[('fabulous', 2.0), ('fab', 2.0), ('mythological', 2.0)]

In [404]:
### WORDNET ###
from src.utils_.word_net_expansion import expand_query

test_wn = expand_query(expansion_term)
test_wn

'fabulous fab mythic'

In [4]:
fout = open("../data/argument_topics_concepts.jsonl", "w")

for k, v in arg_topics.items():
    fout.write(json.dumps({
        "id": "",
        "argument": k,
        "topic_label": v,
        "concept_label": ""
    }))
    fout.write("\n")

FileNotFoundError: [Errno 2] No such file or directory: '../../data/claim_topics.jsonl'

In [None]:
### CONCEPT NET ###
import requests

# obj = requests.get(f"http://api.conceptnet.io/c/en/{stance['aspect']}").json()
# obj.keys()
#
# for i in obj["edges"]:
#     if i["start"]["language"] == "en":
#         print("start", i["start"])
#         print("rel", i["rel"])
#         print("rel", i["rel"])
#         print(i["start"]["label"])
#         print("rel", i["rel"]["label"])

In [264]:
# # NOTE: Re-factor. Duplicated.
# import json
# import urllib
# URL = "http://api.conceptnet.io/"
#
# class ConceptNet(object):
#     def __init__(self, url):
#         self.url = url
#
#     def get_similar(self, phrase):
#         url_to_search = self.url + f"/query?node=/c/en/{phrase}&rel=/r/Synonym"
#
#         obj = urllib.request.urlopen(url_to_search)
#         obj_ = json.load(obj)
#
#         labels = set()
#         for _ in obj_["edges"]:
#             labels.add((_["end"]["label"], _["weight"]))
#
#         return labels

In [265]:
# concept_net = ConceptNet(URL)

In [None]:
# test_concept = concepts.get_similar("happy")
# test_concept

In [90]:
# # Semantic-Search, Cosine Similarity
# from sentence_transformers import SentenceTransformer, util
# import torch
#
# embedder = SentenceTransformer('all-MiniLM-L6-v2')
#
# # Corpus with example sentences
# corpus = topics
# corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
#
# # Query sentences:
# #queries = sorted(set(args.titles.tolist()))
# queries = sorted(set(args))
# arg_topics = dict((k, []) for k in queries)
# #arg_topics = dict((k, []) for k in idx)
#
# # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
# top_k = min(1, len(corpus))
#
# with tqdm(total=(len(queries)), position=0, leave=True) as pbar:
#     for query in queries:
#         query_embedding = embedder.encode(query, convert_to_tensor=True)
#
#         cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
#         top_results = torch.topk(cos_scores, k=top_k)
#
#         for score, idx in zip(top_results[0], top_results[1]):
#             #print(corpus[idx], "(Score: {:.4f})".format(score))
#             if score > 0.45:
#                 #arg_topics[query].append(corpus[idx])
#                 arg_topics[query].append(corpus[idx])
#         pbar.update()

  0%|          | 0/5986 [00:00<?, ?it/s]