In [10]:
import json
import os
import random
import logging
import re

In [11]:
### LOGGING ###
# Configure the root logger at INFO level and create the named logger that the
# later "Data Stored" messages of this notebook are emitted through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("KEYPHRASE-SELECTOR")

In [12]:
### INSPECT OUTPUT ###
# NOTE(review): machine-specific absolute path; every relative path used later
# in the notebook is resolved against this working directory.
os.chdir("/Users/joshua.sheppard/PycharmProjects/countaBot")

# Every line of the file is parsed as JSON and only the first parsed line is
# kept. The context manager closes the handle, which the bare open() did not.
with open("./src/data/processed/cmv_rr.jsonl", "r") as fin:
    rr_ = [json.loads(ln) for ln in fin][0]

In [24]:
def clean(phrase):
    """Collapse punctuation runs in ``phrase`` into single spaces.

    A run is one or more of ``, . ; @ # ? ! & $`` together with any spaces
    that directly follow it; each run is replaced by exactly one space.

    Args:
        phrase: text to clean.

    Returns:
        The cleaned string.
    """
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

# random.randint(0, len(rr_)) includes len(rr_) itself and could therefore
# raise IndexError; random.choice only ever returns an existing record.
subject = random.choice(rr_)

print("CLAIM: ", subject["claim"]["sentence"])
print("===========================================\n")
# Argument sentences, target counter sentences and retrieved passages are
# walked in parallel; zip stops at the shortest of the three lists.
for argument, counter, evidence in zip(subject["argument"], subject["tgt_counter"], subject["retrieved"]):
    print("ARG: ", argument["sentence"], "\n")
    print("COUNTER: ", counter["sentence"], "\n")
    print("EVIDENCE: ", clean(evidence["ranked_passages"]).lower(), "\n")

CLAIM:  situations where the world will end due to an artificial intelligence or super technology will never happen.

ARG:  with our constant fast technological advancement people seem to think that we will lead our selves to destruction because we do not use technology with care but recently all new modern technologies that are being invented are completely beneficial towards society as a whole such as loreal teaming up with organovo to d print human skin to use in product tests and nasa announcing a new rover able to make autonomous decision on its next mission to mars. 

COUNTER:  depends on your definition of super technology fission technology were pretty super in s and have brought both good and evil.technology is at its core an amplifier of human behavior it makes doing good and evil easier and allows us to do more with fewer human resources all it takes to end the world is human flaw and sufficiently advanced technology. 

EVIDENCE:  the recognition of the impact of technology 

In [25]:
# DONE: Keyphrase Selection
# TODOs: Full-run, arguments
import copy

### KEYPHRASE SELECTION OBJECT ###
# Deep copy of the loaded records, so nothing done through `_rr` can alter `rr_`.
_rr = copy.deepcopy(rr_)

In [7]:
# Number of records available for keyphrase selection.
len(_rr)

10303

In [12]:
# Peek at the target-counter sentences of the first record.
_rr[0]["tgt_counter"]

[{'sentence': 'the majority of your points seem predicated on the idea that a basic income requires a flat tax or rejects progressive taxation and b that absolutely every social program must be scrapped for a bi to take effect.neither of these are true.',
  'selected_keyphrases': []},
 {'sentence': 'of course there are many conservatives or rightlibertarian who would prefer those outcomes but there are also many liberals and leftlibertarians who do not.the title of your is worded incorrectly.',
  'selected_keyphrases': []},
 {'sentence': 'you are arguing against not basic income but the questions of how to pay for it and what exactly would be replaced.if that doesnt change your view im really not sure what would.',
  'selected_keyphrases': []},
 {'sentence': 'the post is very well written and laid out its just a perfectly shot arrow aimed at the wrong target.',
  'selected_keyphrases': []}]

In [26]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Pretrained sentence-embedding model; `cosine_similarity_` reads this
# notebook-level global when encoding sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


In [27]:
import re
def clean(phrase):
    """Replace each punctuation run (and the spaces after it) with one space.

    The punctuation characters handled are ``, . ; @ # ? ! & $``.

    NOTE(review): an identical `clean` is already defined in an earlier cell
    of this notebook; this definition silently shadows it.

    Args:
        phrase: text to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(pattern=r"[,.;@#?!&$]+\ *", repl=" ", string=phrase)
    return cleaned

def cosine_similarity_(sentences):
    """Score every sentence after the first against the first one.

    The sentences are embedded with the notebook-level `model`, and each
    candidate (``sentences[1:]``) is compared with the query
    (``sentences[0]``) by cosine similarity.

    Args:
        sentences: sequence of strings; element 0 is the query, the remaining
            elements are the candidates.

    Returns:
        A list of ``(sentence, similarity)`` tuples, one per candidate and in
        input order, with the similarity as a Python float. The list is empty
        when there is no candidate to compare against.
    """
    # Without at least one candidate there is nothing to score, so the
    # (comparatively expensive) encoding step is skipped altogether.
    if len(sentences) < 2:
        return []

    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)

    # dim=1 is the default of torch.nn.CosineSimilarity, spelled out here; the
    # single query embedding is broadcast against the candidate matrix.
    cos = torch.nn.CosineSimilarity(dim=1)
    scores = cos(embeddings[0], embeddings[1:])

    # Move the scores to the CPU before converting: calling .numpy() directly
    # fails for tensors that live on another device (e.g. a GPU).
    return list(zip(sentences[1:], scores.detach().cpu().tolist()))

def selected_keyphrases(arg, threshold=0.35):
    """Select, per target-counter sentence, the keyphrases similar to it.

    ``arg["tgt_counter"]`` and ``arg["retrieved"]`` are walked in parallel
    (zip, so surplus entries of the longer list are ignored). For each pair,
    the counter sentence is scored against the keyphrases of the matching
    retrieved entry with `cosine_similarity_`.

    Args:
        arg: record dict with a "retrieved" list (entries carrying a "kp"
            list of phrases) and a "tgt_counter" list (entries carrying a
            "sentence" string).
        threshold: similarity a keyphrase must strictly exceed to be kept.
            Defaults to 0.35, the value that used to be hard-coded.

    Returns:
        One list of kept keyphrases per processed counter sentence. Duplicates
        are removed and first-occurrence order is preserved, so the result is
        identical from run to run (``list(set(...))`` was not).
    """
    selected_kps = []
    for target, passage in zip(arg["tgt_counter"], arg["retrieved"]):
        # Query first, candidates after: the layout cosine_similarity_ expects.
        scored = cosine_similarity_([target["sentence"], *passage["kp"]])
        kept = [phrase for phrase, score in scored if score > threshold]

        # dict.fromkeys de-duplicates while keeping insertion order.
        selected_kps.append(list(dict.fromkeys(kept)))

    return selected_kps


In [39]:
from tqdm.notebook import tqdm
# The selections are written into a separate deep copy; `_rr` itself only
# serves as the input of the keyphrase selection.
selected_rr = copy.deepcopy(_rr)
#SAMPLE = selected_rr[0:5]

with tqdm(total=len(rr_), position=0, leave=True) as pbar:
    # `selected_rr` is a copy of `_rr`, so both lists line up record by record.
    for source_record, target_record in zip(_rr, selected_rr):
        per_sentence_kps = selected_keyphrases(source_record)

        # Attach each sentence's selection to the matching counter sentence.
        for counter_sentence, kept in zip(target_record["tgt_counter"], per_sentence_kps):
            counter_sentence["selected_keyphrases"] = kept
        pbar.update()

  0%|          | 0/10303 [00:00<?, ?it/s]

In [54]:
# Independent deep copy of the annotated records, used for the inspection that follows.
selected_rr_ = copy.deepcopy(selected_rr)

In [57]:
### INSPECT ###
# randrange excludes its upper bound, whereas randint(0, len(...)) could return
# len(...) and index one past the end. A named index is used instead of `_`,
# which IPython uses for the output of the last executed cell.
sample_idx = random.randrange(len(selected_rr_))
selected_rr_[sample_idx]["tgt_counter"]

[{'sentence': 'im currently a junior in collegeuniversity.',
  'selected_keyphrases': ['Distinguished Junior Membership.',
   'Junior or Senior year',
   'University College.',
   'Junior Membership.',
   'Distinguished Junior']},
 {'sentence': 'i occasionally look back at things ive written a few years before and see that my writing was orders of magnitude worse then than it is now.',
  'selected_keyphrases': ['practice in writing.']},
 {'sentence': 'i know that in a few years ill look back on the writing im doing today and be embarrassed about most of it.writing is not something you can master in a few years.',
  'selected_keyphrases': ['started writing after college',
   'read cursive writing',
   'secretly writing.']},
 {'sentence': 'the best writers hone their craft over entire lifetimes.',
  'selected_keyphrases': ['novice and experienced writers',
   'Association of Science Writers']},
 {'sentence': 'even if you want to just write at a basic level of effectiveness youll find tha

In [58]:
file_name = "cmv_rr_selected"

# The whole list is serialized as one JSON document, which is what the loading
# cells of this notebook expect (they parse the lines and keep the first).
# The file is opened inside the `with`, so it is closed even if writing fails.
# The progress bar that wrapped this write was never updated and always
# showed 0%, so it has been removed.
with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
    fout.write(json.dumps(selected_rr))

logger.info(f"[{len(selected_rr)} Data Stored as {file_name}.jsonl]")

  0%|          | 0/10303 [00:00<?, ?it/s]

INFO:KEYPHRASE-SELECTOR:[10303 Data Stored as cmv_rr_selected.jsonl]


In [8]:
# Reload the stored selection for review; the context manager closes the file
# handle that the bare open() call left open.
with open("./src/data/processed/cmv_rr_selected.jsonl", "r") as fin:
    review = [json.loads(ln) for ln in fin][0]

FileNotFoundError: [Errno 2] No such file or directory: './src/data/processed/cmv_rr_selected.jsonl'

In [5]:
# Number of records read back from the stored selection file.
len(review)

10303

In [23]:
### HATE SELECTION ###
import os
# Show the working directory that the relative data path is resolved against.
print(os.getcwd())
# Every line is parsed as JSON and only the first parsed line is kept; the
# context manager closes the handle, which the bare open() did not.
with open("./src/data/processed/hate_rr.jsonl") as fin:
    hate_rr = [json.loads(ln) for ln in fin][0]

/Users/joshua.sheppard/PycharmProjects/countaBot


In [29]:
from tqdm import tqdm
import copy

# Selections are stored on a deep copy; `hate_rr` stays as loaded.
sel_hate_rr = copy.deepcopy(hate_rr)

with tqdm(total=len(hate_rr), position=0, leave=True) as pbar:
    # The copy has the same length and order as `hate_rr`, so pairing the two
    # lists is equivalent to indexing both with the same position.
    for hate_record, annotated_record in zip(hate_rr, sel_hate_rr):
        kps_per_sentence = selected_keyphrases(hate_record)

        # Store each sentence's selection on the matching counter sentence.
        for counter_entry, chosen in zip(annotated_record["tgt_counter"], kps_per_sentence):
            counter_entry["selected_keyphrases"] = chosen
        pbar.update()

100%|██████████| 8867/8867 [04:38<00:00, 31.79it/s]


In [30]:
# Show only the first records: displaying the full list writes thousands of
# records into the cell output and bloats the notebook.
sel_hate_rr[:2]

[{'id': 'ENT1ST0001HS0033CN000021',
  'hate_speech': [{'sentence': 'according to a recent ofsted report a school in birmingham is still segregating girls and boys despite a hight court ruling in that this is unlawful.',
    'selected_keyphrases': [],
    'stance': 'CON',
    'aspect': 'birmingham segregating girls'}],
  'tgt_counter': [{'sentence': 'to be fair the ofsted report is more concerned with lack of enforcement and less about focussing on the practice of any particular faith.',
    'selected_keyphrases': ['Ofsted report'],
    'stance': 'PRO',
    'aspect': 'ofsted report concerned'}],
  'retrieved': [{'ranked_passages': "Ofsted report (2018)., Ofsted Report., In 2016, Academy received a mark of 'Inadequate' on the Ofsted report.",
    'kp': ['Academy received a mark',
     'Ofsted report',
     'Academy received',
     'received a mark',
     'Inadequate']}]},
 {'id': 'ENT1ST0001HS0033CN000021P1',
  'hate_speech': [{'sentence': 'in birmingham there is a school where girls and

In [32]:
file_name = "hate_rr_selected"

# The whole list is serialized as one JSON document, which is how the loading
# cells of this notebook read such files back (parse lines, keep the first).
# The file is opened inside the `with`, so it is closed even if writing fails.
# The progress bar that wrapped this write was never updated and always
# showed 0%, so it has been removed.
with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
    fout.write(json.dumps(sel_hate_rr))

logger.info(f"[{len(sel_hate_rr)} Data Stored as {file_name}.jsonl]")

  0%|          | 0/8867 [00:00<?, ?it/s]
INFO:KEYPHRASE-SELECTOR:[8867 Data Stored as hate_rr_selected.jsonl]
