In [None]:
### TODOs ###

# TODOs: Commonsense Query and Concept Expansion: Topics, Concepts, Synonyms
# TODOs: Domain Restrict. Polarising social and political debate (Class labelling) only for higher-quality
# TODO: Enhance Stance Module; Determine stance over entire argument. Only implicate stance for Noun
# TODOs: Targeted Retrieval with Semantic Graphs
# TODOs: Multi-Field Search

# DONE: Mine Args
# DONE: Mine Counters
# DONE: Add Concepts
# DONE: Prior tokenization and sentence segmentation to speed processing
# DONE: Adu, Counter + KP Extraction as 'Argument Mining' preprocessing module
# DONE: Manage Duplicate Keywords
# DONE: Sentential Ranking
# DONE: Include Topic Label
# DONE: Include Concept Label
# DONE: Add News

# TODOs: (1) coverage of topic signature words in the input statement; (2) a weighted summation of the coverage of n-grams in the argument; (3) the magnitude of the stance score, where we keep the passages of the same polarity as the argument; (4) content word overlap with the argument; and (5) coverage of topic signature words in the argument.

In [18]:
import os
# Project root. Set the COUNTABOT_ROOT environment variable to run this
# notebook from another checkout; the fallback is the original developer path.
root = os.environ.get("COUNTABOT_ROOT", "/Users/joshua.sheppard/PycharmProjects/countaBot")
os.chdir(root)
print(os.getcwd())

### INIT LOGGING ###
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ARGUMENT-EXTRACTOR")

### NLP FUNCTIONS ###
from src.utils_.utils import tokeniser, sentences_segment

/Users/joshua.sheppard/PycharmProjects/countaBot


In [20]:
### LOAD DATASETS ###
import json
import os
import random

def _read_jsonl(path):
    """Parse a JSON-lines file into a list with one decoded object per line.

    The file is opened in a context manager so the handle is closed as soon as
    parsing finishes.
    """
    with open(path) as fin:
        return [json.loads(ln) for ln in fin]


print(os.getcwd())
args = _read_jsonl("./src/data/cmv_processed.jsonl")
topics = _read_jsonl("./src/data/argument_topic_concept.jsonl")
concepts = _read_jsonl("./src/data/argument_concept.jsonl")

/Users/joshua.sheppard/PycharmProjects/countaBot


In [21]:
len(topics), len(concepts), len(args)

(5990, 5990, 10303)

In [23]:
### ASSERT BLANKS ###
# Parse the file once, inside a context manager so the handle gets closed.
with open("./src/data/cmv_processed.jsonl") as fin:
    cmv_records = [json.loads(ln) for ln in fin]

args_ = [rec["argument"]["argument"] for rec in cmv_records]
ids = [rec["id"] for rec in cmv_records]

# Report every record whose argument text is the empty string, with its id.
for j, k in zip(args_, ids):
    if j == "":
        print("blanks", j, k)

blanks  t3_3cm6jy
blanks  t3_1egv4k
blanks  t3_1egv4k
blanks  t3_5wjdve


In [24]:
### INSPECT ARG ###
import random
# Pick a record index between 0 and 99 (randint includes both bounds).
sample = random.randint(0, 99)

picked = args[sample]
arg = picked["argument"]["argument"]
claim = picked["claim"]

# Show the index, then the claim, then the argument text, each followed by
# a blank line.
for shown in (sample, claim, arg):
    print(shown, "\n")

99 

rewinding time is the best superpower for daily life 

a topic that ive had fun discussing is to imagine what could be done with a superpower. with so many choices and so many implications for each power this little game can spark long conversations on how each power would affect our daily lives. from all of these discussions ive come away with the view that one power is better than all others by the most metrics a power that i like to call rewind.disclaimer if youve never found yourself wondering what your life could be like with superpowers and have no interest in starting then this topic is definitely not for you. the topic is one massive hypothetical so that better be your thing ptldr because damn! i wrote way too much to ask you to read it all 



In [95]:
### EXTRACTORS ###
from src.utils_.keyphrase_extraction import yake_extract_keyphrase, summa_extract_keyphrase
from yake import KeywordExtractor
import re

### PHRASE CLEANER ###
def clean(phrase):
    """Replace each run of listed punctuation (and the spaces after it) with one space.

    The punctuation set is ``, . ; @ # ? ! & $``. Spaces that directly follow
    a run are consumed with it, so ``"a, b"`` becomes ``"a b"``. All other
    characters, apostrophes and hyphens included, are left untouched. A
    punctuation mark at the end of the phrase leaves a trailing space.
    """
    punctuation_run = re.compile(r"[,.;@#?!&$]+\ *")
    return punctuation_run.sub(" ", phrase)

### YAKE PARAMS ###
language = "en"
max_ngram_size = 3
deduplication_thresold = 0.9  # (sic) spelling kept so existing references keep working
deduplication_algo = 'seqm'

### YAKE ####
# One shared extractor, configured entirely from the parameters above
# (the n-gram size comes from max_ngram_size rather than a second literal).
yake_extractor = KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_thresold,
    dedupFunc=deduplication_algo,
)

# NOTE(review): this definition replaces the `yake_extract_keyphrase` imported
# from src.utils_.keyphrase_extraction at the top of this cell.
def yake_extract_keyphrase(doc, k=1):
    """Return the first ``k`` YAKE keyphrases for ``doc``, punctuation-cleaned.

    Args:
        doc: Text handed to the shared ``yake_extractor``.
        k: How many leading keyphrases to keep (default 1).

    Returns:
        The first ``k`` cleaned keyphrase strings, in the order the extractor
        returned them.
    """
    ranked = yake_extractor.extract_keywords(doc)

    cleaned = []
    for entry in ranked:
        # Element 0 of each result is the phrase text.
        cleaned.append(clean(entry[0]))

    return cleaned[:k]

### KEYBERT ###
from keybert import KeyBERT

kb = KeyBERT()
def extract_keyphrase(doc, n_gram=3, n_kp=3, use_mmr="False", use_maxsum="False"):
    """Extract punctuation-cleaned KeyBERT keyphrases from ``doc``.

    The KeyBERT call is fixed: n-gram range (1, 4), English stop words, and
    MMR enabled with diversity 0.5.

    NOTE(review): ``n_gram``, ``n_kp``, ``use_mmr`` and ``use_maxsum`` are
    accepted but never read, so passing them has no effect. Their string
    defaults (``"False"``) would be truthy if they were ever wired in.

    Returns:
        A list of keyphrase strings, each passed through ``clean``.
    """
    scored = kb.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 4),
        stop_words='english',
        use_mmr=True,
        diversity=0.5,
    )

    phrases = []
    for hit in scored:
        # Element 0 of each result is the phrase text.
        phrases.append(clean(hit[0]))
    return phrases

### TEST CASES ###
# Inputs: one real sentence (used twice) and one blank string.
test = "Brazil's minimum income has increasingly been accepted."

# YAKE with an explicit k=1, printed on its own first.
ev_kp = yake_extract_keyphrase(test, k=1)
print(ev_kp)

# YAKE (default k) and summa on the real sentence.
ev_kp = yake_extract_keyphrase(test)
ev_kp_ = summa_extract_keyphrase(test)

# The same two extractors on the blank input.
test_2 = " "
ev_kp_2 = yake_extract_keyphrase(test_2)
ev_kp_2_ = summa_extract_keyphrase(test_2)

# KeyBERT on the real sentence.
test_3 = "Brazil's minimum income has increasingly been accepted."
ev_kp_3 = extract_keyphrase(test_3)

# Print every result, one per line, in the order computed above.
for result in (ev_kp, ev_kp_, ev_kp_2, ev_kp_2_, ev_kp_3):
    print(result)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


['Brazil minimum income']
['Brazil minimum income']
['minimum']
[]
[]
['brazil minimum income increasingly', 'minimum income increasingly accepted', 'income increasingly accepted', 'brazil', 'increasingly accepted']


In [96]:
from tqdm.notebook import tqdm
#from src.utils_.common_sense_expansion import wordNet_expansion
from src.detection.stance_classifier import sentence_stance, compare_stance
import multiprocessing
import json
import time

# Set TOKENIZERS_PARALLELISM to "false": this switches off the HuggingFace
# tokenizers' parallelism (it is not a logging setting).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Ids of the topic / concept records in file order. get_notion uses the
# position of an id in these lists to index into `topics` / `concepts`.
# NOTE(review): `topics` / `concepts` are loaded from ./src/data/ while these
# ids come from ./src/data/processed/ -- confirm both hold the same records
# in the same order.
with open("./src/data/processed/argument_topic_concept.jsonl") as fin:
    topic_ids = [json.loads(ln)["id"] for ln in fin]
with open("./src/data/processed/argument_concept.jsonl") as fin:
    concept_ids = [json.loads(ln)["id"] for ln in fin]

# Where notion == topic or concept
def get_notion(notions_ids, notions_lst, arg_id, label):
    """Look up the topic or concept label recorded for an argument id.

    ``notions_ids`` and ``notions_lst`` are treated as parallel sequences: the
    position of ``arg_id`` in ``notions_ids`` selects the record in
    ``notions_lst``.

    Args:
        notions_ids: Sequence of record ids.
        notions_lst: Sequence of records (mappings), aligned with ``notions_ids``.
        arg_id: Id of the argument to look up.
        label: Key to read from the selected record.

    Returns:
        ``str(record[label])`` when that value is truthy. ``None`` when the
        value is falsy, or when ``arg_id`` does not occur in ``notions_ids``.
    """
    try:
        notion_id = notions_ids.index(arg_id)
    except ValueError:
        # No record for this argument: report it the same way as an empty label
        # instead of aborting the whole mining run.
        return None

    notion = notions_lst[notion_id][label]
    return str(notion) if notion else None

# Extract Argument Discourse as Sentences, Keyphrases, Topics and Concepts
def extract_adus(arg_):
    """Mine sentence-level units from the argument text of one record.

    The argument is split into sentences. Sentences of five tokens or fewer
    are skipped. Each remaining sentence gets KeyBERT keyphrases plus the
    single top YAKE keyphrase appended last, an aspect (the first keyphrase,
    or ``" "`` when there are none) and a stance towards that aspect. The
    record's topic and concept labels are attached to every unit. The claim
    is returned with its YAKE keyphrases only.

    Args:
        arg_: Record with keys ``"id"``, ``"claim"`` and
            ``"argument" -> "argument"`` (the argument text).

    Returns:
        ``{"id": ..., "claim": {"sentence", "kp"}, "argument": [unit, ...]}``
        where each unit has the keys ``sentence``, ``kp`` (lower-cased),
        ``stance``, ``aspect``, ``topic`` and ``concept``.
    """
    id_ = arg_["id"]
    arg = arg_["argument"]["argument"]

    topic = get_notion(topic_ids, topics, id_, "topic_label")
    concept = get_notion(concept_ids, concepts, id_, "concept_label")

    mined_units = []
    for sentence in sentences_segment(arg):

        # Limit irrelevant sentences
        if len(tokeniser(sentence)) <= 5:
            continue

        kp = extract_keyphrase(sentence)
        kp.extend(yake_extract_keyphrase(sentence, 1))

        aspect = kp[0] if kp else " "

        try:
            stance = sentence_stance(sentence, aspect)
        except Exception as exc:
            # Best effort: keep mining with a blank stance, but leave a trace.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop a long run.
            logger.warning("Stance classification failed for %s: %s", id_, exc)
            stance = " "

        mined_units.append({
            "sentence": sentence,
            "kp": [phrase.lower() for phrase in kp],
            "stance": stance,
            "aspect": aspect,
            "topic": topic,
            "concept": concept,
        })

    claim = arg_["claim"]
    claim_adu = {"sentence": claim, "kp": list(yake_extract_keyphrase(claim))}

    return {
        "id": id_,
        "claim": claim_adu,
        "argument": mined_units,
    }

mined_args = []

# Mining runs serially in the notebook process. No worker pool is created,
# because nothing in this loop dispatches work to one.
with tqdm(total=len(args), position=0, leave=True) as pbar:
    for arg in args:
        mined_args.append(extract_adus(arg))
        pbar.update()

  0%|          | 0/10303 [00:00<?, ?it/s]

In [85]:
# test = []
#
# with multiprocessing.Pool(8) as pool:
#     with tqdm(total=(len(args[0:10])), position=0, leave=True) as pbar:
#         for arg in args[0:10]:
#             test.append(extract_adus(arg))
#             pbar.update()

  0%|          | 0/10 [00:00<?, ?it/s]

In [97]:
# STORE DEEP-COPY
import copy
mined_args_ = copy.deepcopy(mined_args)

In [101]:
# mined_args
print(len(mined_args))

import random
#_ = random.randint(0, len(sample))
example = mined_args[10]
example

10303


{'id': 't3_40uylb',
 'claim': {'sentence': 'the oregon militia are terrorists and if they were not white but instead from a brownmiddle eastern ethnicity doing the same thing same reason theyd be called terrorists immediately',
  'kp': ['called terrorists immediately']},
 'argument': [{'sentence': 'can someone explain to me why these group of people in oregon are not considered terrorists and are not being called that?',
   'kp': ['people oregon considered terrorists',
    'oregon considered terrorists called',
    'explain group people oregon',
    'terrorists called',
    'explain group',
    'group of people'],
   'stance': 'NEUTRAL',
   'aspect': 'people oregon considered terrorists',
   'topic': 'terrorist groups',
   'concept': None},
  {'sentence': 'not on reddit not in the media im having a tough time finding anyone saying itthey have taken a building by force with guns.. they have a political aimagenda.',
   'kp': ['building force guns political',
    'saying itthey taken buil

In [102]:
len(mined_args_)

10303

In [103]:
### COUNTER-ARGS ###
def extract_counters(arg_):
    """Mine sentence-level units from the target counter-argument of one record.

    The counter text is split into sentences. Sentences of five tokens or
    fewer are skipped. Each remaining sentence gets KeyBERT keyphrases, an
    aspect (the first keyphrase, or ``" "`` when there are none) and a stance
    towards that aspect.

    Args:
        arg_: Record with keys ``"id"`` and
            ``"tgt_counter" -> "tgt_counter"`` (the counter-argument text).

    Returns:
        ``{"id": ..., "tgt_counter": [unit, ...]}`` where each unit has the
        keys ``sentence``, ``kp``, ``stance`` and ``aspect``.
    """
    id_ = arg_["id"]
    counter = arg_["tgt_counter"]["tgt_counter"]

    mined_units = []
    for sentence in sentences_segment(counter):
        if len(tokeniser(sentence)) <= 5:
            continue

        kp = extract_keyphrase(sentence)

        aspect = kp[0] if kp else " "

        try:
            stance = sentence_stance(sentence, aspect)
        except Exception as exc:
            # Best effort: keep mining with a blank stance, but leave a trace.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop a long run.
            logger.warning("Stance classification failed for %s: %s", id_, exc)
            stance = " "

        mined_units.append({
            "sentence": sentence,
            "kp": list(kp),
            "stance": stance,
            "aspect": aspect,
        })

    return {
        "id": id_,
        "tgt_counter": mined_units,
    }

mined_counters = []

sample = args
# Mining runs serially in the notebook process. No worker pool is created,
# because nothing in this loop dispatches work to one.
with tqdm(total=len(args), position=0, leave=True) as pbar:
    for arg in args:
        mined_counters.append(extract_counters(arg))
        pbar.update()

  0%|          | 0/10303 [00:00<?, ?it/s]

In [104]:
import copy
mined_counters_ = copy.deepcopy(mined_counters)

In [105]:
# mined_args
print(len(mined_counters))

import random
# randrange excludes its upper bound, so the index is always valid.
# randint(0, len(...)) includes it and could raise IndexError.
counter_idx = random.randrange(len(mined_counters))
example = mined_counters[counter_idx]
example

10303


{'id': 't3_1kye6l',
 'tgt_counter': [{'sentence': 'why do you feel like you need to start drinking?',
   'kp': ['need start drinking',
    'drinking',
    'feel like need start',
    'feel like',
    'need'],
   'stance': 'PRO',
   'aspect': 'need start drinking'},
  {'sentence': 'it doesnt make you boring unless the only way you define interesting is through drinking which is pretty flawed.',
   'kp': ['define interesting drinking',
    'doesnt make boring unless',
    'way define interesting',
    'interesting',
    'pretty flawed'],
   'stance': 'PRO',
   'aspect': 'define interesting drinking'},
  {'sentence': 'there is nothing wrong with not drinking.',
   'kp': ['wrong drinking', 'drinking', 'wrong'],
   'stance': 'CON',
   'aspect': 'wrong drinking'},
  {'sentence': 'you totally can grow up sipping cola always and everywhere without being a freak or a sideshow attraction.',
   'kp': ['grow sipping cola freak',
    'sipping cola freak sideshow',
    'cola freak sideshow attractio

In [106]:
len(mined_args), len(mined_counters)

(10303, 10303)

In [107]:
len(mined_args_), len(mined_counters_)

(10303, 10303)

In [108]:
import os
print(os.getcwd())

/Users/joshua.sheppard/PycharmProjects/countaBot


In [109]:
file_name = "cmv_argument_extraction"

# Deep copies, so attaching the counters below leaves mined_args untouched.
mined_args_ = copy.deepcopy(mined_args)
mined_counters_ = copy.deepcopy(mined_counters)

# The output file is opened only once the data is ready, and the context
# manager closes it even if a write fails.
with tqdm(total=len(mined_args_)) as pbar:
    with open(f"./src/data/processed/{file_name}.jsonl", "w") as fout:
        for mined_arg, mined_counter in zip(mined_args_, mined_counters_):
            # Extend the pre-formatted mined object with its counter units.
            mined_arg["tgt_counter"] = list(mined_counter["tgt_counter"])

            # One JSON object per line.
            fout.write(json.dumps(mined_arg))
            fout.write("\n")
            pbar.update()

logger.info(f"[{len(mined_args_)} Data Stored as {file_name}.jsonl]")

  0%|          | 0/10303 [00:00<?, ?it/s]

INFO:ARGUMENT-EXTRACTOR:[10303 Data Stored as cmv_argument_extraction.jsonl]


In [111]:
### EVALUATE OUTPUT ###
# Read the stored file back; the context manager closes the handle after parsing.
with open(f"./src/data/processed/{file_name}.jsonl", "r") as fin:
    train = [json.loads(ln) for ln in fin]

In [112]:
len(train)

10303

In [114]:
# randrange excludes its upper bound, so the index is always valid.
# randint(0, len(train)) includes it and could raise IndexError.
train_idx = random.randrange(len(train))
print(train[train_idx]["id"], "\n")
print(train[train_idx]["argument"], "\n")
print(train[train_idx]["tgt_counter"], "\n")

t3_3s7hmb 

[{'sentence': 'lets see what is the us at?life expectancy?corruption?gdp per capita?equality?upward mobility?education?freedom from corruption?internet speeds?environmentalism?peacelowest murder rates?average wealth?median wealth?median income?economic freedom according to the conservative heritage foundation?the correct answer is none of these.', 'kp': ['economic freedom according conservative', 'rates average wealth', 'life expectancy corruption gdp', 'median', 'foundation correct', 'peacelowest murder rates'], 'stance': 'PRO', 'aspect': 'economic freedom according conservative', 'topic': None, 'concept': None}, {'sentence': 'a mix of hong kong singapore scandinavian countries switzerland canada and australia take up most of these rankings.', 'kp': ['singapore scandinavian countries switzerland', 'hong kong singapore scandinavian', 'canada australia rankings', 'mix hong kong', 'australia', 'hong kong singapore'], 'stance': 'NEUTRAL', 'aspect': 'singapore scandinavian coun

In [None]:
# for i, j in zip(retrieved_ranked, sample):
#     # Add counter to the dictionary (implicitly, i)
#     i["counter"] = j["counter"]
#     fout.write(json.dumps(i))
#     fout.write("\n")

In [None]:
# Working Loop
# for i in mined_counters_:
#     test = {
#         "test": [j for j in i["counter"]]
#     }

In [None]:
# BATCH LOADING

# STEPS = 10
# STEP = max(int(len(SAMPLE) / STEPS), 1)
# BATCHES = [sample[i:i + STEP] for i in range(0, len(SAMPLE), STEP)]
#
# mined_counters = []
# for idx, batch in enumerate(BATCHES):
#     print('-' * 25 + 'Batch %d/%d' % (idx + 1, len(BATCHES)) + '-' * 25)
#
#     with multiprocessing.Pool(8) as pool:
#         with tqdm(total=(len(batch))) as pbar:
#             for counter in batch:
#                 mined_counters.append(extract_counters(counter))
#                 pbar.update()