In [28]:
### INIT LOGGING ###
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ARGUMENT-EXTRACTOR")

In [1]:
### NLP FUNCTIONS ###
from src.utils_.utils import tokeniser, sentences_segment

print(tokeniser("hello, my name is Josh!"))
print(sentences_segment("hello, my name is Josh! How are you doing today? I'm curious ... will this line seperate? I'm not so sure Dr. Evil"))

['hello', ',', 'my', 'name', 'is', 'Josh', '!']
['hello, my name is Josh!', 'How are you doing today?', "I'm curious ... will this line seperate?", "I'm not so sure Dr.", 'Evil']


In [2]:
### LOAD DATASETS ###
import json
import random

args = [json.loads(ln) for ln in open("../data/cmv_processed.jsonl")]
topics = [json.loads(ln) for ln in open("../data/argument_topic_concept.jsonl")]
concepts = [json.loads(ln) for ln in open("../data/argument_concept.jsonl")]

In [3]:
len(topics), len(concepts), len(args)

(5990, 5990, 10303)

In [4]:
### ASSERT BLANKS ###
args_ = [json.loads(ln)["argument"]["argument"] for ln in open("../data/cmv_processed.jsonl")]
ids = [json.loads(ln)["id"] for ln in open("../data/cmv_processed.jsonl")]

for j, k in zip(args_, ids):
    if j == "":
        print("blanks", j, k)

blanks  t3_3cm6jy
blanks  t3_1egv4k
blanks  t3_1egv4k
blanks  t3_5wjdve


In [5]:
### SUBJECT ARG ###
import random
sample = random.randint(0, 99)

arg = args[sample]["argument"]["argument"]
claim = args[sample]["claim"]

print(sample, "\n")
print(claim, "\n")
print(arg, "\n")

52 

Transgenders only enforce gender stereotypes. 

I try not to be bigoted and Id really like to open my mind to this especially as my cousin begins his HRT. Im a strong liberal but I believe the concept of transgender individuals is tied strongly to the concept of gender roles. Why bother changing your gender or identifying as a different one if they are equal? Dysphoria is a real issue but thats purely psychological. If someone truly believed man woman then why would they feel the need to be one or the other. 



In [6]:
### EXTRACT OVER UNIQUE ARGUMENTS ONLY ###
# unique = set()
# idx = set()
#
# for j, k in zip(args_, ids):
#     unique.add((j, k))
#
# unique = list(unique)
# type(unique), len(unique)
#
# unique

In [7]:
# len(unique)

In [8]:
### TODOs ###

# TODOs: Mine Args
# TODO: Enhance Stance Module; Determine stance over entire argument. Only implicate stance for Noun
# TODOs: Mine Counters
# TODOs: Add Concepts
# TODOs: Commonsense Query and Concept Expansion: Topics, Concepts, Synonyms
# TODOs: Parallel process
# TODOs: Prior tokenization and sentence segmentation to speed processing
# TODOs: Domain Restrict. Polarising social and political debate (Class labelling) only for higher-quality argument-knowledge set

In [9]:
from src.utils_.keyphrase_extraction import yake_extract_keyphrase, summa_extract_keyphrase

test = "Brazil's minimum income has increasingly been accepted."
ev_kp = yake_extract_keyphrase(test)
ev_kp_ = summa_extract_keyphrase(test)

test_2 = " "
ev_kp_2 = yake_extract_keyphrase(test_2)
ev_kp_2_ = summa_extract_keyphrase(test_2)

print(ev_kp)
print(ev_kp_)

# Can Handel Blanks
print(ev_kp_2)
print(ev_kp_2_)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
INFO:KEYPHRASE_EXTRACTOR:[Test Keyphrase: ] 
 ['heathrow airport', 'environmental impact', 'aviation']


['Brazil minimum income', 'Brazil minimum', 'increasingly been accepted', 'minimum income', 'income has increasingly']
['minimum']
[]
[]


In [10]:
# def get_topic(arg_id):
#     topic_id = topic_ids.index(arg_id)
#     topic = topics[topic_id]["topic_label"]
#     return str(topic) if topic else None
#
# def get_concept(arg_id):
#     concept_id = concept_ids.index(arg_id)
#     concept = concepts[concept_id]["concept_label"]
#     return str(concept) if concept else None

In [11]:
# TODOs: Adu, Counter + KP Extraction as 'Argument Mining' preprocessing module
# TODOs: Implement Query Expansion at Query-time
# TODOs: Manage Duplicate Keywords
# DONE: Sentential Ranking
# DONE: Include Topic Label
# DONE: Include Concept Label
# DONE: Add News
# TODOs: Targeted Retreival with Semantic Graphs
# TODOs: Target Argumentative Content Only
# TODOs: Targeted Argument Content: Adus + Extractive Summary
# TODOs: Query Expansion
# TODOs: Multi-Field Search
# TODOs: Additional News and Knowledge Sources

In [None]:
from tqdm.notebook import tqdm
from src.detection.stance_classifier import sentence_stance, compare_stance
from src.utils_.word_net_expansion import expand_query
from src.detection.stance_classifier import sentence_stance
import multiprocessing
import json
import time

# Disable Huggingface Logging
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

topic_ids = [json.loads(ln)["id"] for ln in open("../data/argument_topic_concept.jsonl")]
concept_ids = [json.loads(ln)["id"] for ln in open("../data/argument_concept.jsonl")]

# Where notion == topic or concept
def get_notion(notions_ids, notions_lst, arg_id, label):
    notion_id = notions_ids.index(arg_id)
    notion = notions_lst[notion_id][label]
    return str(notion) if notion else None

# Extract Argument Discourse as Sentences, Keyphrases, Topics and Concepts
def extract_adus(arg_):

    id_ = arg_["id"]
    arg = arg_["argument"]["argument"]

    print("\n", id_)

    topic = get_notion(topic_ids, topics, id_, "topic_label")
    concept = get_notion(concept_ids, concepts, id_, "concept_label")

    adu_sents = sentences_segment(arg)

    extract_adus = []
    for _ in adu_sents:
        if len(tokeniser(_)) <= 5:
            continue

        try:
            kp = yake_extract_keyphrase(_)
        except:
            kp = [" "]

        print(kp)

        aspect = " " if kp == [] else kp[0]

        try:
            stance = sentence_stance(_, aspect)
        except:
            stance = " "

        adu = {"sentence": _, "kp": [i for i in kp], "stance": stance, "aspect": aspect, "topic": topic, "concept": aspect}

        extract_adus.append(adu)

    return ({
        "id": id_,
        "argument": [i for i in extract_adus]
    })

SAMPLE = args[0:1000]
STEPS = 10
STEP = max(int(len(SAMPLE) / STEPS), 1)
BATCHES = [args[i:i + STEP] for i in range(0, len(SAMPLE), STEP)]

mined_args = []

for idx, batch in enumerate(BATCHES):
    print('-' * 25 + 'Batch %d/%d' % (idx + 1, len(BATCHES)) + '-' * 25)

    with multiprocessing.Pool(8) as pool:
        with tqdm(total=(len(batch))) as pbar:
            for arg in batch:
                mined_args.append(extract_adus(arg))
                pbar.update()

# SAMPLE = args[0:10]
# mined_args = [i for i in SAMPLE]

-------------------------Batch 1/10-------------------------


  0%|          | 0/100 [00:00<?, ?it/s]


 t3_30oi71
['Income Increasingly Popular', 'Basic Income Increasingly', 'Increasingly Popular', 'Basic Income', 'Income Increasingly']
['Basic income', 'broad support', 'progressive left', 'left and libertarian', 'Basic']
['including Paul Krugman', 'economists including Paul', 'Centerleft economists including', 'Paul Krugman', 'including Paul']
['effective antipoverty measure', 'antipoverty measure', 'effective antipoverty', 'measure', 'effective']
['capital to labor', 'reduces inequality', 'inequality by redistributing', 'redistributing income', 'income from capital']

 t3_30oi71
['Income Increasingly Popular', 'Basic Income Increasingly', 'Increasingly Popular', 'Basic Income', 'Income Increasingly']
['Basic income', 'broad support', 'progressive left', 'left and libertarian', 'Basic']
['including Paul Krugman', 'economists including Paul', 'Centerleft economists including', 'Paul Krugman', 'including Paul']
['effective antipoverty measure', 'antipoverty measure', 'effective antipov

  0%|          | 0/100 [00:00<?, ?it/s]


 t3_2lvdpt
['Baltimore Ravens cheerleader', 'Baltimore Ravens', 'Ravens cheerleader', 'yearold boy', 'weeks ago']
['age of consent', 'plenty of instances', 'instances when older', 'older women', 'women often teachers']
['promiscuous than females', 'older men', 'men who prey', 'prey on young', 'young girls']
['species', 'cultures']
['Simply put men', 'Simply put', 'put men', 'men want sex', 'Simply']

 t3_6baulc
['wellaccepted fact', 'make money', 'money to make', 'money', 'wellaccepted']
['Tax law recognizes', 'Tax law', 'law recognizes', 'Tax', 'businesses']
['tax deductible', 'business', 'expenses', 'profit', 'tax']
['spend money commuting.For', 'money commuting.For employees', 'commuting.For employees commuting', 'spend money', 'money commuting.For']
['spend X dollars', 'work', 'spend', 'dollars', 'buscartrain']

 t3_6baulc
['wellaccepted fact', 'make money', 'money to make', 'money', 'wellaccepted']
['Tax law recognizes', 'Tax law', 'law recognizes', 'Tax', 'businesses']
['tax ded

In [45]:
# mined_args
import random
_ = random.randint(0, len(SAMPLE))
example = mined_args[_]
example

{'id': 't3_6jgnbe',
 'argument': [{'sentence': 'I think that for redistributing wealth to deal with increased automation expanding the welfare system makes way more sense than UBI.',
   'kp': ['sense than UBI',
    'increased automation expanding',
    'welfare system makes',
    'UBI',
    'redistributing wealth'],
   'stance': 'NEUTRAL',
   'aspect': 'sense than UBI',
   'topic': None,
   'concept': 'sense than UBI'},
  {'sentence': 'I want to begin this by defining UBI and the welfare system as I understand them.',
   'kp': ['defining UBI', 'welfare system', 'UBI', 'begin', 'defining'],
   'stance': 'NEUTRAL',
   'aspect': 'defining UBI',
   'topic': None,
   'concept': 'defining UBI'},
  {'sentence': 'UBI to provide all legal residents of a country a standard sum of cash unconnected to work The Welfare System Providing income to societys lowest earning citizens on a sliding scale so that the lower your income the more assistence you get.',
   'kp': ['Welfare System Providing',
    

In [None]:
### COUNTER-ARGS ###

In [None]:
### KP SELECTION ###

In [46]:
### WRITE TO DISK ###
file_name = "cmv_argument_extraction"
fout = open(f"../data/{file_name}.jsonl", "w")

with tqdm(total=(len(mined_args))) as pbar:
    with fout:
        for original_posts, mined in zip(args, mined_args):
            # Extended pre-formatted mined object
            mined["original_post"] = original_posts["argument"]
            mined["tgt_counter"] = original_posts["counter"]

            fout.write(json.dumps(mined))

            fout.write("\n")
            pbar.update()

logger.info(f"[{len(args)} Data Stored as {file_name}.jsonl]")

  0%|          | 0/100 [00:00<?, ?it/s]

INFO:ARGUMENT-EXTRACTOR:[10303 Data Stored as cmv_argument_extraction.jsonl]
