In [133]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from nltk import sent_tokenize, word_tokenize
from IPython.display import display
import re
from collections import Counter
import hashlib

"Topic: An argumentative text's topic is a description of what it is about. For argumentative texts from debates, we use the associated debate title as the topic. For CMV posts, their titles are also their conclusions; here, topic information is considered missing (denoted as 'NA' token)." [Syed et al. 2021](https://webis.de/downloads/publications/papers/syed_2021a.pdf)

In [149]:
def parse(name, data_dir='../../not-gitted/webis-conclugen-2021/topic'):
    """
    Load one split of the ConcluGen "topic" files as a list of argument records.

    Reads ``{data_dir}/{name}.source`` and ``{data_dir}/{name}.target`` and pairs
    them line by line. Each source line is split on its ``<|TAG|>`` markers into
    a topic and an argument text; the matching target line is the conclusion.

    Parameters
    ----------
    name : str
        Split name, e.g. ``'train'`` or ``'test'``.
    data_dir : str, optional
        Directory holding the ``.source`` / ``.target`` files. Defaults to the
        location this notebook has always read from.

    Returns
    -------
    args : list of dict
        One record per line pair with the keys ``'premises'``, ``'conclusion'``,
        ``'context'`` and ``'id'``. The conclusion is kept verbatim, including
        the line ending left by ``readlines()``. ``'id'`` is the SHA-256 hex
        digest of ``argument + target``.
    dup : set of str
        The ids that occur more than once within this split.

    Raises
    ------
    ValueError
        If a source line does not split into exactly four parts.
    """
    with open(f'{data_dir}/{name}.source', 'r', encoding='utf-8') as f:
        sources = f.readlines()
    with open(f'{data_dir}/{name}.target', 'r', encoding='utf-8') as f:
        targets = f.readlines()

    # Raw string: '\|' is not a valid escape in a normal string literal.
    marker = re.compile(r'<\|[A-Z]*\|>')

    args = list()
    seen = set()
    dup = set()
    for source, target in tqdm(zip(sources, targets)):
        _, topic, argument, __ = marker.split(source)
        h = hashlib.sha256((argument + target).encode()).hexdigest()
        # Test membership BEFORE recording the hash; otherwise every id
        # would be reported as a duplicate of itself.
        if h in seen:
            dup.add(h)
        else:
            seen.add(h)
        args.append({
            'premises': [{'text': argument}],
            'conclusion': target,
            'context': {'discussionTitle': topic},
            'id': h
        })
    return args, dup

In [150]:
# Parse both splits; each call returns the records plus the id set built by `parse`.
d = []
(d_train, dup_tr), (d_test, dup_te) = parse('train'), parse('test')

123538it [00:01, 67822.71it/s]
1374it [00:00, 41621.94it/s]


In [151]:
# Build the combined dataset from scratch so that re-running this cell does not
# append the splits a second time (the previous `d.extend(...)` doubled `d`).
d = d_train + d_test
len(d)

124912

# Filter conclusions

In [153]:
def get_conclusions(arguments):
    """
    Build a table of the distinct conclusions and their token counts.

    Parameters
    ----------
    arguments : iterable of dict
        Argument records; only the ``'conclusion'`` key is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``'conclusion'`` and ``'conc_len'`` (number of tokens from
        ``word_tokenize``), one row per distinct conclusion text in order of
        first appearance. The columns are present even for empty input.
    """
    rows = list()
    seen = set()
    for argument in tqdm(arguments):
        conclusion = argument['conclusion']
        # Tokenize each distinct conclusion only once; repeats would be
        # dropped afterwards anyway.
        if conclusion in seen:
            continue
        seen.add(conclusion)
        rows.append({'conclusion': conclusion, 'conc_len': len(word_tokenize(conclusion))})

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=['conclusion', 'conc_len'])

In [154]:
# One row per distinct conclusion text, together with its token count.
conclusions = get_conclusions(d)
conclusions

100%|████████████████████████████████████████████████████████████████████████| 124912/124912 [00:28<00:00, 4418.31it/s]


Unnamed: 0,conclusion,conc_len
0,It is unethical to expose children to the pres...,11
1,DACA Students should have zero financial aide ...,17
2,If you look like a man go to the men bathroom....,24
3,political subreddits should ban posts discoura...,7
4,"In some EU countries, unemployment benefits ar...",20
...,...,...
98134,Left-wing Brits should vote to Remain in the EU\n,9
98135,Internet anonymity enables citizens to exercis...,11
98136,Bran can become his own army by warging into a...,19
98137,Utilitarianism has no flaws\n,4


__Keep only conclusions whose length is at least mean + std__

In [158]:
# Length cut-off: one (population) standard deviation above the mean token count.
conc_lengths = conclusions['conc_len'].to_numpy()
mean_length = conc_lengths.mean()
std_length = conc_lengths.std()
threshold = mean_length + std_length
print(f'mean = {mean_length}\n std = {std_length}')

mean = 18.218098819022
 std = 9.168070354484506


In [159]:
# Show the cut-off computed in the previous cell (mean_length + std_length).
threshold

27.386169173506506

In [160]:
# Keep the conclusions whose token count reaches the threshold.
is_long = conclusions['conc_len'].ge(threshold)
filtered_conclusions = conclusions.loc[is_long]
filtered_conclusions

Unnamed: 0,conclusion,conc_len
7,Just because women are fine with being asked o...,28
9,"Gen-ed requirements in college, particularly i...",40
13,"Not necessarily, if it were correctly framed a...",80
14,Imagine you had to build a Global Hawk with pi...,45
16,Not having a consistent definition of an objec...,35
...,...,...
98113,"If teachers desire to bias students, they can ...",47
98116,Economic facts and conclusions of a legalisati...,29
98123,"That is no reason to excuse lies such as £350,...",53
98124,Until the Middle Ages Church leaders were elec...,34


__Conclusions are not questions__ (Eggs 2000, in: Klaus Brinker et al. (eds.), *Text- und Gesprächslinguistik / Linguistics of Text and Conversation: An International Handbook of Contemporary Research*, vol. 1, p. 397 ff.)

Questions are introduced by:

In [161]:
# Lower-cased first tokens that mark a conclusion as a question.
question_marker = (
    # interrogative words
    'who', 'where', 'what', 'how', 'which', 'why',
    # auxiliary verbs
    'is', 'are', 'do', 'does', 'did', 'was', 'were',
)

In [162]:
# Drop every conclusion whose first token (lower-cased) is a question marker.
first_tokens = filtered_conclusions['conclusion'].map(lambda text: word_tokenize(text)[0].lower())
filtered_conclusions = filtered_conclusions[~first_tokens.isin(question_marker)]
filtered_conclusions

Unnamed: 0,conclusion,conc_len
7,Just because women are fine with being asked o...,28
9,"Gen-ed requirements in college, particularly i...",40
13,"Not necessarily, if it were correctly framed a...",80
14,Imagine you had to build a Global Hawk with pi...,45
16,Not having a consistent definition of an objec...,35
...,...,...
98113,"If teachers desire to bias students, they can ...",47
98116,Economic facts and conclusions of a legalisati...,29
98123,"That is no reason to excuse lies such as £350,...",53
98124,Until the Middle Ages Church leaders were elec...,34


In [163]:
# Summary statistics of the token counts of the remaining conclusions.
filtered_conclusions.describe()

Unnamed: 0,conc_len
count,13124.0
mean,35.827568
std,8.568806
min,28.0
25%,30.0
50%,33.0
75%,39.0
max,236.0


In [164]:
# Conclusion texts that passed both filters; used for membership tests when filtering the dataset.
conclusions_we_want = set(filtered_conclusions['conclusion'])

In [165]:
# Number of distinct conclusion texts kept.
len(conclusions_we_want)

13124

# Filter sentences

In [166]:
# Attach the sentence split of each argument text to its (single) premise entry.
for arg in tqdm(d):
    premise = arg['premises'][0]
    premise['sentences'] = sent_tokenize(premise['text'])

100%|████████████████████████████████████████████████████████████████████████| 124912/124912 [00:40<00:00, 3072.83it/s]


In [167]:
def keep(arg):
    """
    Tell whether the first premise's text has at least one alphabetic character.

    Returns True as soon as such a character is found, False otherwise
    (including for an empty text).
    """
    text = arg['premises'][0]['text']
    for char in text:
        if char.isalpha():
            return True
    return False

In [168]:
# Ids of arguments that have more than two sentences and contain at least one letter.
filtered_arguments = [
    arg['id']
    for arg in tqdm(d)
    if len(arg['premises'][0]['sentences']) > 2 and keep(arg)
]

100%|██████████████████████████████████████████████████████████████████████| 124912/124912 [00:00<00:00, 428220.94it/s]


In [169]:
# Ids as a set, used for membership tests when filtering the dataset.
arguments_we_want = set(filtered_arguments)

In [170]:
# Number of distinct argument ids kept.
len(arguments_we_want)

69110

# Filter dataset

In [171]:
# Keep the arguments that pass both the sentence filter and the conclusion filter.
filtered_d = [
    arg
    for arg in tqdm(d)
    if arg['id'] in arguments_we_want and arg['conclusion'] in conclusions_we_want
]

100%|██████████████████████████████████████████████████████████████████████| 124912/124912 [00:00<00:00, 821816.81it/s]


In [172]:
# Number of arguments left after both filters.
len(filtered_d)

8686

In [173]:
# Persist the filtered arguments as indented JSON.
serialized = json.dumps(filtered_d, indent=4)
with open('data/filtered-conclugen.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)

# Contexts

In [174]:
# Number of kept arguments per discussion title. `Counter` (already imported at
# the top of the notebook) replaces the hand-written dict bookkeeping; it is a
# dict subclass, so `.items()` and key lookups keep working.
discussion_title_arg_counts = Counter(
    argument['context']['discussionTitle'] for argument in tqdm(filtered_d)
)

100%|██████████████████████████████████████████████████████████████████████████| 8686/8686 [00:00<00:00, 666234.93it/s]


In [175]:
# One row per discussion title with its argument count. The plain constructor is
# used because `DataFrame.from_dict` is meant for dict input; explicit columns
# keep the schema stable even if there are no titles.
contexts = pd.DataFrame(list(discussion_title_arg_counts.items()), columns=['title', 'count'])

In [176]:
# Keep titles with at least two arguments and drop the 'NA' placeholder title.
# Both conditions are combined into one mask; chaining two `[...]` selections
# applied a mask built on the unfiltered frame and triggered a reindex warning.
has_enough_args = contexts['count'] >= 2
has_topic = contexts['title'] != 'NA'
filtered_contexts = contexts[has_enough_args & has_topic]
filtered_contexts

  """Entry point for launching an IPython kernel.


Unnamed: 0,title,count
1,Should the UK Remain in the EU if the only Alt...,19
2,Is Morality Objective?,67
3,The Rebel Alliance would defeat the United Fed...,66
4,"Should ""women-only"" spaces be open to anyone i...",27
5,Time for a Maximum Wage? Should the US Limit C...,10
...,...,...
536,Should Internet Access Be A Basic Human Right?,2
541,Should Commercial Surrogacy be Legal in Libera...,2
542,What will life look like once humans achieve l...,2
544,Mike Pence Would Make a Better President than ...,2


In [177]:
# Summary statistics of the number of arguments per kept discussion title.
filtered_contexts.describe()

Unnamed: 0,count
count,320.0
mean,8.834375
std,14.371059
min,2.0
25%,2.0
50%,4.0
75%,8.0
max,93.0


In [178]:
# Titles that survived the context filter. This set was previously never defined
# in the notebook, so the cell only ran with leftover kernel state.
context_ids = set(filtered_contexts['title'])

# Keep the arguments whose discussion title is one of those titles.
ready_to_summarize = [
    argument
    for argument in tqdm(filtered_d)
    if argument['context']['discussionTitle'] in context_ids
]

100%|██████████████████████████████████████████████████████████████████████████| 8686/8686 [00:00<00:00, 228987.77it/s]


In [179]:
# Number of arguments selected for summarization.
len(ready_to_summarize)

2827

In [181]:
# Persist the arguments selected for summarization as indented JSON.
serialized = json.dumps(ready_to_summarize, indent=4)
with open('data/filtered-conclugen-to-summarize.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)