### Manufacturing data set

In [1]:
import pandas as pd
df = pd.read_csv('./data/ground_truth.csv')
print(df)
predict_df = pd.read_csv('./result/1by1+predictions.csv')
print(predict_df)

                                              topic  \
0     Assisted suicide should be a criminal offence   
1     Assisted suicide should be a criminal offence   
2     Assisted suicide should be a criminal offence   
3     Assisted suicide should be a criminal offence   
4     Assisted suicide should be a criminal offence   
...                                             ...   
5545           The USA is a good country to live in   
5546           The USA is a good country to live in   
5547           The USA is a good country to live in   
5548           The USA is a good country to live in   
5549           The USA is a good country to live in   

                                               argument  \
0     a cure or treatment may be discovered shortly ...   
1     A patient should be able to decide when they h...   
2     a person has the right to end their suffering ...   
3     a person should have the dignity to choose how...   
4     a person should have the right to be a

In [2]:
predict_df_header = predict_df.columns
print(predict_df_header)

Index(['topic', 'stance', 'argument', 'key_point'], dtype='object')


In [3]:
def remove_duplicate_sentences(sentences):
    unique_sentences = []
    seen_sentences = set()

    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)

    return unique_sentences

In [4]:
ArgKP2021_topics = [
    "Assisted suicide should be a criminal offence",
    "Homeschooling should be banned",
    "The vow of celibacy should be abandoned",
    "We should abandon marriage",
    "We should abandon the use of school uniform",
    "We should abolish capital punishment",
    "We should abolish intellectual property rights",
    "We should abolish the right to keep and bear arms",
    "We should adopt an austerity regime",
    "We should adopt atheism",
    "We should adopt libertarianism",
    "We should ban human cloning",
    "We should ban private military companies",
    "We should ban the use of child actors",
    "We should close Guantanamo Bay detention camp",
    "We should end affirmative action",
    "We should end mandatory retirement",
    "We should fight for the abolition of nuclear weapons",
    "We should fight urbanization",
    "We should introduce compulsory voting",
    "We should legalize cannabis",
    "We should legalize prostitution",
    "We should legalize sex selection",
    "We should prohibit flag burning",
    "We should prohibit women in combat",
    "We should subsidize journalism",
    "We should subsidize space exploration",
    "We should subsidize vocational education",
    "Routine child vaccinations should be mandatory",
    "Social media platforms should be regulated by the government",
    "The USA is a good country to live in"
]

predict_KPs = []
for topic in ArgKP2021_topics:
    topic_p_df = predict_df[(predict_df['topic'] == topic) & (predict_df['stance'] == 1)]
    topic_n_df = predict_df[(predict_df['topic'] == topic) & (predict_df['stance'] == 0)]
    topic_p_predicts = list(topic_p_df['key_point'].values)
    topic_n_predicts = list(topic_n_df['key_point'].values)
    deduplicated_topic_p_arguments = remove_duplicate_sentences(topic_p_predicts)
    deduplicated_topic_n_arguments = remove_duplicate_sentences(topic_n_predicts)
    predict_KPs.append(deduplicated_topic_p_arguments)
    predict_KPs.append(deduplicated_topic_n_arguments)
print("list length: " + str(len(predict_KPs)))
total_kps = 0
for kps in predict_KPs:
    total_kps += len(kps)
    print(len(kps))
print("total kps: " + str(total_kps))

list length: 62
125
121
115
129
122
112
125
111
116
118
123
109
91
123
110
122
126
108
105
123
124
113
122
123
125
106
123
121
127
113
119
108
120
78
116
80
127
106
114
127
137
109
132
97
118
126
99
118
107
130
118
110
132
115
128
95
168
108
134
99
143
66
total kps: 7215


### Calculate cosine similarity with Bert embedding

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import itertools

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def sentence_relevance_test(sentences):
    # Tokenize and convert sentences to BERT embeddings
    encoded_input = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Extract the embeddings from the model output
    sentence_embeddings = model_output.last_hidden_state.mean(dim=1).numpy()

    # Calculate cosine similarity between each pair of sentences
    pairs = list(itertools.combinations(range(len(sentences)), 2))

    for pair in pairs:
        idx1, idx2 = pair
        similarity_score = cosine_similarity([sentence_embeddings[idx1]], [sentence_embeddings[idx2]])[0][0]
        print(f"Similarity between Sentence {idx1 + 1} and Sentence {idx2 + 1}: {similarity_score}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
print(predict_KPs[0])

['The potential discovery of a cure or treatment shortly after a life has been ended unnecessarily.', 'Assisted suicide could potentially lead to exploitation by greedy relatives who may use it for financial benefits.', "The user views assisted suicide as equivalent to murder, regardless of the person's consent, placing moral responsibility on the one who aids in the act.", "The user questions the concept of 'reasonable certainty' and argues against ending life as a solution, highlighting that many people recover due to alternative treatments or medical advances. They also emphasize on the ethical implications of involvement of others in such decisions.", "The view that every individual involved in causing another person's death should face legal penalties.", 'The potential for abuse and exploitation of assisted suicide by those seeking personal gain from sick or ill relatives serves as a reason to prohibit it.', 'Potential misuse of assisted suicide, with individuals possibly being pr

In [10]:
print(sentence_relevance_test(predict_KPs[0]))

Similarity between Sentence 1 and Sentence 2: 0.78702712059021
Similarity between Sentence 1 and Sentence 3: 0.7449733018875122
Similarity between Sentence 1 and Sentence 4: 0.758685827255249
Similarity between Sentence 1 and Sentence 5: 0.8182483315467834
Similarity between Sentence 1 and Sentence 6: 0.798798143863678
Similarity between Sentence 1 and Sentence 7: 0.8017703294754028
Similarity between Sentence 1 and Sentence 8: 0.7935909032821655
Similarity between Sentence 1 and Sentence 9: 0.7456387281417847
Similarity between Sentence 1 and Sentence 10: 0.7539781928062439
Similarity between Sentence 1 and Sentence 11: 0.8125429153442383
Similarity between Sentence 1 and Sentence 12: 0.7569330930709839
Similarity between Sentence 1 and Sentence 13: 0.7573435306549072
Similarity between Sentence 1 and Sentence 14: 0.7319722175598145
Similarity between Sentence 1 and Sentence 15: 0.7184835076332092
Similarity between Sentence 1 and Sentence 16: 0.7160773277282715
Similarity between Sen

### Cluster predict KPs with Bert Embedding

In [5]:
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
import torch
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def clustering_with_Bert(sentences, k) -> []:

    # Tokenize and convert sentences to BERT embeddings
    encoded_input = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Extract the embeddings from the model output
    sentence_embeddings = model_output.last_hidden_state.mean(dim=1).numpy()

    # # Choose the number of clusters (k)
    # k = 2

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(sentence_embeddings)

    # Get cluster labels for each sentence
    labels = kmeans.labels_

    # Get the cluster centers (optional)
    cluster_centers = kmeans.cluster_centers_

    # Find representative sentences from each cluster
    representative_indices = pairwise_distances_argmin_min(cluster_centers, sentence_embeddings)[0]

    result = []
    # Print the representative sentences
    for idx in representative_indices:
        result.append(sentences[idx])
        print(sentences[idx])
    return result
    # # Print the sentences and their corresponding cluster labels
    # for sentence, label in zip(sentences, labels):
    #     print(f"Cluster {label}: {sentence}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import pandas as pd
df = pd.read_csv('./data/processed_ArgKP21.csv')
kps = list(df['kep_points'].values)
print(len(kps))

62


In [7]:
print(len(predict_KPs))

62


In [8]:
import ast
# Convert the string representation to a list
print(len(ast.literal_eval(kps[0])))

6


In [9]:
import os

# Set OMP_NUM_THREADS to 1
os.environ["OMP_NUM_THREADS"] = "1"

# Verify the setting
print("OMP_NUM_THREADS:", os.environ.get("OMP_NUM_THREADS"))


OMP_NUM_THREADS: 1


In [10]:
final_kps = []
for id, item in enumerate(predict_KPs):
    # k = len(ast.literal_eval(kps[id]))
    k=5
    print("id: "+ str(id) + "  k-means: "+ str(k))
    final_kps.append(clustering_with_Bert(item, k))


id: 0  k-means: 5
Assisted suicide may be exploited by people lacking mental capacity to understand its ultimate consequences, hence it should be criminalized.
Assisted suicide is equivalent to murder and should hence be considered a criminal offence.
Assisted suicide may be exploited by people lacking mental capacity to understand its ultimate consequences, hence it should be criminalized.
Assisting in suicide is equivalent to committing murder and should carry criminal charges.
Assisted suicide is a form of ending life and thus should be considered unlawful.
id: 1  k-means: 5
Assisted suicide must not be a criminal offense as it provides relief from grave suffering for seriously ill people.
People should have the autonomy to choose their own death, especially if they have a terminal disease, which challenges the notion of assisted suicide being a criminal act.
Individuals suffering from painful medical conditions should have the right to opt for assisted suicide.
Assisted suicide pro

In [13]:
print(final_kps[0])

['Assisted suicide may be exploited by people lacking mental capacity to understand its ultimate consequences, hence it should be criminalized.', 'Assisted suicide is equivalent to murder and should hence be considered a criminal offence.', 'Assisted suicide may be exploited by people lacking mental capacity to understand its ultimate consequences, hence it should be criminalized.', 'Assisting in suicide is equivalent to committing murder and should carry criminal charges.', 'Assisted suicide is a form of ending life and thus should be considered unlawful.']


In [17]:
df = pd.read_csv('./result/1by1+predictions_in_group.csv')
print(df)
print(len(final_kps))

                                                topic  stance  \
0       Assisted suicide should be a criminal offence       1   
1       Assisted suicide should be a criminal offence       0   
2                      Homeschooling should be banned       1   
3                      Homeschooling should be banned       0   
4             The vow of celibacy should be abandoned       1   
..                                                ...     ...   
57     Routine child vaccinations should be mandatory       0   
58  Social media platforms should be regulated by ...       1   
59  Social media platforms should be regulated by ...       0   
60               The USA is a good country to live in       1   
61               The USA is a good country to live in       0   

                                            arguments  \
0   ["a cure or treatment may be discovered shortl...   
1   ['`people reach their limit when it comes to t...   
2   ['all children attending school ensures that

In [23]:
for index, value in enumerate(final_kps):
    df.at[index,'predict_kps'] = str(value)

print(df)


                                                topic  stance  \
0       Assisted suicide should be a criminal offence       1   
1       Assisted suicide should be a criminal offence       0   
2                      Homeschooling should be banned       1   
3                      Homeschooling should be banned       0   
4             The vow of celibacy should be abandoned       1   
..                                                ...     ...   
57     Routine child vaccinations should be mandatory       0   
58  Social media platforms should be regulated by ...       1   
59  Social media platforms should be regulated by ...       0   
60               The USA is a good country to live in       1   
61               The USA is a good country to live in       0   

                                            arguments  \
0   ["a cure or treatment may be discovered shortl...   
1   ['`people reach their limit when it comes to t...   
2   ['all children attending school ensures that

In [24]:
df.to_csv('./result/1by1+predictions_in_group.csv', index=False)
print("Add successfully.")

Add successfully.


In [26]:
f = open("./result/KP_generation_clustering.txt",'w', encoding='UTF-8')
for batch_id, topic in enumerate(ArgKP2021_topics):
    print(batch_id*2)
    print("-------------------Processing-------------------")
    f.write(str(batch_id*2)+"\n")
    f.write("topic: "+topic+"\n")
    f.write("\n")
    for sentence in final_kps[batch_id*2]:
        f.write(sentence+"\n")
    f.write("\n")
    f.write(str(batch_id*2+1)+"\n")
    f.write("topic: "+topic+"\n")
    f.write("\n")
    for sentence in final_kps[batch_id*2+1]:
        f.write(sentence+"\n")
    f.write("\n")

print("-----------------Finished--------------------------")

0
-------------------Processing-------------------
2
-------------------Processing-------------------
4
-------------------Processing-------------------
6
-------------------Processing-------------------
8
-------------------Processing-------------------
10
-------------------Processing-------------------
12
-------------------Processing-------------------
14
-------------------Processing-------------------
16
-------------------Processing-------------------
18
-------------------Processing-------------------
20
-------------------Processing-------------------
22
-------------------Processing-------------------
24
-------------------Processing-------------------
26
-------------------Processing-------------------
28
-------------------Processing-------------------
30
-------------------Processing-------------------
32
-------------------Processing-------------------
34
-------------------Processing-------------------
36
-------------------Processing-------------------
38
--------------

In [33]:
import ast
test_df = pd.read_csv('./result/1by1+predictions_in_group.csv')
prediction = test_df['predict_kps'].values.tolist()
a = ast.literal_eval(prediction[0])
print(a[4])

Assisted suicide is a form of ending life and thus should be considered unlawful.
