### Import Data

In [1]:
import pandas as pd
df = pd.read_csv('./qwen/data/qwen_1by1+predictions_in_group.csv')
print(df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points',
       'predict_kps(avg_embedding)', 'predict_kps(best_embedding)',
       'predict_kps(PCA_embedding)'],
      dtype='object')


In [2]:
predict_df = pd.read_csv('./qwen/data/qwen_1by1+predictions.csv')
print(predict_df.columns)

Index(['topic', 'stance', 'argument', 'key_point', 'embedding(0_1_2_3)',
       'embedding(4_7_8_15)', 'embedding(5_6_9_10)', 'embedding(11_12_13_14)',
       'embedding(16_17_18_19)', 'embedding(20_21_22_23)',
       'embedding(24_25_26_27)', 'avg_embedding'],
      dtype='object')


### Do Agglomerative Hierarchical Clustering

In [7]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

def agglomerative_hierarchical_clustering(embeddings, n_clusters):
    # Convert embeddings to numpy array
    embeddings = np.array(embeddings)

    # Compute cosine similarity matrix
    cosine_similarities = cosine_similarity(embeddings)

    # Perform Agglomerative Hierarchical Clustering
    cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='average')
    clusters = cluster.fit_predict(cosine_similarities)

    # Compute centroids of each cluster
    centroids = []
    for i in range(n_clusters):
        cluster_embeddings = embeddings[clusters == i]
        centroid = np.mean(cluster_embeddings, axis=0)
        centroids.append(centroid)

    # Find index of the closest embedding to each centroid
    centroid_indices = []
    for centroid in centroids:
        distances = [np.linalg.norm(embedding - centroid) for embedding in embeddings]
        centroid_index = np.argmin(distances)
        centroid_indices.append(centroid_index)

    return centroid_indices

In [8]:
import ast
for index, row in df.iloc[0:2000].iterrows():
    print(str(index)+": ")
    input_list = []
    selected_df = predict_df[(predict_df['topic'] == row['topic']) & (predict_df['stance'] == row['stance'])]
    sentence_embeddings = selected_df['embedding(16_17_18_19)'].values.tolist()
    for embedding in sentence_embeddings:
        input_list.append(ast.literal_eval(embedding))
    clusters_index = agglomerative_hierarchical_clustering(input_list, 5)
    print(clusters_index)
    final_list=[]
    for cluster_index in clusters_index:
        sentence_list = selected_df['key_point'].values.tolist()
        final_list.append(sentence_list[cluster_index])
    print(final_list)
    df.at[index,'predict_kps(PCA_embedding)'] = str(final_list)
print("Finished")

0: 
[13, 63, 116, 11, 61]
['Assisted suicide constitutes murder and ought to be legally classified as a criminal offense.', 'Assisted suicide should be considered a criminal offense akin to murder, as proponents argue that natural death is the preferred course.', 'The sanctity of human life necessitates that assisted suicide remains a criminal offence.', 'Assisted suicide is considered a criminal offense due to its inherent nature involving the intentional causing of death.', 'Assisted suicide constitutes a criminal offense due to the inevitable outcome of death, implicating assistants as having a direct role in the loss of life.']
1: 
[70, 59, 89, 25, 75]
['Individuals with terminal illnesses should have the right to choose assisted suicide.', 'Assisted suicide provides terminally ill patients the option to alleviate prolonged pain and suffering by voluntarily ending their lives.', 'Assisted suicide should be legally available for individuals suffering from incurable diseases.', 'Assi

In [9]:
df.to_csv('./qwen/data/qwen_1by1+predictions_in_group.csv', index=False)
print("Add new data successfully!!!!!")

Add new data successfully!!!!!


In [2]:
import pandas as pd
df = pd.read_csv('./result/KPM_cos_si_bert_embedding.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(v1)', 'confidence_score(v1)',
       'matched_kp(v2)', 'confidence_score(v2)', 'matched_kp(c_avg)',
       'confidence_score(c_avg)', 'matched_kp(c_best)',
       'confidence_score(c_best)', 'matched_kp(c_pca)',
       'confidence_score(c_pca)'],
      dtype='object')


In [4]:
c_avg_list = df['confidence_score(c_avg)'].values.tolist()
print("Confidence score(c_avg): "+ str(sum(c_avg_list)/len(c_avg_list)))
c_best_list = df['confidence_score(c_best)'].values.tolist()
print("Confidence score(c_best): "+ str(sum(c_best_list)/len(c_best_list)))
c_pca_list = df['confidence_score(c_pca)'].values.tolist()
print("Confidence score(c_pca): " + str(sum(c_pca_list)/len(c_pca_list)))

Confidence score(c_avg): 0.7651922760351682
Confidence score(c_best): 0.7690315096129301
Confidence score(c_pca): 0.7777170137118555
