### Import data

In [1]:
import pandas as pd
# Load the per-argument key-point-matching table. It carries both the
# "group" and the "1by1" matched_kp / confidence_score columns.
df = pd.read_csv(filepath_or_buffer='./data/KPM_qwen_in_group.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [2]:
# Load the table of predicted key points; the matching loop looks rows up
# in it by their 'topic' and 'stance' columns.
prediction_df = pd.read_csv(
    filepath_or_buffer='./data/qwen_1by1+predictions_in_group.csv'
)
print(prediction_df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points',
       'predict_kps(avg_embedding)', 'predict_kps(best_embedding)',
       'predict_kps(PCA_embedding)'],
      dtype='object')


### Predict the matched KP and its confidence score directly with Qwen

In [3]:
import os
import random
import re
import time
from http import HTTPStatus

import dashscope
import yaml

In [None]:
# Read the DashScope API key from the project's YAML configuration so the
# secret does not have to live in the notebook itself.
with open("../conf/index.yaml") as config_file:
    credentials = yaml.safe_load(config_file)

qwen_api_key = credentials['environment_variables']['QWEN_DASHSCOPE_API_KEY']

In [4]:
# Matches "best_kp: <text> confidence_score: <number>".
# DOTALL lets the key point span several lines; the numeric group only accepts
# a well-formed decimal so a trailing sentence period ("1.0.") cannot make
# float() fail and trigger a needless retry.
_KP_RESPONSE_PATTERN = re.compile(
    r"best_kp:\s*(.*?)\s*confidence_score:\s*(\d+(?:\.\d+)?|\.\d+)",
    re.DOTALL,
)


def _resolve_qwen_api_key():
    """Return the DashScope API key from notebook config or the environment."""
    api_key = (
        globals().get("qwen_api_key")
        or os.environ.get("QWEN_DASHSCOPE_API_KEY")
        or os.environ.get("DASHSCOPE_API_KEY")
    )
    if not api_key:
        raise RuntimeError(
            "No DashScope API key found: define `qwen_api_key` (see the "
            "credentials cell) or set QWEN_DASHSCOPE_API_KEY / DASHSCOPE_API_KEY."
        )
    return api_key


def generating(topic, argument, kps, max_retries=10, retry_delay=1.0):
    """Ask qwen-max for the candidate key point that best matches an argument.

    Args:
        topic: Debate topic, interpolated into the system prompt.
        argument: Argument text to be matched.
        kps: Candidate key points, formatted into the user message as text.
        max_retries: Maximum number of API attempts before giving up.
        retry_delay: Base delay in seconds between attempts; the wait grows
            linearly with the attempt number. Use 0 to retry immediately.

    Returns:
        A ``(kp, confidence_score)`` tuple: the key-point text and the score
        parsed from the model's reply as a float.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        RuntimeError: If no API key is configured, or if every attempt fails
            (request error, non-OK status, or unparseable reply).
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # Resolved outside the retry loop: a missing key is a configuration error
    # that retrying cannot fix.
    api_key = _resolve_qwen_api_key()

    # The prompt is also built once, outside the retry loop, so that a
    # programming error in the inputs surfaces immediately.
    system_prompt = f"""
            You need to find the best matched kp to the given argument from a list of candidate Kps based on the topic "{topic}",
        and then return this best kp and its confidence score between 0 and 1. 0 represents not match, 1 represents match.
        The result format is as followed: best_kp: .... confidence_score:a number
            """
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': f"argument:{argument}, kps:{kps}"},
    ]

    last_error = None
    for attempt in range(1, max_retries + 1):
        if attempt > 1 and retry_delay > 0:
            # Linear backoff so rate-limit errors get a chance to clear.
            time.sleep(retry_delay * (attempt - 1))
        try:
            dashscope.api_key = api_key
            response = dashscope.Generation.call(model="qwen-max",
                                       messages=messages,
                                       # Use a fresh random seed per attempt; if no seed is
                                       # given, the service defaults to 1234.
                                       seed=random.randint(1, 10000),
                                       # Return the output in "message" format.
                                       result_format='message')
            if response.status_code == HTTPStatus.OK:
                output = response['output']["choices"][0]["message"]["content"]
                print(output)
                match = _KP_RESPONSE_PATTERN.search(output)
                if match:
                    kp = match.group(1).strip()
                    confidence_score = float(match.group(2))
                    print("Key Point:", kp)
                    print("Confidence Score:", confidence_score)
                    return kp, confidence_score
                last_error = "response did not match the expected format"
                print("Error. Regenerating string...")
            else:
                last_error = 'Request id: %s, Status code: %s, error code: %s, error message: %s' % (
                    response.request_id, response.status_code,
                    response.code, response.message
                )
                print(last_error)
        except Exception as e:
            # Best-effort: transient SDK/network errors are retried, but only
            # up to max_retries instead of recursing without bound.
            last_error = e
            print(f"An error occurred: {e}. Retrying...")

    raise RuntimeError(
        f"generating() failed after {max_retries} attempts; last error: {last_error}"
    )

In [7]:
# Positional offset at which matching starts.
# NOTE(review): presumably rows before this offset were filled in by earlier
# runs - confirm before changing.
START_ROW = 6000

# Candidate key points depend only on (topic, stance), so look them up once
# per pair instead of re-filtering prediction_df for every argument.
candidate_cache = {}

for index, row in df.iloc[START_ROW:].iterrows():
    topic = row['topic']
    stance = row['stance']
    argument = row['argument']

    group_key = (topic, stance)
    if group_key not in candidate_cache:
        filtered_df = prediction_df[(prediction_df['topic'] == topic) & (prediction_df['stance'] == stance)]
        candidate_kps = filtered_df['predict_kps(avg_embedding)'].values.tolist()
        # An empty match would otherwise raise IndexError on candidate_kps[0].
        candidate_cache[group_key] = candidate_kps[0] if candidate_kps else None
    candidates = candidate_cache[group_key]

    print(str(index)+": ")
    if candidates is None:
        print(f"No candidate key points for topic={topic!r}, stance={stance!r}; skipping row.")
        continue
    print(candidates)

    kp, confidence_score = generating(topic, argument, candidates)
    df.at[index, 'matched_kp(1by1)'] = kp
    df.at[index, 'confidence_score(1by1)'] = confidence_score

6000: 
['Government subsidies for journalism may lead to bias and self-censorship due to fear of losing financial support.', 'State subsidies for journalism lack majority public support, rendering them undemocratic and inappropriate for implementation by the government.', 'The abundance of existing news outlets suggests that subsidizing journalism may not be necessary.', 'Government subsidies for journalism are undesirable due to the perception that journalism has become overly commercialized and should maintain independence from state influence.', 'Avoid subsidizing journalism to prevent content bias driven by financial incentives.']
best_kp: subsidizing journalism only succeeds in propping up weak and poorly run organizations which otherwise would soon fail
confidence_score:1.0
Key Point: subsidizing journalism only succeeds in propping up weak and poorly run organizations which otherwise would soon fail
Confidence Score: 1.0
6001: 
['Government subsidies for journalism may lead to b

In [8]:
# Persist the updated matches. This writes to the same path the table was
# loaded from, so the input file is overwritten in place.
df.to_csv(path_or_buf='./data/KPM_qwen_in_group.csv', index=False)
print("Add new data successfully!!!!!")

Add new data successfully!!!!!


In [1]:
import pandas as pd
# Reload the KPM table from disk (the file the save cell writes) and list
# its columns.
df = pd.read_csv(filepath_or_buffer='./data/KPM_qwen_in_group.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [3]:
# Arithmetic mean of the 1-by-1 confidence scores over all rows.
score_list = df['confidence_score(1by1)'].tolist()
print(sum(score_list) / len(score_list))

0.8723818734456921
