### Import data

In [59]:
import pandas as pd
# Load the argument table; later cells fill in its 'matched_kp(group)' and
# 'confidence_score(group)' columns and write it back to this same path.
df = pd.read_csv('./data/KPM_qwen_in_hybrid.csv')
# List the column names as a quick sanity check of the loaded file.
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [58]:
# Load the prediction table; the matching loop filters it by topic and stance
# and reads its 'predict_kps' column as the candidate key points.
prediction_df = pd.read_csv('./data/ArgKP21+predictions(v4).csv')
# List the column names as a quick sanity check of the loaded file.
print(prediction_df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points', 'predict_kps'], dtype='object')


### Predict matched KPs in a group and select the best one ourselves

In [3]:
import random
from http import HTTPStatus
import dashscope
import re
import ast
import yaml

In [None]:
# Read the DashScope API key from the project's YAML config instead of
# hard-coding it in the notebook. safe_load parses plain YAML data only.
with open("../conf/index.yaml") as f:
    credentials = yaml.safe_load(f)
# The key is later assigned to dashscope.api_key inside generating().
qwen_api_key = credentials['environment_variables']['QWEN_DASHSCOPE_API_KEY']

In [5]:
def correct_format_string(input_str):
    """Patch quoting mistakes in a model reply so it can parse as a list literal.

    Three regex substitutions are applied in order to ``input_str``:

    1. After a ``[`` (and optional whitespace), an optional single quote
       followed by a letter is rewritten so the element opens with ``"``.
    2. Before a ``, <digit>`` sequence, a closing ``"`` is inserted when the
       preceding character is not already a quote.
    3. A single quote directly before ``, <digit>`` becomes ``"``.

    Returns the repaired string. Step 2 also fires when the first element is
    itself a number (``[[1, 0.5]]`` becomes ``[[1", 0.5]]``), so this function
    only suits replies whose first element is text.
    """
    repair_steps = (
        (r'(\[\s*)\'?([a-zA-Z])', r'\1"\2'),
        (r'([^"\'])\s*,\s*(\d)', r'\1", \2'),
        (r'\'\s*,\s*(\d)', r'", \1'),
    )

    repaired = input_str
    for pattern, replacement in repair_steps:
        repaired = re.sub(pattern, replacement, repaired)
    return repaired

In [7]:
def is_before_first_comma_number(input_str):
    """Tell whether the character right before the first comma is a digit.

    Returns False when the string has no comma or when the comma is the very
    first character (there is nothing in front of it to inspect).
    """
    first_comma = input_str.find(',')
    # find() yields -1 when absent, and index 0 has no preceding character.
    return first_comma > 0 and input_str[first_comma - 1].isdigit()

In [8]:
import re

def contains_kp_before_comma(input_str):
    """Report whether the text holds a "kp1".."kp5" label, in any letter case.

    The trailing comma in the pattern is optional and the search covers the
    whole string, so a label counts wherever it appears, not only in front of
    the first comma.
    """
    return re.search(r'kp([1-5])\s*,?', input_str, re.IGNORECASE) is not None

In [9]:
import re

def replace_kp_before_comma(input_str):
    """Rewrite every "kpN" label (N in 1-5, any letter case) as "N,".

    Whitespace and an optional comma following the label are consumed by the
    match, and a comma is always emitted after the digit, even when the
    original label had none. All occurrences in the string are replaced.
    """
    return re.sub(r'kp([1-5])\s*,?', r'\1,', input_str, flags=re.IGNORECASE)

In [17]:
def _repair_model_output(raw_output):
    """Apply the notebook's text repairs so the reply can parse as a literal."""
    cleaned_output = raw_output.replace('\\"', '"')
    if not is_before_first_comma_number(cleaned_output):
        # First element is text: fix up missing or mismatched quotes.
        return correct_format_string(cleaned_output)
    if contains_kp_before_comma(cleaned_output):
        # Elements are bare labels such as kp1: turn them into plain indices.
        return replace_kp_before_comma(cleaned_output)
    return cleaned_output


def _normalize_score_list(output_list):
    """Map a parsed reply onto a shape the caller accepts, or None to retry.

    Accepted shapes (each with five entries):
      * [[kp_or_index, score], ...]  returned unchanged; numeric first
        elements must be exactly 1..5 in order
      * [[score, kp_text], ...]      returned with each pair swapped
      * [[[kp, score]], ...]         returned with the extra nesting removed
      * [[value], ...]               returned unchanged
      * [[...five entries...]]       the single outer wrapper is removed

    Structures that cannot be indexed as expected raise; the caller treats
    that as a failed attempt.
    """
    numeric = (int, float)

    if len(output_list) == 5:
        first = output_list[0]
        if len(first) == 1:
            # Only call len() on the inner value when it is a sequence. The
            # previous code called len() on a bare score, which raised
            # TypeError and made the [[score], ...] shape impossible to return.
            if isinstance(first[0], (list, tuple)) and len(first[0]) == 2:
                return [wrapped[0] for wrapped in output_list]
            return output_list

        if isinstance(output_list[1][1], numeric):
            if isinstance(output_list[1][0], numeric):
                # Numeric labels are only usable as 1-based kp positions.
                labels = [entry[0] for entry in output_list]
                return output_list if labels == [1, 2, 3, 4, 5] else None
            return output_list

        if isinstance(first[0], numeric) and isinstance(first[1], str):
            # The model put the score first: restore [kp, score] order.
            return [[text, number] for number, text in output_list]
        return None

    if len(output_list) == 1:
        inner = output_list[0]
        if (len(inner) == 5
                and isinstance(inner[0], (list, tuple))
                and len(inner[0]) in (1, 2)):
            return inner
        return None

    return None


def generating(topic, argument, kps, max_retries=10):
    """Ask qwen-max to score one argument against the candidate key points.

    Args:
        topic: Debate topic, interpolated into the system prompt.
        argument: Argument text to be matched.
        kps: Candidate key points, already rendered as a string.
        max_retries: Maximum number of attempts; failed requests, unparsable
            replies and unusable shapes each consume one attempt.

    Returns:
        A list of five entries in one of the shapes described in
        ``_normalize_score_list``, or None when every attempt failed or the
        user interrupted the call. Callers must handle None.

    Each attempt uses a fresh random seed, so a retry can produce a different
    reply. The repaired reply text and its parsed length are printed for
    inspection.
    """
    # The prompt and the API key do not change between attempts.
    messages = [
        {
            'role': 'system',
            'content': f"""
                You need to calculate the confidence score of the given argument to each kp of the candidate kps based on the topic "{topic}". The confidence score between 0 and 1 means to what extent the argument matches the kp. 0 represents not match, 1 represents match. You need to return a list of 5 lists. Each inner list contain each kp in string and the corresponding confidence score in float. The result format is as follows: [[kp1, confidence_score], [kp2, confidence_score],...]
                """
        },
        {
            'role': 'user',
            'content': f"argument: {argument}, kps: {kps}"
        }
    ]
    dashscope.api_key = qwen_api_key

    attempt = 0
    while attempt < max_retries:
        try:
            response = dashscope.Generation.call(model="qwen-max",
                                                 messages=messages,
                                                 seed=random.randint(1, 10000),
                                                 result_format='message')
            if response.status_code != HTTPStatus.OK:
                print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
                    response.request_id, response.status_code,
                    response.code, response.message
                ))
                attempt += 1
                continue

            output = response['output']["choices"][0]["message"]["content"]
            corrected_output = _repair_model_output(output)
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            print(corrected_output)
            output_list = ast.literal_eval(corrected_output)
            print(len(output_list))
            if len(output_list) == 5:
                print(output_list)

            normalized = _normalize_score_list(output_list)
            if normalized is not None:
                return normalized
            # Parsed, but not in a usable shape: spend an attempt and retry.
            attempt += 1
        except KeyboardInterrupt:
            print("Interrupted by user.")
            break
        except Exception as e:
            # Covers parse failures and malformed structures alike.
            print(f"An error occurred: {e}. Retrying... ({attempt + 1}/{max_retries})")
            attempt += 1
    return None


In [63]:
# For each selected argument, score all candidate key points of its topic and
# stance in one model call, then store the best match and its score in df.
# NOTE: the row slice is hard-coded to a single row; widen it to process more.
for index, row in df.iloc[2007:2008].iterrows():
    topic = row['topic']
    stance = row['stance']
    argument = row['argument']
    filtered_df = prediction_df[(prediction_df['topic']==topic) & (prediction_df['stance']== stance)]
    candidate_kps = filtered_df['predict_kps'].values.tolist()
    print(str(index)+": ")
    if not candidate_kps:
        # No prediction row for this topic/stance, so there is nothing to score.
        print("No candidate kps found, skipping this row.")
        continue
    print(candidate_kps[0])
    # The column stores a list literal as text; literal_eval parses it without
    # executing arbitrary code the way eval() would.
    input_kps_list = ast.literal_eval(candidate_kps[0])
    result_list = generating(topic, argument,str(input_kps_list))
    if result_list is None:
        # generating() gave up (retries exhausted or interrupted).
        print("No usable model output, skipping this row.")
        continue
    if len(result_list[0])==1:
        # Scores only: positions line up with the candidate list.
        flattened_scores = [score[0] for score in result_list]
        max_score = max(flattened_scores)
        max_index = flattened_scores.index(max_score)
        confidence_score = max_score
        best_kp = input_kps_list[max_index]
    else:
        best_score_kp = max(result_list, key=lambda x: x[1])
        if isinstance(best_score_kp[0],str):
            best_kp = best_score_kp[0]
        elif isinstance(best_score_kp[0],int):
            # Integer labels are 1-based positions in the candidate list.
            best_kp = input_kps_list[best_score_kp[0]-1]
        else:
            print("Error")
            break
        confidence_score = best_score_kp[1]
    print("_______________________________")
    print("best kp: "+best_kp)
    print("score: "+str(confidence_score))
    df.at[index,'matched_kp(group)'] = best_kp
    df.at[index,'confidence_score(group)'] = confidence_score

2007: 
['An austerity regime unfairly impacts the poorest members of society.', 'An austerity regime can harm the economy by reducing demand and slowing growth.', 'Cutting social programs under an austerity regime disproportionately affects vulnerable populations.', 'Adopting an austerity regime may lead to increased unemployment.', 'An austerity regime is viewed as a potentially damaging and unjust approach to managing the economy.']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[["An austerity regime unfairly impacts the poorest members of society.", 0.0], ["An austerity regime can harm the economy by reducing demand and slowing growth.", 0.0], ["Cutting social programs under an austerity regime disproportionately affects vulnerable populations.", 0.0], ["Adopting an austerity regime may lead to increased unemployment.", 0.0], ["An austerity regime is viewed as a potentially damaging and unjust approach to managing the economy.", 0.0]]
5
[['An austerity regime unfairly impacts 

In [64]:
# Write the updated frame back to the CSV it was loaded from. This overwrites
# the file in place; index=False keeps the row index out of the output.
df.to_csv('./data/KPM_qwen_in_hybrid.csv', index=False)
print("Add new data successfully!!!!!")

Add new data successfully!!!!!


In [1]:
import pandas as pd
# Reload the saved table from disk so the following statistics reflect the
# file contents rather than in-memory state.
df = pd.read_csv('./data/KPM_qwen_in_hybrid.csv')
# List the column names as a quick sanity check of the reloaded file.
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [3]:
# Arithmetic mean of the 'confidence_score(1by1)' column over all rows.
# NOTE: a NaN anywhere in the column makes the result NaN, and an empty
# column raises ZeroDivisionError.
score_list = df['confidence_score(1by1)'].values.tolist()
print(sum(score_list)/len(score_list))

0.8476471400939597
