### Import Data

In [24]:
import pandas as pd
# Read the KPM results CSV into `df` and show which columns it carries.
kpm_csv_path = './data/KPM_LLama2_in_hybrid.csv'
df = pd.read_csv(kpm_csv_path)
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [25]:
# Read the per-topic/stance predicted key points into `prediction_df`
# and show which columns it carries.
predictions_csv_path = './data/1by1+predictions_in_group.csv'
prediction_df = pd.read_csv(predictions_csv_path)
print(prediction_df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points',
       'predict_kps(avg_embedding)', 'predict_kps(best_embedding)',
       'predict_kps(PCA_embedding)'],
      dtype='object')


### Predict matched KPs in a group and select the best one ourselves

In [None]:
import yaml
# Pull the Llama 2 API token out of the project's YAML config instead of
# hard-coding it in the notebook.
with open("../conf/index.yaml") as config_file:
    credentials = yaml.safe_load(config_file)
environment_variables = credentials['environment_variables']
Llama2_api_token = environment_variables['LLAMA2_API_TOKEN']

In [26]:
import ast
import re
def generating(topic, argument, kps, max_retries=10):
    """Ask meta/llama-2-70b-chat to score `argument` against candidate key points.

    The model is prompted to answer with a list of lists of the form
    ``[[kp1, confidence_score], [kp2, confidence_score], ...]``. The streamed
    answer is echoed to stdout, then the first list-of-lists found in it is
    parsed with ``ast.literal_eval``. If nothing parseable is found, the
    request is sent again.

    Args:
        topic: Topic text interpolated into the system prompt.
        argument: Argument text; concatenated into the user prompt.
        kps: Candidate key points, already rendered as a string; concatenated
            into the user prompt.
        max_retries: How many additional requests to make after the first one
            when the answer cannot be parsed. The original implementation
            recursed without a limit, which meant unbounded API calls and an
            eventual RecursionError on persistently malformed answers.

    Returns:
        A list with one parsed inner list per ``[...]`` item found in the
        model's answer. Each inner list has at least two items.

    Raises:
        RuntimeError: If no parseable answer was obtained after
            ``max_retries`` + 1 requests.

    Notes:
        Reads the module-level ``Llama2_api_token`` for authentication.
        Inner items are located with the non-greedy pattern ``\\[.*?\\]``, so
        a key point whose text itself contains ``]`` will not parse and will
        trigger a retry.
    """
    import replicate

    # Keep the client under its own name so the imported module is not shadowed.
    client = replicate.Client(api_token=Llama2_api_token)

    # The text inside this literal (including its leading spaces) is sent to
    # the model verbatim, so it is left exactly as it was.
    system_prompt = f"""
        You need to calculate the confidence score of the given argument to each kp of the candidate kps based on the topic "{topic}". The confidence score between 0 and 1 means to what extend the argument matches to the kp. 0 represents not match, 1 represents match. You need to return a list of lists which contain each kp and the corresponding confidence score. The result format is as followed: [[kp1, confidence_score], [kp2, confidence_score],...]
        """
    # The request payload does not change between attempts, so build it once.
    model_input = {
        "debug": False,
        "top_p": 1,
        "prompt": "argument:"+argument+", kps:"+kps,
        "temperature": 0.5,
        "system_prompt": system_prompt,
        "max_new_tokens": 500,
        "min_new_tokens": -1,
        "prompt_template": "[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]",
        "repetition_penalty": 1.15
    }

    # Regular expression matching the list-of-lists structure: an outer
    # bracket pair wrapping one or more bracketed items.
    pattern = r'\[((?:\[.*?\],?\s*)+)\]'

    attempts = max(0, max_retries) + 1
    for _ in range(attempts):
        output = client.run("meta/llama-2-70b-chat", input=model_input)

        # The model streams its answer; iterate over the chunks, echoing each
        # one while accumulating the full text.
        output_sentence = ' '
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        for item in output:
            output_sentence+=item
            print(item, end="")

        try:
            # Use re.search to find the first match of the pattern.
            match = re.search(pattern, output_sentence)
            if match:
                # Content inside the outermost square brackets.
                inner_content = match.group(1)
                # Split the inner content into individual list elements.
                list_elements = re.findall(r'\[.*?\]', inner_content)
                list_of_lists = [ast.literal_eval(element) for element in list_elements]
                # Callers index entry[1] for the score, so an entry with fewer
                # than two items is treated like any other malformed answer.
                for entry in list_of_lists:
                    if len(entry) < 2:
                        raise ValueError(f"expected [kp, confidence_score], got {entry!r}")
                print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                print(list_of_lists)
                return list_of_lists
            print("No 2D list found in the given text.")
        except (SyntaxError, ValueError) as e:
            print(f"Error: {e}. Regenerating string...")

    raise RuntimeError(
        f"Could not parse a list of [kp, confidence_score] pairs after {attempts} attempt(s)."
    )

In [28]:
import ast  # local to this cell so it also runs when the cell defining generating() imported nothing else

# For each selected row of df: look up the candidate key points predicted for
# the row's topic/stance, ask the model to score the row's argument against
# them, and store the best-scoring key point and its score back into df.
# The iloc slice selects which rows are processed in this run.
for index, row in df.iloc[7182:7183].iterrows():
    topic = row['topic']
    stance = row['stance']
    argument = row['argument']
    filtered_df = prediction_df[(prediction_df['topic']==topic) & (prediction_df['stance']== stance)]
    candidate_kps = filtered_df['predict_kps(avg_embedding)'].values.tolist()
    print(str(index)+": ")
    if not candidate_kps:
        # No prediction row for this topic/stance: indexing [0] would raise
        # IndexError, so leave the row untouched and move on.
        print("No candidate kps found for this topic/stance; skipping.")
        continue
    print(candidate_kps[0])
    # The cell holds the textual form of a Python list of strings.
    # literal_eval parses it without executing arbitrary code from the CSV.
    input_kps_list = ast.literal_eval(candidate_kps[0])
    # Wrap each key point in double quotes before rendering the list as text
    # for the prompt.
    input_kps = ['\"' + sentence + '\"' for sentence in input_kps_list]
    result_list = generating(topic, argument,str(input_kps))
    # Each result entry is [kp, confidence_score]; keep the highest-scoring one.
    best_score_kp = max(result_list, key=lambda x: x[1])
    best_kp = best_score_kp[0]
    confidence_score = best_score_kp[1]
    print("_______________________________")
    # str() keeps the print from failing if the model returned a non-string label.
    print("best kp: "+str(best_kp))
    print("score: "+str(confidence_score))
    df.at[index,'matched_kp(1by1)'] = best_kp
    df.at[index,'confidence_score(1by1)'] = confidence_score

7182: 
["The USA's military involvement and vulnerability to natural disasters are potential drawbacks to consider when evaluating its livability.", 'The high cost of living and healthcare expenses in the USA can make it challenging for individuals to afford a decent standard of living.', 'The high crime rate in the USA raises concerns about safety and security, potentially making it a less desirable place to live.', 'The USA is a good country to live in despite its fast-paced and stressful lifestyle.', 'The USA offers numerous opportunities for personal and professional growth, making it a good country to live in.']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Here's a list of lists, where each inner list contains a kp and the corresponding confidence score, calculated based on the topic "The USA is a good country to live in" and the given argument:

[['"The USA's military involvement and vulnerability to natural disasters are potential drawbacks to consider when evaluating its livabili

In [29]:
# Write the updated frame back over the CSV it was loaded from
# (row index is not written).
kpm_output_path = './data/KPM_LLama2_in_hybrid.csv'
df.to_csv(kpm_output_path, index=False)
print("Add new data successfully!!!!!")


Add new data successfully!!!!!


### Check matched KP

In [18]:
import pandas as pd
# Reload the saved KPM results so the checks below run against what is on disk.
kpm_csv_path = './data/KPM_LLama2_in_hybrid.csv'
df = pd.read_csv(kpm_csv_path)
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [21]:
# Reload the predicted key points used to resolve candidate lists per topic/stance.
predictions_csv_path = './data/1by1+predictions_in_group.csv'
prediction_df = pd.read_csv(predictions_csv_path)
print(prediction_df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points',
       'predict_kps(avg_embedding)', 'predict_kps(best_embedding)',
       'predict_kps(PCA_embedding)'],
      dtype='object')


In [32]:
import re
import ast
# Find rows whose stored key point starts with a placeholder label such as
# "kp3" / "KP3" (digits 1-5) instead of the key point text, and print the
# candidate key points for that row's topic/stance so the label can be resolved.
pattern = r'[Kk][Pp]([1-5])'  # loop-invariant, so defined once
for index, row in df.iterrows():
    matched_kp = row['matched_kp(1by1)']
    # Empty CSV cells are read back as float NaN; re.match raises TypeError on
    # anything that is not a string, so such rows are skipped.
    if not isinstance(matched_kp, str):
        continue
    match = re.match(pattern, matched_kp)
    if match:
        kp_id = int(match.group(1))
        print(str(index)+": ")
        print(kp_id)
        topic = row['topic']
        stance = row['stance']
        filtered_df = prediction_df[(prediction_df['topic']==topic) & (prediction_df['stance']== stance)]
        candidate_kps = filtered_df['predict_kps(avg_embedding)'].values.tolist()
        if not candidate_kps:
            # Nothing predicted for this topic/stance; indexing [0] would fail.
            print("No candidate kps found for this topic/stance.")
            continue
        kps = ast.literal_eval(candidate_kps[0])
        print(kps)
        # The in-place replacement below is left disabled, as in the original cell.
        # print(kps[kp_id-1])
        # df.at[index,'matched_kp(1by1)'] = kps[kp_id-1]

1850: 
["The right to keep and bear arms is an absolute right for individuals.\n\nHowever, it's important to note that this right should be limited in certain circumstances, such as in the case of convicted felons or those with a history of violent behavior, in order to protect public safety. Additionally, stricter regulations on gun ownership and use can help prevent tragedies such as mass shootings while still allowing law-abiding citizens to defend themselves. Ultimately, finding a balance between individual rights and collective safety is crucial in the debate over gun control.", 'The right to keep and bear arms is unnecessary and increases the risk of gun violence, outweighing its potential benefits for personal protection.', 'The right to keep and bear arms is necessary for self-defense and personal protection.', 'The right to bear arms is an outdated concept that serves no purpose in modern society and only contributes to gun violence and senseless deaths.', 'The right to keep a

In [31]:
# Save the frame back to the same CSV (row index is not written).
kpm_output_path = './data/KPM_LLama2_in_hybrid.csv'
df.to_csv(kpm_output_path, index=False)
print("Add new data successfully!!!!!")

Add new data successfully!!!!!


In [1]:
import pandas as pd
# Load the saved KPM results for the summary statistic below.
kpm_csv_path = './data/KPM_LLama2_in_hybrid.csv'
df = pd.read_csv(kpm_csv_path)
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [4]:
# Arithmetic mean of the 1-by-1 confidence scores over all rows of df.
score = df['confidence_score(1by1)'].values.tolist()
mean_confidence = sum(score) / len(score)
print(mean_confidence)

0.8621193699916682
