### Import Data

In [1]:
import pandas as pd
# Load the per-argument table. Its columns (listed in the output below) include
# 'matched_kp(1by1)' and 'confidence_score(1by1)', which the annotation loop
# later in this notebook fills in row by row.
df = pd.read_csv('./data/KPM_LLama2_in_group.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [2]:
# Load the predicted key points. The annotation loop later in this notebook
# selects rows by 'topic' and 'stance' and reads 'predict_kps(avg_embedding)'
# as the candidate key points for each argument.
prediction_df = pd.read_csv('./data/1by1+predictions_in_group.csv')
print(prediction_df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points',
       'predict_kps(avg_embedding)', 'predict_kps(best_embedding)',
       'predict_kps(PCA_embedding)'],
      dtype='object')


### Predict the matched KP and its confidence score directly with Llama 2

In [None]:
import yaml
# Read the API token from the project config file rather than hard-coding it.
# generating() passes the global Llama2_api_token to replicate.Client, so this
# cell must run before generating() is called.
with open("../conf/index.yaml") as f:
    # safe_load parses plain YAML only; it does not construct arbitrary Python objects.
    credentials = yaml.safe_load(f)
Llama2_api_token = credentials['environment_variables']['LLAMA2_API_TOKEN']

In [3]:
import ast
import re
# Patterns for the two fields in the model's answer. Matching is
# case-insensitive and accepts a space, an underscore, or nothing between the
# words, because the system prompt requests "best_kp: ... confidence_score:..."
# while the model frequently answers "Best KP: ... Confidence Score: ...".
# The key point stops at the end of its line or just before the confidence
# label, so a single-line answer does not drag the score into the key point.
_KP_PATTERN = re.compile(
    r'best[ _]?kp\s*:\s*(.*?)\s*(?=confidence[ _]?score\s*:|$)',
    re.IGNORECASE | re.MULTILINE,
)
_SCORE_PATTERN = re.compile(
    r'confidence[ _]?score\s*:\s*(\d+(?:\.\d+)?)',
    re.IGNORECASE,
)


def _query_llama2(topic, argument, kps):
    """Send one matching request to meta/llama-2-70b-chat and return the full reply text."""
    # Local import, as in the original cell, so the notebook only needs the
    # `replicate` package when this function is actually called.
    import replicate

    # Use a distinct name for the client so the module is not shadowed.
    client = replicate.Client(api_token=Llama2_api_token)
    system_prompt = f"""
        You need to find the best matched kp to the given argument from a list of candidate Kps based on the topic "{topic}",
        and then return this best kp and its confidence score between 0 and 1. 0 represents not match, 1 represents match.
        The result format is as followed: best_kp: .... confidence_score:a number
        """
    output = client.run(
        "meta/llama-2-70b-chat",
        input={
            "debug": False,
            "top_p": 1,
            # f-string formatting also accepts a non-string `kps` (e.g. a list).
            "prompt": f"argument:{argument}, kps:{kps}",
            "temperature": 0.5,
            "system_prompt": system_prompt,
            "max_new_tokens": 500,
            "min_new_tokens": -1,
            "prompt_template": "[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]",
            "repetition_penalty": 1.15
        }
    )

    # The result is iterated chunk by chunk; echo each chunk as it arrives and
    # keep the pieces so the complete reply can be parsed afterwards.
    chunks = []
    print("---------------------------------")
    for item in output:
        chunks.append(item)
        print(item, end="")
    print("---------------------------------")
    return "".join(chunks)


def _parse_kp_response(text):
    """Return (kp, confidence_score) parsed from the reply, or None if a field is missing."""
    kp_match = _KP_PATTERN.search(text)
    score_match = _SCORE_PATTERN.search(text)
    if not (kp_match and score_match):
        return None
    return kp_match.group(1).strip(), float(score_match.group(1))


def generating(topic, argument, kps, max_retries=10):
    """Ask Llama 2 for the candidate key point that best matches an argument.

    Args:
        topic: Debate topic, interpolated into the system prompt.
        argument: The argument text to match.
        kps: Candidate key points; formatted into the prompt as text.
        max_retries: Maximum number of model calls before giving up. The
            original implementation retried without limit via recursion.

    Returns:
        A tuple ``(kp, confidence_score)`` where ``kp`` is the key-point text
        exactly as the model wrote it (surrounding quotes included) and
        ``confidence_score`` is a float.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        RuntimeError: If no reply could be parsed within ``max_retries`` calls.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(1, max_retries + 1):
        output_sentence = _query_llama2(topic, argument, kps)
        parsed = _parse_kp_response(output_sentence)
        if parsed is not None:
            kp, confidence_score = parsed
            print("Key Point:", kp)
            print("Confidence Score:", confidence_score)
            return kp, confidence_score
        if attempt < max_retries:
            print("Error. Regenerating string...")

    # Bounded failure instead of unbounded recursion and unlimited API calls.
    raise RuntimeError(
        f"Could not parse a key point and confidence score after {max_retries} attempts"
    )


In [12]:
# Positional row at which annotation starts. Presumably the earlier rows were
# filled in by a previous run of this cell -- TODO confirm before changing.
START_ROW = 7000
CANDIDATE_COLUMN = 'predict_kps(avg_embedding)'

# Rows for which prediction_df has no matching (topic, stance) entry.
skipped_rows = []

for index, row in df.iloc[START_ROW:].iterrows():
    topic = row['topic']
    stance = row['stance']
    argument = row['argument']
    filtered_df = prediction_df[(prediction_df['topic']==topic) & (prediction_df['stance']== stance)]
    print(str(index)+": ")

    # Without this guard a missing (topic, stance) pair raised a bare
    # IndexError with no indication of which row was responsible.
    if filtered_df.empty:
        print(f"No candidate KPs for topic={topic!r}, stance={stance!r}; skipping row.")
        skipped_rows.append(index)
        continue

    # Only the first matching row is used, so read that single value instead
    # of converting the whole column to a list.
    candidate_kps = filtered_df[CANDIDATE_COLUMN].iloc[0]
    print(candidate_kps)
    kp, confidence_score = generating(topic, argument, candidate_kps)
    df.at[index,'matched_kp(1by1)'] = kp
    df.at[index,'confidence_score(1by1)'] = confidence_score

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} row(s) without candidate KPs: {skipped_rows}")

7000: 
['The government should not regulate social media platforms as it would infringe upon the freedom of speech and expression of individuals.', "The government should regulate social media platforms to ensure they are not misused for spreading false information and to protect users' privacy, despite being private companies.", 'Government regulation of social media platforms raises concerns about censorship and potential misuse of personal data.', 'The First Amendment protects social media, and government regulation would violate freedom of speech and expression.', "Government regulation of social media platforms could potentially infringe upon freedom of expression, as it may restrict individuals' ability to share their thoughts and ideas freely."]
---------------------------------
 Best KP: "The government should regulate social media platforms to ensure they are not misused for spreading false information and to protect users' privacy, despite being private companies."

Confidenc

In [13]:
# Persist the annotated table. NOTE: this writes to the same path that `df`
# was loaded from in the first cell, so the input file is overwritten.
df.to_csv('./data/KPM_LLama2_in_group.csv', index=False)
print("Add new data successfully!!!!!")

Add new data successfully!!!!!


In [1]:
# Reload the saved results from disk. The execution counter restarts at 1 here,
# so this cell was presumably run in a fresh kernel session -- hence the repeated import.
import pandas as pd
df = pd.read_csv('./data/KPM_LLama2_in_group.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [3]:
# Arithmetic mean of the per-argument confidence scores in the 1-by-1 setting,
# computed over plain Python floats.
score = df['confidence_score(1by1)'].tolist()
total_confidence = sum(score)
print(total_confidence / len(score))

0.7788546559823853
