### Import Data

In [27]:
import pandas as pd
# Per-argument matching table; the '(1by1)' columns are the ones filled in further down.
df = pd.read_csv(filepath_or_buffer='./data/KPM_LLama2_in_1by1.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [28]:
# One row per (topic, stance) group holding the candidate key points as stringified lists.
prediction_df = pd.read_csv(filepath_or_buffer='./data/llama2_1by1+predictions_in_group.csv')
print(prediction_df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points',
       'predict_kps(avg_embedding)', 'predict_kps(best_embedding)',
       'predict_kps(PCA_embedding)'],
      dtype='object')


### Score each candidate key point one by one and keep the best match

In [None]:
import yaml
# The API token lives in a YAML config outside the notebook so it is never
# hard-coded in a cell; it is later handed to the Replicate client.
with open("../conf/index.yaml") as config_file:
    credentials = yaml.safe_load(config_file)
Llama2_api_token = credentials['environment_variables']['LLAMA2_API_TOKEN']

In [29]:
import ast
import re
def _parse_confidence_score(text):
    """Extract the first ``Confidence Score: <number>`` value from ``text``.

    Integers ("1"), decimals ("0.85") and bare fractions (".5") are accepted.

    Returns:
        The score as a float, or None when no score is present or the number
        falls outside the [0, 1] range requested in the system prompt.
    """
    match = re.search(r'Confidence Score:\s*(\d+(?:\.\d+)?|\.\d+)', text)
    if match is None:
        return None
    score = float(match.group(1))
    return score if 0.0 <= score <= 1.0 else None


def generating(topic, argument, kp, max_retries=5):
    """Ask Llama-2 (via Replicate) how well ``kp`` matches ``argument`` on ``topic``.

    The model is prompted to answer in the form ``Confidence Score: number``.
    If the reply cannot be parsed, the request is re-sent, up to
    ``max_retries`` attempts in total.

    Args:
        topic: Debate topic used to frame the comparison in the system prompt.
        argument: The argument sentence.
        kp: The candidate key point sentence.
        max_retries: Maximum number of requests sent before giving up.

    Returns:
        The parsed confidence score as a float in [0, 1].

    Raises:
        RuntimeError: If no valid score was obtained within ``max_retries``
            attempts (previously this case recursed without bound).
    """
    # Imported lazily so the notebook can be loaded without the package installed.
    import replicate

    # Separate name for the client: rebinding ``replicate`` would shadow the module.
    client = replicate.Client(api_token=Llama2_api_token)
    model_input = {
        "debug": False,
        "top_p": 1,
        "prompt": "argument:"+argument+", kp:"+kp,
        "temperature": 0.5,
        "system_prompt": f"""
            You need to evaluate to what extend the 2 sentences matches to each other based on the topic "{topic}" and return a confidence score between 0 and 1. 0 represents totally not match, 1 represents very match. Your return should be in this format:Confidence Score: number
            """,
        "max_new_tokens": 500,
        "min_new_tokens": -1,
        "prompt_template": "[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]",
        "repetition_penalty": 1.15
    }

    for _attempt in range(max_retries):
        try:
            output = client.run("meta/llama-2-70b-chat", input=model_input)
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            # The model streams its answer; ``output`` is an iterator of text
            # chunks that are concatenated into the full reply.
            output_sentence = ' ' + ''.join(output)
            print(output_sentence)
            score = _parse_confidence_score(output_sentence)
        except (SyntaxError, ValueError) as e:
            print(f"Error: {e}. Regenerating string...")
            continue

        if score is not None:
            print("KP score: "+str(score))
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            return score
        print(f"Error. Regenerating string...")

    raise RuntimeError(
        f"No valid confidence score obtained after {max_retries} attempt(s)"
    )


In [30]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# loaded once per kernel instead of once per similarity call.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def _embed_sentence(sentence, tokenizer, model):
    """Return one vector for ``sentence``: the mean of BERT's last hidden states."""
    # truncation=True keeps over-long inputs within the model's maximum length
    # instead of failing inside the forward pass.
    tokens = tokenizer(sentence, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    # Average over the token dimension, then drop the leading batch axis.
    return np.squeeze(outputs.last_hidden_state.mean(dim=1).numpy())


def calculate_bert_similarity(sentence1, sentence2):
    """Cosine similarity between mean-pooled BERT embeddings of two sentences.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The cosine similarity of the two sentence embeddings, as returned by
        ``sklearn.metrics.pairwise.cosine_similarity``.
    """
    tokenizer, model = _load_bert()

    embedding1 = _embed_sentence(sentence1, tokenizer, model)
    embedding2 = _embed_sentence(sentence2, tokenizer, model)

    # cosine_similarity expects 2-D inputs, hence the single-row wrapping.
    similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]

    return similarity_score

In [31]:
import ast
# Score every candidate key point for each selected argument and keep the best one.
# The row slice is chosen by hand so the run can proceed in small batches.
for index, row in df.iloc[4170:4171].iterrows():
    topic = row['topic']
    stance = row['stance']
    argument = row['argument']

    # Candidates come from the prediction row sharing this argument's topic and stance.
    same_group = (prediction_df['topic'] == topic) & (prediction_df['stance'] == stance)
    kp_cells = prediction_df.loc[same_group, 'predict_kps(avg_embedding)'].tolist()
    if not kp_cells:
        # Fail with context instead of an opaque "list index out of range".
        raise ValueError(
            f"No predicted key points for topic={topic!r}, stance={stance!r} (row {index})"
        )
    # The cell holds a stringified Python list; parse it back into a list.
    candidate_kps = ast.literal_eval(kp_cells[0])
    print(index)

    best_kp = ''
    confidence_score = 0.0
    have_best = False   # becomes True once a real candidate has been accepted
    best_cos = None     # BERT similarity of the current best, computed on first tie
    for kp in candidate_kps:
        print(kp)
        new_score = generating(topic, argument, kp)
        if not have_best or new_score > confidence_score:
            # The first candidate, or a strictly better score, always wins.
            best_kp = kp
            confidence_score = new_score
            best_cos = None
            have_best = True
        elif new_score == confidence_score:
            # Tie on the LLM score: prefer the key point closer to the argument
            # in BERT embedding space. The incumbent's similarity is reused
            # across ties rather than recomputed.
            if best_cos is None:
                best_cos = calculate_bert_similarity(argument, best_kp)
            new_cos = calculate_bert_similarity(argument, kp)
            if new_cos > best_cos:
                best_kp = kp
                best_cos = new_cos
    print("Best KP: "+best_kp)
    print("Confidence Score: "+ str(confidence_score))
    df.at[index,'matched_kp(1by1)'] = best_kp
    df.at[index,'confidence_score(1by1)'] = confidence_score

4170
Urbanization is responsible for the destruction of natural habitats and the decline of animal populations, making it essential to combat urbanization to protect the environment and preserve biodiversity.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  Confidence Score: 0.8

The two sentences share a similar topic and convey a similar message, which is the negative impact of urbanization on the environment and the importance of combating it to protect nature and preserve biodiversity. The keywords "destruction of nature", "ecosystem", "natural habitats", "animal populations", and "combat urbanization" are all related to the topic and suggest a strong match between the two sentences. Therefore, the confidence score is 0.8, indicating a high degree of similarity between the two sentences.
KP score: 0.8
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Urbanization can lead to the displacement and disintegration of local communities, threatening their cultural identity and sense of belonging.
~~~

In [None]:
# Write the updated table back to the CSV it was loaded from, without the index column.
df.to_csv(path_or_buf='./data/KPM_LLama2_in_1by1.csv', index=False)
print("Add new data successfully!!!!!")

In [1]:
import pandas as pd
# Reload the saved matching table so the summary below reads what is on disk.
df = pd.read_csv(filepath_or_buffer='./data/KPM_LLama2_in_1by1.csv')
print(df.columns)

Index(['topic', 'stance', 'argument', 'matched_kp(group)',
       'confidence_score(group)', 'matched_kp(1by1)',
       'confidence_score(1by1)'],
      dtype='object')


In [2]:
# Arithmetic mean of the one-by-one confidence scores over all rows.
score = df['confidence_score(1by1)'].tolist()
print(sum(score) / len(score))


0.742794970986513
