In [80]:
import openai
import yaml
with open("conf/index.yaml") as f:
    credentials = yaml.safe_load(f)
openai.api_key = credentials['environment_variables']['OPENAI_API_KEY']
model_name = "gpt-4"
system_prompt = ""
user_query = ""

In [81]:
import pandas as pd

df = pd.read_csv('./data/ground_truth.csv')
# print(df.head(10))
random_samples = df.sample(100)
print(random_samples.head(10))
arguments = list(random_samples['argument'].values)
target_KPs = list(random_samples['key_point'].values)
print(arguments)
print(len(arguments))

                                                  topic  \
1551                We should adopt an austerity regime   
4535                     We should subsidize journalism   
2199           We should ban private military companies   
1431  We should abolish the right to keep and bear arms   
2262           We should ban private military companies   
1791                            We should adopt atheism   
2514              We should ban the use of child actors   
5064     Routine child vaccinations should be mandatory   
2227           We should ban private military companies   
4318                    We should prohibit flag burning   

                                               argument  \
1551  all services have to be paid for, and until th...   
4535  journalism acts as a check on public instituti...   
2199  private military companies are often run by fo...   
1431  the right to keep and bear arms has caused too...   
2262  private military companies tend to save the co...

In [82]:
import ast
predict_KPs = []
system_prompt = \
        """
        You need to do key point analysis on the query of user and
        generate 3 key point sentences, store them in a list and return this list. It should be like this:
        ["...","...","..."]
        """
for argument in arguments:
    rep = openai.ChatCompletion.create(
        model = model_name,
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": argument}
        ]
    )

    result = rep["choices"][0]["message"]["content"]
    print(len(predict_KPs))
    result = result.rstrip('.').rstrip()
    print(result)
    predict_KPs.append(ast.literal_eval(result))

0
["All services require payment, indicating a necessary allocation of funds", "A significant amount of money is being wasted on interest payments due to the existing deficit", "Once the debts are reduced, the funds spent on interest payments will be available for services."]
1
["Journalism serves as a monitor for public institutions", "It requires subsidies for its sustenance", "Private funds are inadequate to uphold journalism's mission."]
2
["Private military companies are predominantly managed by ex-military personnel.", "These companies provide services that enable the actual military to focus solely on military operations.", "The use of private military companies provides additional resources and expertise to support conventional military forces."]
3
["The right to keep and bear arms is linked to incidences of mass shootings.", "The frequency and severity of mass shootings necessitate the ban on keeping and bearing arms.", "The current policy of arms possession contributes to pub

In [83]:
for kps in predict_KPs:
    for kp in kps:
        print(kp)
    print("--------------------------------------")

All services require payment, indicating a necessary allocation of funds
A significant amount of money is being wasted on interest payments due to the existing deficit
Once the debts are reduced, the funds spent on interest payments will be available for services.
--------------------------------------
Journalism serves as a monitor for public institutions
It requires subsidies for its sustenance
Private funds are inadequate to uphold journalism's mission.
--------------------------------------
Private military companies are predominantly managed by ex-military personnel.
These companies provide services that enable the actual military to focus solely on military operations.
The use of private military companies provides additional resources and expertise to support conventional military forces.
--------------------------------------
The right to keep and bear arms is linked to incidences of mass shootings.
The frequency and severity of mass shootings necessitate the ban on keeping and

### Evaluation Candidates with Metrices

In [91]:
import sacrebleu
import nltk
from nltk.translate import meteor_score
from rouge import Rouge
from evaluate import load
bertscore = load("bertscore")

def get_best_sentence(reference_sentence, candidate_sentences):
    total_bleu_score = []
    total_bert_score = []
    total_length = []
    total_rouge = []
    total_meteor = []
    total_rank = []

    #BLEU SCORE Bert Score
    for candidate_sentence in candidate_sentences:
        # Calculate BLEU score
        bleu_score = sacrebleu.sentence_bleu(candidate_sentence, [reference_sentence])
        # print(candidate_sentence)
        # print(bleu_score.score)
        total_bleu_score.append((candidate_sentence,bleu_score.score))

        bert_score = bertscore.compute(predictions=[candidate_sentence], references=[reference_sentence], lang="en")['f1'][0]
        # print(bert_score)
        total_bert_score.append((candidate_sentence, bert_score))

        length = len(candidate_sentence)
        total_length.append((candidate_sentence, length))

        rouge = Rouge()
        scores = rouge.get_scores(candidate_sentence, reference_sentence)
        rouge_f1 = scores[0]["rouge-1"]["f"]
        total_rouge.append((candidate_sentence, rouge_f1))

        reference_tokens = nltk.word_tokenize(reference_sentence)
        candidate_tokens = nltk.word_tokenize(candidate_sentence)
        meteor = meteor_score.single_meteor_score(reference_tokens, candidate_tokens)
        total_meteor.append((candidate_sentence, meteor))


    # print(total_bleu_score)
    # print(total_bert_score)

    total_bleu_score.sort(key=lambda x: x[1], reverse=True)
    total_bert_score.sort(key=lambda x: x[1], reverse=True)
    total_length.sort(key=lambda x: x[1], reverse=True)
    total_rouge.sort(key=lambda x: x[1], reverse=True)
    total_meteor.sort(key=lambda x: x[1], reverse=True)

    print(total_bleu_score)
    print(total_bert_score)
    print(total_length)
    print(total_rouge)
    print(total_meteor)

    for candidate in candidate_sentences:
        index_of_bleu = next((index for index, item in enumerate(total_bleu_score) if item[0] == candidate), None)
        index_of_bert = next((index for index, item in enumerate(total_bert_score) if item[0] == candidate), None)
        index_of_length = next((index for index, item in enumerate(total_length) if item[0] == candidate), None)
        index_of_rouge = next((index for index, item in enumerate(total_rouge) if item[0] == candidate), None)
        index_of_meteor = next((index for index, item in enumerate(total_meteor) if item[0] == candidate), None)
        total_rank.append((candidate, index_of_bleu+index_of_bert+index_of_length+index_of_rouge+index_of_meteor))

    print(total_rank)
    total_rank.sort(key=lambda x: x[1])
    best_sentence = total_rank[0][0]
    print(best_sentence)
    index_of_bleu_bs = next((index for index, item in enumerate(total_bleu_score) if item[0] == best_sentence), None)
    bleu_score = total_bleu_score[index_of_bleu_bs][1]
    index_of_bert_bs = next((index for index, item in enumerate(total_bert_score) if item[0] == best_sentence), None)
    bert_score = total_bert_score[index_of_bert_bs][1]
    index_of_len_bs = next((index for index, item in enumerate(total_length) if item[0] == best_sentence), None)
    length_score = total_length[index_of_len_bs][1]
    index_of_rouge_bs = next((index for index, item in enumerate(total_rouge) if item[0] == best_sentence), None)
    rouge_score = total_rouge[index_of_rouge_bs][1]
    index_of_meteor_bs = next((index for index, item in enumerate(total_meteor) if item[0] == best_sentence), None)
    METEOR_score = total_meteor[index_of_meteor_bs][1]

    # print(bleu_score)
    # print(bert_score)


    return best_sentence, bleu_score, bert_score,length_score,rouge_score,METEOR_score

In [92]:
result = []
for i in range(len(arguments)):
    one_result = [arguments[i], target_KPs[i]]
    print("-------------------------------------")
    print(target_KPs[i])
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    best_sentence, bleu_score, bert_score, len_score, rouge_score, METEOR_score = get_best_sentence(target_KPs[i],predict_KPs[i])
    one_result.append(best_sentence)
    one_result.append(bleu_score)
    one_result.append(bert_score)
    one_result.append(len_score)
    one_result.append(rouge_score)
    one_result.append(METEOR_score)
    result.append(one_result)

-------------------------------------
Austerity is necessary to reduce deficits or pay debts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[('All services require payment, indicating a necessary allocation of funds', 3.7477767366779213), ('A significant amount of money is being wasted on interest payments due to the existing deficit', 2.908317710573757), ('Once the debts are reduced, the funds spent on interest payments will be available for services.', 2.1476912089159055)]
[('A significant amount of money is being wasted on interest payments due to the existing deficit', 0.8651386499404907), ('Once the debts are reduced, the funds spent on interest payments will be available for services.', 0.8622931241989136), ('All services require payment, indicating a necessary allocation of funds', 0.8520768284797668)]
[('Once the debts are reduced, the funds spent on interest payments will be available for services.', 96), ('A significant amount of money is being wasted on interest payments due to the e

In [93]:
print(result)

[['all services have to be paid for, and until the defecit has been reduced, too much money is being wasted on interst payments. once debts have been reduced, that money will be availbale for services.', 'Austerity is necessary to reduce deficits or pay debts', 'A significant amount of money is being wasted on interest payments due to the existing deficit', 2.908317710573757, 0.8651386499404907, 94, 0.15999999539200013, 0.15463917525773196], ['journalism acts as a check on public institutions and should be subsidized, as private funds are no longer enough to sustain their mission.', 'Journalism is important to information-spreading/accountability', 'Journalism serves as a monitor for public institutions', 5.522397783539471, 0.8845152854919434, 54, 0.1538461491124262, 0.09433962264150944], ['private military companies are often run by former military men and women and provide a service that allows the military to concentrate on military operations.', 'Private military companies can supp

### Export Result

In [95]:
f = open("./prompt/gpt-4+3Cans(advanced).txt",'w', encoding='UTF-8')
for batch_id, re in enumerate(result):
    print(batch_id)
    print("-------------------Processing-------------------")
    f.write(str(batch_id)+"\n")
    f.write("argument:\n")
    f.write(re[0]+"\n")
    f.write("target:\n")
    f.write(re[1]+"\n")
    f.write("predict:\n")
    f.write(re[2]+"\n")
    f.write("Matrices:\n")
    f.write("BLEU_SCORE: " + str(re[3]) + "\n")
    f.write("BERT_SCORE: " + str(re[4]) + "\n")
    f.write("LENGTH_SCORE: " + str(re[5]) + "\n")
    f.write("ROUGE_SCORE: " + str(re[6]) + "\n")
    f.write("METEOR_SCORE: " + str(re[7]) + "\n")
    f.write("\n")

print("-----------------Finished--------------------------")

0
-------------------Processing-------------------
1
-------------------Processing-------------------
2
-------------------Processing-------------------
3
-------------------Processing-------------------
4
-------------------Processing-------------------
5
-------------------Processing-------------------
6
-------------------Processing-------------------
7
-------------------Processing-------------------
8
-------------------Processing-------------------
9
-------------------Processing-------------------
10
-------------------Processing-------------------
11
-------------------Processing-------------------
12
-------------------Processing-------------------
13
-------------------Processing-------------------
14
-------------------Processing-------------------
15
-------------------Processing-------------------
16
-------------------Processing-------------------
17
-------------------Processing-------------------
18
-------------------Processing-------------------
19
-------------------

### Calculate the overall metrices

In [96]:
total_bleu_score = [row[3] for row in result]
bleu_score = sum(total_bleu_score) / len(total_bleu_score)
total_bert_score = [row[4] for row in result]
bert_score = sum(total_bert_score) / len(total_bert_score)
total_length_score = [row[5] for row in result]
length_score = sum(total_length_score) / len(total_length_score)
total_rouge_score = [row[6] for row in result]
rouge_score = sum(total_rouge_score) / len(total_rouge_score)
total_meteor_score = [row[7] for row in result]
meteor_score = sum(total_meteor_score) / len(total_meteor_score)
print("BLEU SCORE: ",bleu_score)
print("Bert SCORE: ",bert_score)
print("LENGTH SCORE: ",length_score)
print("ROUGE SCORE: ",rouge_score)
print("METEOR SCORE: ",meteor_score)

BLEU SCORE:  5.442233693853853
Bert SCORE:  0.890458682179451
LENGTH SCORE:  76.27
ROUGE SCORE:  0.2028410354463864
METEOR SCORE:  0.2246882546684238
