In [None]:
# Install the SummaC package into the current runtime (shell command run via `!`).
!pip install summac

In [None]:
# Mount Google Drive at /content/drive; the notebook reads its prediction files
# from there and writes its metric files back to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk
# Download NLTK's 'punkt' tokenizer data up front.
# NOTE(review): SummaC appears to need it at runtime (presumably for sentence
# splitting) - confirm against the SummaC documentation.
nltk.download('punkt')

Test SummaC

In [None]:
import torch
from summac.model_summac import SummaCZS, SummaCConv

# Sanity check adapted from the SummaC GitHub example: https://github.com/tingofurro/summac
# Both models are built once here; `model_conv` is reused by the scoring cells below.

# Run on the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not crash on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_zs = SummaCZS(granularity="sentence", model_name="vitc", device=device)
model_conv = SummaCConv(models=["vitc"], bins='percentile', granularity="sentence", nli_labels="e", device=device, start_file="default", agg="mean")

# NOTE(review): "instrange" below looks like a typo for "in strange"; it is left
# untouched because editing the input text would change the scores.
document = """Scientists are studying Mars to learn about the Red Planet and find landing sites for future missions.
One possible site, known as Arcadia Planitia, is covered instrange sinuous features.
The shapes could be signs that the area is actually made of glaciers, which are large masses of slow-moving ice.
Arcadia Planitia is in Mars' northern lowlands."""

# Summary 1 ends with a claim ("ideal for future missions") that the document does not make.
summary1 = "There are strange shape patterns on Arcadia Planitia. The shapes could indicate the area might be made of glaciers. This makes Arcadia Planitia ideal for future missions."
score_zs1 = model_zs.score([document], [summary1])
score_conv1 = model_conv.score([document], [summary1])
# Reference output: [Summary 1] SummaCZS Score: 0.582; SummacConv score: 0.536
print("[Summary 1] SummaCZS Score: %.3f; SummacConv score: %.3f" % (score_zs1["scores"][0], score_conv1["scores"][0]))

# Summary 2 is summary 1 without that final sentence.
summary2 = "There are strange shape patterns on Arcadia Planitia. The shapes could indicate the area might be made of glaciers."
score_zs2 = model_zs.score([document], [summary2])
score_conv2 = model_conv.score([document], [summary2])
# Reference output: [Summary 2] SummaCZS Score: 0.877; SummacConv score: 0.709
print("[Summary 2] SummaCZS Score: %.3f; SummacConv score: %.3f" % (score_zs2["scores"][0], score_conv2["scores"][0]))

In [None]:
# Prefix of the predictions file to score: the next cell reads
# "<model_name>_predictions.csv", and the metric files are named after it too.
model_name = "mistral"

In [None]:
import pandas as pd

# Read the predictions CSV for the selected model from Drive.
predictions_path = f"/content/drive/MyDrive/Lang Gen Project/Results/{model_name}_predictions.csv"
df = pd.read_csv(predictions_path)

# 'Input' holds the source documents, 'Prediction' the generated summaries.
documents = df['Input'].tolist()
summaries = df['Prediction'].tolist()

# Guard: the two lists are paired element by element when scoring.
if len(summaries) != len(documents):
    raise ValueError("The lengths of the documents and summaries lists do not match.")


In [None]:
# Spot-check one input document (list index 31).
print(documents[31])

In [None]:
# Spot-check the predicted summary at the same list index (31).
print(summaries[31])

In [None]:
# Score a single (document, summary) pair with SummaCConv and print the raw result
# that `score` returns.
test_score = model_conv.score([documents[2]], [summaries[2]])
print(test_score)

In [None]:
# Score every (document, summary) pair with SummaCConv, then save the per-example
# scores as a CSV and the mean score as a text file on Drive.
total_score = 0
scores = []

for num, (document, summary) in enumerate(zip(documents, summaries), start=1):
    print(f"Calculate SummaC Conv score for summary number {num}")
    # Missing predictions (NaN) are scored as the placeholder text "Null".
    if pd.isna(summary):
        summary = "Null"
    score = model_conv.score([document], [summary])
    # One pair is scored per call, so its score is the first entry of 'scores'.
    example_score = score['scores'][0]
    scores.append({'Document': document, 'Summary': summary, 'Score': example_score})
    total_score += example_score

# Fail with a clear message instead of a ZeroDivisionError when nothing was scored.
if not scores:
    raise ValueError("No (document, summary) pairs were scored; cannot compute an average.")

results_df = pd.DataFrame(scores)
results_df.to_csv(f'/content/drive/MyDrive/Lang Gen Project/Metrics/{model_name}_summac_results.csv', index=False)
# Calculate and write average score
average_score = total_score / len(scores)
with open(f'/content/drive/MyDrive/Lang Gen Project/Metrics/{model_name}_average_summac.txt', 'w') as file:
    file.write(f"Average Score: {average_score}\n")

In [None]:
# Show the mean SummaCConv score computed by the scoring cell.
print(f"Average Score: {average_score}")

In [None]:
# Prefixes of the prediction files scored by the multi-file loop; each entry is
# combined with "_predictions.csv" to form a file name.
# NOTE(review): the "Mistal" spelling presumably matches the file names on Drive -
# confirm before correcting it.
model_names = [
    "llama_prompt_sum_llama_holdings",
    "legalBertLarge_sum_llama_holdings",
    "legalBertLarge_sum_Mistal_holdings",
    "legalBertLarge_sum_Mistal_holdings_without_hint",
    "legalBertLarge_sum_llama_holdings_without_hint",
]

In [None]:
import pandas as pd

# Same scoring procedure as the single-model cell, run once per entry of
# `model_names`: read that model's predictions file from Drive, score every pair
# with SummaCConv, and write the per-example scores and the mean score to Metrics/.
for model_name in model_names:
    # NOTE(review): the folder name ends with a space before the slash; kept as-is,
    # presumably matching the folder on Drive - confirm.
    df = pd.read_csv(f"/content/drive/MyDrive/Lang Gen Project/Formatted Holding Predictions for SummaC and Bleurt /{model_name}_predictions.csv")

    documents = df['Input'].tolist()
    summaries = df['Prediction'].tolist()

    if len(documents) != len(summaries):
        raise ValueError("The lengths of the documents and summaries lists do not match.")

    total_score = 0
    scores = []

    for num, (document, summary) in enumerate(zip(documents, summaries), start=1):
        print(f"Calculate SummaC Conv score for summary number {num}")
        # Missing predictions (NaN) are scored as the placeholder text "Null".
        if pd.isna(summary):
            summary = "Null"
        score = model_conv.score([document], [summary])
        # One pair is scored per call, so its score is the first entry of 'scores'.
        example_score = score['scores'][0]
        scores.append({'Document': document, 'Summary': summary, 'Score': example_score})
        total_score += example_score

    # Fail with a clear message instead of a ZeroDivisionError when nothing was scored.
    if not scores:
        raise ValueError(f"No (document, summary) pairs were scored for {model_name}; cannot compute an average.")

    results_df = pd.DataFrame(scores)
    results_df.to_csv(f'/content/drive/MyDrive/Lang Gen Project/Metrics/{model_name}_summac_results.csv', index=False)
    # Calculate and write average score
    average_score = total_score / len(scores)
    with open(f'/content/drive/MyDrive/Lang Gen Project/Metrics/{model_name}_average_summac.txt', 'w') as file:
        file.write(f"Average Score: {average_score}\n")

The cells below are a separate data-cleaning step, unrelated to the SummaC scoring above, that I used to assemble the sheet for the Likert scores.

In [None]:
# Path on Drive of the JSON Lines file that supplies the inputs and reference outputs.
file_path_test = '/content/drive/MyDrive/Lang Gen Project/qlora_data/cleaned_test_qlora.jsonl'

In [None]:
# Load the file as a DataFrame; lines=True parses one JSON record per line.
test_df = pd.read_json(file_path_test, lines=True)

In [None]:
# Models whose "<name>_predictions.csv" files are added to the Likert sheet; each
# name is also used as the column that holds that model's predictions.
pred_files = ["mistral", "llama2", "longformer", "T5"]

In [None]:
# Build the Likert rating sheet: one row per test example with its input, its
# reference output, each model's prediction, and a "<model>_likert" column per
# model that this cell leaves empty.
likert_columns = ["input", "reference", "mistral", "mistral_likert", "llama2", "llama2_likert", "longformer", "longformer_likert", "T5", "T5_likert"]
num_examples = 10  # number of leading test examples to include

# Take the leading rows in one step instead of concatenating row by row; this avoids
# concatenating onto an empty frame and does not fail when fewer rows are available.
likert_df = (
    test_df[["input", "output"]]
    .head(num_examples)
    .rename(columns={"output": "reference"})
    .reset_index(drop=True)
    .reindex(columns=likert_columns)  # adds the remaining columns, filled with NaN
    .astype(object)                   # keep the still-empty columns as generic object columns
)

for model in pred_files:
    model_df = pd.read_csv(f"/content/drive/MyDrive/Lang Gen Project/Results/{model}_predictions.csv")
    # Rows are paired by position: prediction i goes next to test example i.
    likert_df[model] = model_df["Prediction"].head(num_examples).reset_index(drop=True)



In [None]:
# Display the first assembled row as a quick visual check.
likert_df.iloc[0]

In [None]:
# Write the assembled sheet to Drive without the index column.
likert_output_path = '/content/drive/MyDrive/Lang Gen Project/Metrics/parenthetical_likert.csv'
likert_df.to_csv(likert_output_path, index=False)