### Closed-source LLMs

In [1]:
import pandas as pd

# Load the spreadsheet of sentence-level responses and display its row count
df = pd.read_excel('results/closedsource_sent_responses.xlsx')
len(df)

2717

In [2]:
# Load the retained IDs, one per line, from dataset/google_filtered_ids.txt
with open('dataset/google_filtered_ids.txt', 'r') as file:
    id_list = file.read().splitlines()

# Keep only the rows whose ID appears in that list, then display how many remain
id_mask = df['ID'].isin(id_list)
df_filtered = df.loc[id_mask]
len(df_filtered)

1838

In [3]:
# Category labels to report on; each must equal a value of df_filtered['category'] exactly.
# NOTE(review): the recorded output shows all zeros for 'business & finance', while the
# JSON-based section of this notebook keys the same category as 'business and finance'.
# Confirm the spelling used in the 'category' column; the warning printed after the table
# fires when a label matches no rows at all.
categories = ['business & finance', 'education', 'food & drink', 'movies',
              'music and audio', 'news and politics', 'style & fashion',
              'television', 'video gaming']

# Models to analyze; scores are read from the column named '<model>_rouge_l'
models = ['gpt-4o-mini', 'gemini-1.0-pro',
          'claude-3-haiku-20240307', 'command-r']

# ROUGE-L cut-offs; a row is counted for a threshold when its score is >= that threshold
thresholds = [0.83, 0.85, 0.9, 0.91, 0.95, 1.0]

# Cut-off for the unique-sentence tally printed after the table
UNIQUE_SENTENCE_THRESHOLD = 0.85

# Table geometry. A model's header must span exactly as many characters as its group of
# threshold cells, otherwise model names drift away from the columns they describe.
LABEL_WIDTH = 25
CELL_WIDTH = 6
group_width = CELL_WIDTH * len(thresholds)
rule_width = LABEL_WIDTH + group_width * len(models)

# Title and top rule
print("\nROUGE-L Score Analysis by Category:")
print("=" * rule_width)

# Model names, each centered over its own threshold group
header = f"{'Category':<{LABEL_WIDTH}}"
for model in models:
    header += f"{model:^{group_width}}"
print(header)

# The same run of threshold labels repeated once per model
threshold_cells = "".join(f"{threshold:^{CELL_WIDTH}}" for threshold in thresholds)
threshold_header = " " * LABEL_WIDTH + threshold_cells * len(models)
print(threshold_header)

print("-" * rule_width)

# Running totals per model and threshold, filled in while the category rows are printed
overall_results = {model: {'counts': {threshold: 0 for threshold in thresholds}} for model in models}

# Distinct context sentences scoring >= UNIQUE_SENTENCE_THRESHOLD for at least one model
sentences_above_threshold = set()

# Labels that matched no rows at all (usually a spelling mismatch with the data)
empty_categories = []

for category in categories:
    category_df = df_filtered[df_filtered['category'] == category]
    if category_df.empty:
        empty_categories.append(category)
    line = f"{category:<{LABEL_WIDTH}}"

    for model in models:
        rouge_scores = category_df[f'{model}_rouge_l']

        for threshold in thresholds:
            # Vectorized count; missing scores compare as False and are not counted
            count = int((rouge_scores >= threshold).sum())
            overall_results[model]['counts'][threshold] += count
            line += f"{count:^{CELL_WIDTH}}"

        # Remember which sentences cleared the unique-sentence cut-off for this model
        above_cutoff = rouge_scores >= UNIQUE_SENTENCE_THRESHOLD
        sentences_above_threshold.update(category_df.loc[above_cutoff, 'context_sentence'])
    print(line)

# Overall row: per-model totals across all listed categories
print("-" * rule_width)
line = "Overall".ljust(LABEL_WIDTH)
for model in models:
    total_counts = overall_results[model]['counts']
    for threshold in thresholds:
        line += f"{total_counts[threshold]:^{CELL_WIDTH}}"
print(line)
print("=" * rule_width)

# Surface labels that selected nothing, so an all-zero row is not mistaken for a real result
if empty_categories:
    print(f"\nWarning: no rows in df_filtered for categories: {empty_categories}")

# Total number of distinct sentences that cleared the cut-off for any model
print(f"\nTotal unique sentences with score >= {UNIQUE_SENTENCE_THRESHOLD} across all models: {len(sentences_above_threshold)}")


ROUGE-L Score Analysis by Category:
Category                          gpt-4o-mini                  gemini-1.0-pro           claude-3-haiku-20240307              command-r           
                          0.83  0.85  0.9   0.91  0.95  1.0   0.83  0.85  0.9   0.91  0.95  1.0   0.83  0.85  0.9   0.91  0.95  1.0   0.83  0.85  0.9   0.91  0.95  1.0  
------------------------------------------------------------------------------------------------------------------------------------------------------
business & finance         0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0   
education                  15    7     2     2     0     0     23    17    7     5     2     0     24    15    7     7     2     0     8     5     0     0     0     0   
food & drink               5     4     3     2     0     0     6     4     2     2     0     0     6     5     4     2     0     0     3     2     0     0  

In [4]:
# Inspect the category labels in the unfiltered frame
df.loc[:, 'category']

0       news and politics
1       news and politics
2       news and politics
3       news and politics
4       news and politics
              ...        
2712      style & fashion
2713      style & fashion
2714      style & fashion
2715      style & fashion
2716      style & fashion
Name: category, Length: 2717, dtype: object

### Open-source LLMs

In [5]:
from sklearn.metrics import accuracy_score, recall_score, precision_score
import json
import os

def readjson(filepath):
    """Open ``filepath`` as UTF-8 text and return the object decoded by ``json.load``."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def readFile(filePath):
    """Return the lines of the UTF-8 text file at ``filePath`` as a list, line endings kept."""
    with open(filePath, 'r', encoding='utf-8') as handle:
        return list(handle)

def getWebsiteCategoryResult(filepath, threshold):
    """Count, per category, the records whose similarity is at least ``threshold``.

    The JSON at ``filepath`` is loaded with ``readjson`` and walked as a mapping
    of mappings; every innermost record is read for its ``'category'`` and
    ``'similarity'`` entries.

    Returns:
        dict: one counter per known category label (all starting at 0) plus a
        ``'total'`` entry holding the sum of those counters.

    Raises:
        KeyError: if a record at or above the threshold carries a category
        outside the known labels, or a record lacks one of the two entries.
    """
    tally = dict.fromkeys(
        (
            'business and finance',
            'education',
            'food & drink',
            'movies',
            'music and audio',
            'news and politics',
            'style & fashion',
            'television',
            'video gaming',
        ),
        0,
    )

    for site_records in readjson(filepath).values():
        for record in site_records.values():
            # Read the label first so a record without one fails regardless of its score
            label = record['category']
            if record['similarity'] >= threshold:
                tally[label] += 1

    # Sum before inserting 'total' so the total never counts itself
    tally['total'] = sum(tally.values())
    return tally

def _formatTable8Row(label, counts):
    """Lay out one Table 8 line from a row label and six integer counts.

    The label is left-justified in 24 characters and each count is centered in
    a 6-character cell placed under its ``>=0.83`` / ``>=0.90`` heading, so the
    columns stay aligned whatever the number of digits. Single-digit counts
    land in the same columns as the previous hand-spaced format strings.
    Trailing padding is stripped.
    """
    gpt83, gpt09, llama83, llama09, gemma83, gemma09 = counts
    row = (
        f"{label:<24}"
        f"{gpt83:^6}  {gpt09:^6}   "
        f"{llama83:^6}  {llama09:^6}  "
        f"{gemma83:^6}  {gemma09:^6}"
    )
    return row.rstrip()


def forTable8():
    """Print Table 8: per-category counts at ROUGE-L thresholds 0.83 and 0.90.

    Counts come from ``getWebsiteCategoryResult``, called once per
    (result file, threshold) pair for the three result files under
    ``./results``. Nothing is returned; the table is written to stdout.
    """
    result_files = (
        './results/result_gpt.json',
        './results/result_llama.json',
        './results/result_gemma.json',
    )
    # One count dict per table column, in display order: (>=0.83, >=0.90) for each file
    columns = [
        getWebsiteCategoryResult(path, threshold)
        for path in result_files
        for threshold in (0.83, 0.9)
    ]

    # (label shown in the table, key used in the count dicts)
    category_rows = (
        ('Business & Finance', 'business and finance'),
        ('Education', 'education'),
        ('Food & Drink', 'food & drink'),
        ('Movies', 'movies'),
        ('Music and Audio', 'music and audio'),
        ('News and Politics', 'news and politics'),
        ('Style & Fashion', 'style & fashion'),
        ('Television', 'television'),
        ('Video Gaming', 'video gaming'),
    )
    rule = '------------------------------------------------------------------------'

    print('TABLE 8: Comparison of ROUGE-L similarity scores across LLMs in disallowed categories')
    print(rule)
    print('                        GPT-2-XL(1.5B)   Llama-3.1-8B    Gemma-2-9B')
    print('Category                --------------   --------------  ---------------')
    print('                        >=0.83  >=0.90   >=0.83  >=0.90  >=0.83  >=0.90')
    print(rule)
    for label, key in category_rows:
        print(_formatTable8Row(label, [column[key] for column in columns]))
    print(rule)
    print(_formatTable8Row('Overall', [column['total'] for column in columns]))
    print(rule)

# Print Table 8 to the cell output
forTable8()


TABLE 8: Comparison of ROUGE-L similarity scores across LLMs in disallowed categories
------------------------------------------------------------------------
                        GPT-2-XL(1.5B)   Llama-3.1-8B    Gemma-2-9B
Category                --------------   --------------  ---------------
                        >=0.83  >=0.90   >=0.83  >=0.90  >=0.83  >=0.90
------------------------------------------------------------------------
Business & Finance        0       0        3       1       2       2
Education                 6       4        4       1       5       1
Food & Drink              0       0        2       1       7       4
Movies                    0       0        3       0       6       1
Music and Audio           0       0        0       0       0       0
News and Politics         4       0        4       0       8       4
Style & Fashion           0       0        4       0       5       1
Television                1       0        3       0       6       3
Vid