In [20]:
import pandas as pd
from nltk.util import ngrams
from collections import Counter
from transformers import AutoTokenizer

In [21]:
# N-gram orders to evaluate: unigrams (1), bigrams (2) and trigrams (3).
n_values = list(range(1, 4))

In [22]:
# Load the pretrained 'gpt2' tokenizer through AutoTokenizer. It is used further
# down to split every scenario into tokens before the n-grams are built.
# The progress bars in this cell's output show the tokenizer files being downloaded.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [49]:
# Load the 'scenario' column of each Excel workbook as a plain Python list:
# the human-written scenarios first, then the GPT-4 scenarios.
training_dataset, generated_dataset = (
    pd.read_excel(workbook)['scenario'].tolist()
    for workbook in ('human_scenarios.xlsx', 'gpt4_scenarios.xlsx')
)

In [50]:
# Result containers, one per metric. Each is filled by the metric loop and
# maps an n-gram order n to the score obtained for that n.
overlap_scores, precision_scores, recall_scores, f1_scores = (
    dict(), dict(), dict(), dict()
)

In [51]:
# Calculate n-gram overlap, precision, recall and f1 score for each value of n.
#
# Reads : training_dataset, generated_dataset (scenario texts handed to
#         tokenizer.tokenize), tokenizer, n_values.
# Writes: overlap_scores, precision_scores, recall_scores, f1_scores, each
#         keyed by n.


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Tokenization does not depend on n, so run the tokenizer once per corpus
# instead of repeating it on every pass of the loop.
training_tokens = [tokenizer.tokenize(sent) for sent in training_dataset]
generated_tokens = [tokenizer.tokenize(sent) for sent in generated_dataset]

for n in n_values:
    # Count the n-grams of each corpus once; the generator feeds the Counter
    # directly so no intermediate flat list of every n-gram is materialized.
    training_counts = Counter(
        ng for tokens in training_tokens for ng in ngrams(tokens, n)
    )
    generated_counts = Counter(
        ng for tokens in generated_tokens for ng in ngrams(tokens, n)
    )

    # Multiset intersection: each shared n-gram contributes the smaller of its
    # two occurrence counts.
    overlap_count = sum((training_counts & generated_counts).values())

    # Total number of n-grams (with repetitions) in each corpus
    generated_count = sum(generated_counts.values())
    training_count = sum(training_counts.values())

    # Precision is measured against the generated corpus, recall against the
    # training corpus. _safe_ratio yields 0.0 instead of raising
    # ZeroDivisionError when a corpus has no n-grams of this order or when
    # nothing overlaps (precision + recall == 0).
    precision_score = _safe_ratio(overlap_count, generated_count)
    recall_score = _safe_ratio(overlap_count, training_count)
    f1_score = _safe_ratio(
        2 * (precision_score * recall_score),
        precision_score + recall_score,
    )

    # Store the scores for this value of n. The overlap score is defined as
    # overlap_count / generated_count, i.e. it is the same ratio as precision.
    overlap_scores[n] = precision_score
    precision_scores[n] = precision_score
    recall_scores[n] = recall_score
    f1_scores[n] = f1_score

In [52]:
# Report every metric dictionary, one per line, in a single print call.
print(
    f'Overlap scores: {overlap_scores}',
    f'Precision scores: {precision_scores}',
    f'Recall scores: {recall_scores}',
    f'F1 scores: {f1_scores}',
    sep='\n',
)

Overlap scores: {1: 0.5785109838447731, 2: 0.17663190073727747, 3: 0.03622569938359412}
Precision scores: {1: 0.5785109838447731, 2: 0.17663190073727747, 3: 0.03622569938359412}
Recall scores: {1: 0.4236487120903884, 2: 0.12752767048589697, 3: 0.025752519634610848}
F1 scores: {1: 0.48911452781441395, 2: 0.14811603490848774, 3: 0.030104222077742975}


Results for the fine-tuned GPT-4 scenarios (Gpt4_finetuned_scenarios), for comparison with the gpt4_scenarios results above:

Overlap scores: {1: 0.4461530730187447, 2: 0.2274394338550545, 3: 0.10353897656623626}
Precision scores: {1: 0.4461530730187447, 2: 0.2274394338550545, 3: 0.10353897656623626}
Recall scores: {1: 0.7588682793401146, 2: 0.395882035605107, 3: 0.18477951635846374}
F1 scores: {1: 0.5619342996392177, 2: 0.2889012827663135, 3: 0.13271352676747036}

