## Load data

In [2]:
import pandas as pd

df_llama2 = pd.read_csv("../data/20-doctrines/llama2_story_question_20.tsv", sep="\t")
df_gpt35 = pd.read_csv("../data/20-doctrines/gpt3.5_story_question_20.tsv", sep="\t")
df_gpt4 = pd.read_csv("../data/20-doctrines/gpt4_story_question_20.tsv", sep="\t")

df_llama2.story = df_llama2.story.apply(lambda x: x.strip())
df_gpt35.story = df_gpt35.story.apply(lambda x: x.strip())
df_gpt4.story = df_gpt4.story.apply(lambda x: x.strip())

df_llama2.intro_text = df_llama2.intro_text.apply(lambda x: x.strip())
df_gpt35.intro_text = df_gpt35.intro_text.apply(lambda x: x.strip())
df_gpt4.intro_text = df_gpt4.intro_text.apply(lambda x: x.strip())

print(df_llama2.shape, df_gpt35.shape, df_gpt4.shape)

(20, 7) (20, 7) (20, 7)


## Define complexity metrics

In [3]:
from readability import Readability
import spacy
import numpy as np 

nlp = spacy.load('en_core_web_sm') # Load the English Model

def get_average_sentence_length(texts, nlp_model):
    # Load the spaCy language model
    # nlp = spacy.load("en_core_web_sm")

    # Initialize a list to store sentence lengths
    sentence_lengths = []

    # Process each text in the list
    for text in texts:
        # Process the text using spaCy
        doc = nlp_model(text)

        # Calculate the length of each sentence
        for sentence in doc.sents:
            sentence_lengths.append(len(sentence))

    # Calculate the average sentence length
    if len(sentence_lengths) > 0:
        average_length = sum(sentence_lengths) / len(sentence_lengths)
        return average_length
    else:
        return 0

def get_story_length(texts, nlp_model):
    # Load the spaCy language model
    # nlp = spacy.load("en_core_web_sm")

    # Initialize a list to store sentence lengths
    story_lengths = []

    # Process each text in the list
    for text in texts:
        # Process the text using spaCy
        doc = nlp_model(text)

        story_length = 0

        # Calculate the length of each story
        for sentence in doc.sents:
            story_length += len(sentence)

        story_lengths.append(story_length)

    return np.mean(story_lengths), np.std(story_lengths)

## Story Length

In [4]:

print("definition")
mean, std = get_story_length(df_llama2.intro_text.tolist(), nlp)
print(round(mean, 1), round(std, 1))

print("LLaMA 2")
mean, std = get_story_length(df_llama2.story.tolist(), nlp)
print(round(mean, 1), round(std, 1))

print("GPT-3.5")
mean, std = get_story_length(df_gpt35.story.tolist(), nlp)
print(round(mean, 1), round(std, 1))

print("GPT-4")
mean, std = get_story_length(df_gpt4.story.tolist(), nlp)
print(round(mean, 1), round(std, 1))

definition
152.0 31.0
LLaMA 2
250.5 89.9
GPT-3.5
327.2 50.7
GPT-4
316.8 51.6


## Sentence Length

In [5]:

print("definition")
print(round(get_average_sentence_length(df_llama2.intro_text.tolist(), nlp), 2))

print("LLaMA 2")
print(round(get_average_sentence_length(df_llama2.story.tolist(), nlp), 2))

print("GPT-3.5")
print(round(get_average_sentence_length(df_gpt35.story.tolist(), nlp), 2))

print("GPT-4")
print(round(get_average_sentence_length(df_gpt4.story.tolist(), nlp), 2))


definition
30.11
LLaMA 2
23.41
GPT-3.5
20.26
GPT-4
19.61


## FK scores

In [7]:
print("definition")
r = Readability(df_llama2.intro_text.str.cat(sep=" "))
print(r.flesch_kincaid())

print("LLaMA 2")
r = Readability(df_llama2.story.str.cat(sep=" "))
print(r.flesch_kincaid())

print("GPT-3.5")
r = Readability(df_gpt35.story.str.cat(sep=" "))
print(r.flesch_kincaid())

print("GPT-4")
r = Readability(df_gpt4.story.str.cat(sep=" "))
print(r.flesch_kincaid())

definition
score: 14.790623485741804, grade_level: '15'
LLaMA 2
score: 11.354370983496992, grade_level: '11'
GPT-3.5
score: 8.612532191937632, grade_level: '9'
GPT-4
score: 8.23016379145307, grade_level: '8'


## Thing Explainer (TE, aka Top1K)

In [8]:
import requests

response = requests.get('https://splasho.com/upgoer5/phpspellcheck/dictionaries/1000.dicin')
top_1000 = response.text.split('\n')[:-1] # last one is TRUE?


In [17]:
def get_thing_explainer_oov(r, top_1000):
    # lemmatize
    lemmas = [t.lemma_ for t in r]
    
    # get occurances tokens out of top 1000
    return sum(x not in set(top_1000) for x in lemmas)/len(lemmas)


def get_te_scores(df_, nlp_model):
    df_['sent_tokens'] = [nlp_model(s) for s in df_.intro_text]
    df_['def_te_oov'] = [get_thing_explainer_oov(r, top_1000) for r in df_['sent_tokens']]

    df_['sent_tokens2'] = [nlp_model(s) for s in df_.story]
    df_['story_te_oov'] = [get_thing_explainer_oov(r, top_1000) for r in df_['sent_tokens2']]

    return df_

df_llama2 = get_te_scores(df_llama2, nlp)
df_gpt35 = get_te_scores(df_gpt35, nlp)
df_gpt4 = get_te_scores(df_gpt4, nlp)


print("definition")
print(1-round(df_llama2.def_te_oov.mean(), 2))

print("LLaMA 2")
print(1-round(df_llama2.story_te_oov.mean(), 2))

print("GPT-3.5")
print(1-round(df_gpt35.story_te_oov.mean(), 2))

print("GPT-4")
print(1-round(df_gpt4.story_te_oov.mean(), 2))


definition
0.51
LLaMA 2
0.61
GPT-3.5
0.64
GPT-4
0.63


## function words

In [10]:
from collections import Counter

funct_pos_tags = ['DET', 'ADP', 'PRON', 'CONJ', 'SCONJ', 'AUX', 'PART', 'INTJ']

def get_num_function_words(text, funct_pos_tags):
    counts = Counter([t.pos_ for t in text])
    return sum([counts[k] for k in funct_pos_tags])


def get_function_scores(df_, nlp_model, column_name=None):
    df_['sent_tokens'] = [nlp_model(s) for s in df_[column_name]]
    df_['word_count'] = [len(s) for s in df_['sent_tokens']]
    df_['function_words'] = [get_num_function_words(t, funct_pos_tags) for t in df_['sent_tokens']]
    df_['function_words_prop'] = df_['function_words']/df_['word_count']

    return df_

print("definition")
df_llama2 = get_function_scores(df_llama2, nlp, column_name="intro_text")
print(round(df_llama2.function_words_prop.mean(), 2))

print("LLaMA 2")
df_llama2 = get_function_scores(df_llama2, nlp, column_name="story")
print(round(df_llama2.function_words_prop.mean(), 2))

print("GPT-3.5")
df_gpt35 = get_function_scores(df_gpt35, nlp, column_name="story")
print(round(df_gpt35.function_words_prop.mean(), 2))

print("GPT-4")
df_gpt4 = get_function_scores(df_gpt4, nlp, column_name="story")
print(round(df_gpt4.function_words_prop.mean(), 2))


definition
0.37
LLaMA 2
0.4
GPT-3.5
0.42
GPT-4
0.41


## GPT-2 PPL

In [11]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import math
import torch

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
_ = model.eval()

# from: https://github.com/huggingface/transformers/issues/473
def score(sentence, tokenizer, model):
    # tokenize_input = tokenizer.tokenize(sentence, truncate)
    # tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    tensor_input = torch.tensor([tokenizer(sentence, truncation=True)['input_ids']])

    outputs=model(tensor_input, labels=tensor_input)
    return math.exp(outputs[0])

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<00:00, 1.60kB/s]
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [12]:
def get_gpt_ppl_scores(df_, column_name=None):
    df_['gpt_ppl_score'] = [score(s, tokenizer, model) for s in df_[column_name]]
    return df_

print("definition")
df_llama2 = get_gpt_ppl_scores(df_llama2, column_name="intro_text")
print(round(df_llama2.gpt_ppl_score.mean(), 2))

print("LLaMA 2")
df_llama2 = get_gpt_ppl_scores(df_llama2, column_name="story")
print(round(df_llama2.gpt_ppl_score.mean(), 2))

print("GPT-3.5")
df_gpt35 = get_gpt_ppl_scores(df_gpt35, column_name="story")
print(round(df_gpt35.gpt_ppl_score.mean(), 2))

print("GPT-4")
df_gpt4 = get_gpt_ppl_scores(df_gpt4, column_name="story")
print(round(df_gpt4.gpt_ppl_score.mean(), 2))

definition
65.82
LLaMA 2
30.51
GPT-3.5
25.2
GPT-4
24.96


## LVL (Legal Vocab List)

In [16]:
import json

with open("./resources/legalterms.txt") as f:
    uscourt_words = [w.lower().strip() for w in f.readlines()]

with open("./resources/avl.json") as f:
    jsonobj = json.load(f)
    avl_words = []
    for each in jsonobj:
        avl_words.extend(jsonobj[each].keys())
    avl_words = list(avl_words)

with open("./resources/raw.githubusercontent.com_digitallawyer_openlegaldictionary_master__data_bld.json") as f:
    jsonobj = json.load(f)
    legalese_words = []
    for each in jsonobj:
        legalese_words.append(each['title'].lower())
    legalese_words = list(legalese_words)


LAVL_set = set([t.strip('\n') for t in uscourt_words+legalese_words])

def get_lavl_occ(text, AVL_core_lemmas_set=LAVL_set):
    lemmas = [t.lemma_ for t in text]
    return sum(x in AVL_core_lemmas_set for x in lemmas)/len(lemmas)

def get_lavl_score(df_, column_name=None, nlp_model=None):
    df_['sent_tokens'] = [nlp_model(s) for s in df_[column_name]]
    df_['lavl_score'] = [get_lavl_occ(t) for t in df_['sent_tokens']]
    return df_


nlp = spacy.load('en_core_web_sm') # Load the English Model

print("definition")
df_llama2 = get_lavl_score(df_llama2, column_name="intro_text", nlp_model=nlp)
print(round(df_llama2.lavl_score.mean(), 2))

print("LLaMA 2")
df_llama2 = get_lavl_score(df_llama2, column_name="story", nlp_model=nlp)
print(round(df_llama2.lavl_score.mean(), 2))

print("GPT-3.5")
df_gpt35 = get_lavl_score(df_gpt35, column_name="story", nlp_model=nlp)
print(round(df_gpt35.lavl_score.mean(), 2))

print("GPT-4")
df_gpt4 = get_lavl_score(df_gpt4, column_name="story", nlp_model=nlp)
print(round(df_gpt4.lavl_score.mean(), 2))

definition
0.27
LLaMA 2
0.26
GPT-3.5
0.22
GPT-4
0.21
