# Setup

In [None]:
from dotenv import load_dotenv


# Load environment variables from a local .env file; the OpenAI API key is
# read from the environment later via os.getenv("OPENAI_API_KEY").
ENV_FILE_PATH = ".env"
# override=True: values from the .env file take precedence over variables
# that are already set in the process environment.
load_dotenv(ENV_FILE_PATH, override=True)

In [None]:
import json
import numpy as np
from typing import Dict
import os
import zlib
import plotly.express as px
import pandas as pd
import logging

import utils.functions as functions
from utils.openai_handler import OpenAIHandler

In [None]:
# Hand the OPENAI_API_KEY environment variable (expected to have been loaded
# from the .env file above) to the OpenAI wrapper. The lookup yields None when
# the variable is missing; no validation happens here.
OpenAIHandler.set_api_key(os.environ.get("OPENAI_API_KEY"))

# Configure the root logger to emit every record from DEBUG level upwards.
logging.basicConfig(level=logging.DEBUG)

In [None]:
# Read every .txt file in LITERARY_TEXT_DIR into a {file name: file contents}
# mapping; entries with any other extension are skipped with a warning.
LITERARY_TEXT_DIR = 'text_data/literary'
file_name_to_original_text: Dict[str, str] = {}
# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the
# mapping (and every dict derived from it) in the same order on each run.
for f_name in sorted(os.listdir(LITERARY_TEXT_DIR)):
    if not f_name.endswith('.txt'):
        logging.warning(f'File {f_name} is not a text file. Skipping.')
        continue
    # Explicit encoding: without it open() falls back to the platform default,
    # which can mis-decode non-ASCII punctuation (curly quotes, em dashes).
    # NOTE(review): assumes the text files are stored as UTF-8 - confirm.
    with open(os.path.join(LITERARY_TEXT_DIR, f_name), 'r', encoding='utf-8') as text_file:
        file_name_to_original_text[f_name] = text_file.read()

# Prompting for Lossless Compression 

In [None]:
# Chat model name passed to OpenAIHandler.get_chat_completion in the GPT-3.5 cells below.
GPT_35_MODEL = 'gpt-3.5-turbo'

**Compression prompt**
> Please compress the following text into a latent representation that a different GPT4 model can decompress into the original text. The compression model should be must be lossless, meaning that a different GPT4 model should be able to perfectly reconstruct the original text from the compressed representation, without any additional context or information. 

**Decompression prompt**
> A different GPT4 model was given the following prompt:
"Please compress the following text into a latent representation that a different GPT4 model can decompress into the original text. The compression model should be must be lossless, meaning that a different GPT4 model should be able to perfectly reconstruct the original text from the compressed representation, without any additional context or information."

> Please decompress the following text into the original text.

### GPT4

In [None]:
# Lossless-compressed text per source file name, hard-coded here rather than
# produced by an API call in this notebook.
# Presumably pasted from GPT-4 responses to the compression prompt documented
# above - TODO confirm provenance.
gpt4_file_name_to_lossless_compressed_text: Dict[str, str] = {
    'a_good_man_is_hard_to_find.txt': """GRNDMTH didn't wnt 2 go FL. She wntd 2 visit her cnxn east TN & seizing 2 change Bailey's mind. Bailey = son she lived w/, her only boy. He sat on edge of chair @ table, bent over orange sports sec of Journal. "Nw look here, Bailey," she said, "see here, read this," & she stood w/ 1 hand on thin hip & other rattling newspapr @ bald head. "Here dis fellow calls himself Misfit = aloose frm Federal Pen & headed twrd FL & u read here what it says he did 2 these ppl. Jst u read it. I wdn't take my childrn in any dir w/ a criminal like tht aloose in it. I cldn't answer 2 my conscience if I did." Bailey didn't look up frm reading so she wheeled arnd & faced childrn's mother, young woman in slacks, face = broad & innocent as cabbage & tied arnd w/ green head-kerchief w/ 2 points on top like rabbit's ears. She sat on sofa, feeding baby apricots frm jar. "Childrn have been 2 FL b4," old lady said. "U all ought 2 take them smwhr else 4 a change so they see dif parts of world & be broad. They nvr have been 2 east TN.""",
    'break_it_down.txt': "Tho of course that wasn’t all tht went on, cuz we were 2gthr almst all day long. She kept lookin @ me & evry time she looked @ me it was worth smthng, & she smiled @ me & didn’t stop talkin & singin, smthng I said, she'd sail into it, a snatch, 4 me, she'd b gone frm me a lil ways bt smilin 2, & tell me jokes, & I loved it bt didn’t xactly knw wht 2 do abt it & jst smiled bck @ her & felt slow nxt 2 her, jst nt quick enuf. So she tlkd & touchd me on shoulder & arm, she kept touchin & stayd close 2 me. U’re w/ each other all day long & it kps hppning, touches & smiles, & it adds up, it builds up, & u knw whr u’ll b tht night, u’re tlkng & evry nw & thn u thnk abt it, no, u don’t thnk, u jst feel it as a kind of destination, wht’s comin up aftr u leave whr u r all evning & u’re hppy abt it & u’re plannng it all, nt in ur head, rly, smwhr inside ur body, or all thru ur body, it’s all mntng up & comin 2gthr so tht whn u get in bed u can’t help it, it’s a real performance, it all pours out, bt slowly, u go easy until u can’t anymore, or u hold back the whole time, u hold bck & touch the edges of evrythng, u edge arnd until u hve 2 plunge in & finish it off, & whn u’re finished, u’re 2 weak 2 stand bt aftr a while u hve 2 go 2 the bthrm & u stand, ur legs r trembling, u hold the doorways, thr’s a lil light coming in thru the window, u can see ur way in & out, bt u can’t rly see the bed.",
    'cat_person.txt': "CmprssdTxt: Flrt_custmrs_barista_tips_movietheatre_boring_Robert_cute_nt_party_cute_dull_class_mdtwn_20s_tall_edge_tattoo_rolledup_sleeve_heavy_long_beard_slumped_shldrs_protctng.",
    'cathedral.txt': """This blind man, an old friend of my wife’s, was on his way to spend the night. His wife had died. So he was visiting the dead wife’s relatives in CT. He called my wife frm in-law’s. Arrangements made. He'd come by train, 5-hr trip, & my wife would meet him @ station. She hadn’t seen him since she wrkd 4 him 1 summer in Seattle 10 yrs ago. But she & blind man kept in touch. They made tapes & mailed them back & forth. I wasn’t enthusiastic abt his visit. He was no 1 I knew. & his being blind bothered me. My idea of blindness came frm movies. In movies, blind moved slowly & never laughed. Smtimes led by seeing-eye dogs. A blind man in my house wasn't smthng I looked 4ward to. That summer in Seattle she needed a job. She didn’t have any money. The man she was going to marry @ the end of summer was in officers’ training school. He didn’t have any money, either. But she was in love w/ the guy, & he was in love w/ her, etc. She’d seen smthng in paper: HELP WANTED—Reading to Blind Man, & a phone number. She phoned & went over, was hired on the spot. She wrkd w/ this blind man all summer. She read stuff 2 him, case studies, reports, tht sort of thing. She helped him organize his little office in county social-service dept. They’d become good friends, my wife & the blind man. On her last day in office, blind man asked if he could touch her face. She agreed 2 this. She told me he touched his fingers 2 every part of her face, her nose—even her neck! She never forgot it. She even tried 2 write a poem abt it. She was always trying 2 write a poem. She wrote a poem or 2 every yr, usually after smthng really important happened 2 her.""",
    'flowers_for_algernon.txt': "I had a test 2day. I think I faled it. & I think tht maybe now they wont use me. What happind is a nice young man was in the room & he had white cards w/ ink spillled all over them. He sed Charlie wht do u see on this card. I was very skared even tho I had my rabits foot in my pockit cuz when I was a kid I always faled tests in school & I spillled ink 2. I told him I saw a inkblot. He said yes & it made me feel good. I thot tht was all but when I got up 2 go he stopped me. He said now sit down Charlie we r nt thru yet. Then I dont remember so good but he wan tid me 2 say wht was in the ink. I dint see nuthing in the ink but he said there was picturs there other pepul saw some picturs. I coudnt see any picturs. I reely tryed 2 see. I held the card close up & then far away. Then I said if 1 had my glases I coud see better I usally only ware my glases in the movies or TV but 1 said they r in the closh in the hall. I got them. Then I said let me see tht card agen I bet III find it now.",
    'sticks.txt': "We left home, married, had kids, found seeds of meanness blooming in us. Dad dressed pole w/ more complexity & less logic. He draped fur on it on Groundhog Day & lugged out floodlight 4 shadow. When earthquake struck Chile, he laid pole on side & spray painted rift. Mom died, he dressed pole as Death & hung baby pics of her. We'd find odd talismans frm his youth @ base: army medals, theater tickets, old sweatshirts, Mom's makeup. 1 autumn he painted pole yellow. Covered it w/ cotton swabs 4 warmth & provided offspring by hammering 6 crossed sticks. He ran string btwn pole & sticks, taped letters of apology, admissions, pleas 4 understanding on index cards. Painted a sign saying LOVE & hung it frm pole & another FORGIVE? then he died in hall w/ radio on & we sold house 2 young couple who yanked out pole & sticks & left them on garbage day.",
    'symbols_and_signs.txt': "4 the 4th time in as many yrs, they faced the problem of what bday present 2 take 2 a young man who was incurably deranged in his mind. Desires he had none. Man-made objects were 2 him either hives of evil, vibrant w/ a malignant activity he alone perceived, or gross comforts w/ no use in his abstract world. Eliminating articles that might offend or frighten him (gadgets were taboo), his parents chose a dainty & innocent trifle—a basket w/ 10 different fruit jellies in 10 little jars.",
    'the_bogey_beast.txt': "CmprssdTxt: Wmn_chrful_old_poor_lonely_cottg_earn_livng_ernds_neighbrs_scant_reward_spry_chry. Smmr_evng_high_rd_hovel_big_blk_pot_ditch_thght_ownr_near_nobody_seen.",
    'the_lottery.txt': "Jun27_clr_sunny_fullsummr_wrmth_flwrs_grn_vil_sq_PO_bank_gathr_10oclk_lott_Jun20_300ppl_2hrs_noondinr. Chldrn_asmbld_schlov_summr_gather_boistrs_play_clssrm_tchr_books_reprmnds. BbbyM_stones_pckts_boys_smooth_rd_stns_HrryJ_DckieD_Dellacroy_pile_guard_girls_talk_sml_chldrn_dust_hold_hands.",
    'the_veldt.txt': "CmprssdTxt: G_lk_nursery. Wht_wrng? Dnt_knw. Jst_lk_call_psych. Wht_psych_wnt_nursery? Wf_ktchn_stv_hum_4supper. Nursry_diff. Hppylf_Hm_30k_installed_apprch_sensitizd_nursry_lght_10ft_hall_lghts_auto. G_Hadley.",
}

In [None]:
# Decompressed text per source file name, hard-coded here rather than produced
# by an API call in this notebook; these strings are what gets embedded below.
# Presumably pasted from GPT-4 responses to the decompression prompt documented
# above - TODO confirm provenance.
gpt4_file_name_to_lossless_decompressed_text: Dict[str, str] = {
    'a_good_man_is_hard_to_find.txt': """Grandmother didn't want to go to Florida. She wanted to visit her connections in east Tennessee and she was seizing at every chance to change Bailey's mind. Bailey was the son she lived with, her only boy. He was sitting on the edge of his chair at the table, bent over the orange sports section of the Journal. "Now look here, Bailey," she said, "see here, read this," and she stood with one hand on her thin hip and the other rattling the newspaper at his bald head. "Here this fellow that calls himself The Misfit is aloose from the Federal Pen and headed toward Florida and you read here what it says he did to these people. Just you read it. I wouldn't take my children in any direction with a criminal like that aloose in it. I couldn't answer to my conscience if I did." Bailey didn't look up from his reading so she wheeled around then and faced the children's mother, a young woman in slacks, whose face was as broad and innocent as a cabbage and was tied around with a green head-kerchief that had two points on the top like rabbit's ears. She was sitting on the sofa, feeding the baby apricots out of a jar. "The children have been to Florida before," the old lady said. "You all ought to take them somewhere else for a change so they would see different parts of the world and be broad. They never have been to east Tennessee.""",
    'break_it_down.txt': "Though of course that wasn't all that went on, because we were together almost all day long. She kept looking at me and every time she looked at me it was worth something, and she smiled at me and didn't stop talking and singing, something I said, she'd sail into it, a snatch, for me, she'd be gone from me a little ways but smiling too, and tell me jokes, and I loved it but didn't exactly know what to do about it and just smiled back at her and felt slow next to her, just not quick enough. So she talked and touched me on the shoulder and arm, she kept touching and stayed close to me. You're with each other all day long and it keeps happening, touches and smiles, and it adds up, it builds up, and you know where you'll be that night, you're talking and every now and then you think about it, no, you don't think, you just feel it as a kind of destination, what's coming up after you leave where you are all evening and you're happy about it and you're planning it all, not in your head, really, somewhere inside your body, or all through your body, it's all mounting up and coming together so that when you get in bed you can't help it, it's a real performance, it all pours out, but slowly, you go easy until you can't anymore, or you hold back the whole time, you hold back and touch the edges of everything, you edge around until you have to plunge in and finish it off, and when you're finished, you're too weak to stand but after a while you have to go to the bathroom and you stand, your legs are trembling, you hold the doorways, there's a little light coming in through the window, you can see your way in and out, but you can't really see the bed.",
    'cat_person.txt': "Flirting with customers, the barista received tips. They went to a movie theatre, but it was boring. Robert was cute, but not a party person. They thought he was cute yet dull in class. They lived in midtown and were in their 20s. He was tall, edgy, with a tattoo on his rolled-up sleeve. He had a heavy, long beard and slumped shoulders, as if he was protecting something.",
    'cathedral.txt': "This blind man, an old friend of my wife's, was on his way to spend the night. His wife had died. So he was visiting the dead wife's relatives in Connecticut. He called my wife from his in-laws'. Arrangements were made. He would come by train, a five-hour trip, and my wife would meet him at the station. She hadn't seen him since she worked for him one summer in Seattle ten years ago. But she and the blind man had kept in touch. They made tapes and mailed them back and forth. I wasn't enthusiastic about his visit. He was no one I knew. And his being blind bothered me. My idea of blindness came from movies. In movies, the blind moved slowly and never laughed. Sometimes they were led by seeing-eye dogs. A blind man in my house wasn't something I looked forward to. That summer in Seattle, she needed a job. She didn't have any money. The man she was going to marry at the end of summer was in officers' training school. He didn't have any money, either. But she was in love with the guy, and he was in love with her, etc. She'd seen something in the paper: HELP WANTED—Reading to Blind Man, and a phone number. She phoned and went over, was hired on the spot. She worked with this blind man all summer. She read stuff to him, case studies, reports, that sort of thing. She helped him organize his little office in the county social-service department. They'd become good friends, my wife and the blind man. On her last day in the office, the blind man asked if he could touch her face. She agreed to this. She told me he touched his fingers to every part of her face, her nose—even her neck! She never forgot it. She even tried to write a poem about it. She was always trying to write a poem. She wrote a poem or two every year, usually after something really important happened to her.",
    'flowers_for_algernon.txt': """I had a test today. I think I failed it. And I think that maybe now they won't use me. What happened is a nice young man was in the room and he had white cards with ink spilled all over them. He said, "Charlie, what do you see on this card?" I was very scared even though I had my rabbit's foot in my pocket because when I was a kid I always failed tests in school and I spilled ink too. I told him I saw an inkblot. He said yes, and it made me feel good. I thought that was all but when I got up to go, he stopped me. He said, "Now sit down, Charlie, we are not through yet." Then I don't remember so good, but he wanted me to say what was in the ink. I didn't see anything in the ink, but he said there were pictures there other people saw some pictures. I couldn't see any pictures. I really tried to see. I held the card close up and then far away. Then I said, "If I had my glasses I could see better. I usually only wear my glasses in the movies or TV, but I said they are in the closet in the hall." I got them. Then I said, "Let me see that card again. I bet I'll find it now.""",
    'sticks.txt': """We left home, got married, had children, and found the seeds of meanness blooming in us. Dad dressed the pole with more complexity and less logic. He draped fur on it on Groundhog Day and lugged out a floodlight for the shadow. When an earthquake struck Chile, he laid the pole on its side and spray-painted a rift. When Mom died, he dressed the pole as Death and hung baby pictures of her. We would find odd talismans from his youth at the base: army medals, theater tickets, old sweatshirts, and Mom's makeup. One autumn, he painted the pole yellow. He covered it with cotton swabs for warmth and provided offspring by hammering six crossed sticks. He ran string between the pole and sticks, taped letters of apology, admissions, and pleas for understanding on index cards. He painted a sign saying "LOVE" and hung it from the pole and another saying "FORGIVE?" Then he died in the hallway with the radio on, and we sold the house to a young couple who yanked out the pole and sticks and left them on the curb for garbage day.""",
    'symbols_and_signs.txt': "For the fourth time in as many years, they faced the problem of what birthday present to take to a young man who was incurably deranged in his mind. Desires he had none. Man-made objects were to him either hives of evil, vibrant with a malignant activity he alone perceived, or gross comforts with no use in his abstract world. Eliminating articles that might offend or frighten him (gadgets were taboo), his parents chose a dainty and innocent trifle—a basket with ten different fruit jellies in ten little jars.",
    'the_bogey_beast.txt': "A cheerful old woman, poor and lonely, lived in a cottage and earned her living by running errands for neighbors, receiving scant reward but remaining spry and cheery. One summer evening, on a high road near her hovel, she found a big black pot in a ditch and thought it must have an owner nearby, but nobody had seen it.",
    'the_lottery.txt': "June 27th was clear and sunny, with the full warmth of summer, flowers blooming, and greenery all around. In the village square, the post office and bank gathered at 10 o'clock for the lottery on June 20th, with 300 people attending the two-hour event followed by a noon dinner. Children assembled at the schoolhouse for their summer gathering, boisterous in their play, while the classroom teacher attended to her books and reprimands. Bobby Martin filled his pockets with stones, as did other boys, choosing smooth, round ones from the road. Harry Jones, Dickie Delacroix, and other children from the village piled stones and stood guard while girls talked and smaller children played in the dust, holding hands.",
    'the_veldt.txt': """"Go look at the nursery." "What's wrong?" "I don't know. Just go look and call a psychologist." "What would a psychologist want with the nursery?" The wife was in the kitchen, the stove humming as she prepared supper. The nursery was different. Their HappyLife Home had cost them thirty thousand dollars, installed with an approach sensitized nursery, the light ten feet down the hall, and all other lights automatic. "Go," said Hadley.""",
}

In [None]:
# UTF-8 encode each GPT-4 lossless-compressed text, keeping the file-name keys.
gpt4_file_name_to_lossless_compressed_bytes: Dict[str, bytes] = dict(
    (name, text.encode('utf-8'))
    for name, text in gpt4_file_name_to_lossless_compressed_text.items()
)

In [None]:
# Embed every GPT-4 lossless-decompressed text, keyed by source file name.
# An explicit loop (rather than a comprehension) keeps the entries computed so
# far in the dict if one embedding request fails midway.
gpt4_file_name_to_lossless_decompressed_embeddings: Dict[str, np.ndarray] = {}
for name in gpt4_file_name_to_lossless_decompressed_text:
    decompressed = gpt4_file_name_to_lossless_decompressed_text[name]
    gpt4_file_name_to_lossless_decompressed_embeddings[name] = OpenAIHandler.get_text_embedding(decompressed)

In [None]:
# Save the GPT-4 lossless decompressed text embeddings as JSON
# ({file name: embedding converted with .tolist()}).
GPT4_DECOMPRESSED_EMBEDDING_LOSSLESS_PATH = 'experiment_data/gpt4_file_name_to_lossless_decompressed_embeddings.json'
# Build the payload BEFORE opening the file: mode 'w' truncates on open, so a
# failure during conversion would otherwise wipe a previously saved file.
json_serializable = {
    file_name: embedding.tolist()
    for file_name, embedding in gpt4_file_name_to_lossless_decompressed_embeddings.items()}
# Create the output directory if it is missing so the embeddings computed above
# are not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(GPT4_DECOMPRESSED_EMBEDDING_LOSSLESS_PATH), exist_ok=True)
with open(GPT4_DECOMPRESSED_EMBEDDING_LOSSLESS_PATH, 'w') as f:
    json.dump(json_serializable, f)

### GPT-3.5

In [None]:
# Prompt templates for the GPT-3.5 lossless compression / decompression round trip.

# System prompt for the compression request.
lossless_compression_system_prompt = """ 
You are a ChatGPT LLM trained by OpenAI to compress text. The compressed text should be able to be decompressed by a different ChatGPT LLM model into the original text. 
The compression must be lossless, meaning that a different ChatGPT LLM model should be able to perfectly reconstruct 
the original text from the compressed representation, without any additional context or information. 
The compressed text does not need to be human readable, only decompressible by a different ChatGPT LLM model. """

# User prompt for the compression request; {TEXT_TO_COMPRESS} is filled in with str.format.
lossless_compression_user_prompt = """ 
Compress the following text. Return only the compressed text with no additional text. Text to compress: {TEXT_TO_COMPRESS}
"""

# System prompt for the decompression request.
lossless_decompression_system_prompt = """ 
You are a ChatGPT LLM trained by OpenAI to decompress text. 
The compressed text you will be given was compressed by a different ChatGPT LLM that was instructed to perform lossless compression.
"""

# User prompt for the decompression request; {TEXT_TO_DECOMPRESS} is filled in with str.format.
lossless_decompression_user_prompt = """ 
Decompress the following text. Return only the decompressed text with no additional text. Text to decompress: {TEXT_TO_DECOMPRESS}
"""

In [None]:
# Ask GPT-3.5 to compress every original text; results are keyed by source file name.
gpt35_file_name_to_lossless_compressed_text: Dict[str, str] = {}
for file_name, original_text in file_name_to_original_text.items():
    user_prompt = lossless_compression_user_prompt.format(TEXT_TO_COMPRESS=original_text)
    # The system prompt is sent as the system message AND repeated at the start
    # of the user message, because gpt-3.5-turbo may not pay strong attention to
    # system messages (see https://platform.openai.com/docs/guides/chat).
    compression_messages = [
        {'role': 'system', 'content': lossless_compression_system_prompt},
        {'role': 'user', 'content': f"{lossless_compression_system_prompt} \n \n {user_prompt}"},
    ]
    completions = OpenAIHandler.get_chat_completion(
        messages=compression_messages,
        model=GPT_35_MODEL)
    # Only the first element of the returned result is kept.
    gpt35_file_name_to_lossless_compressed_text[file_name] = completions[0]

In [None]:
# UTF-8 encode each GPT-3.5 lossless-compressed text, keeping the file-name keys.
gpt35_file_name_to_lossless_compressed_bytes: Dict[str, bytes] = dict(
    (name, text.encode('utf-8'))
    for name, text in gpt35_file_name_to_lossless_compressed_text.items()
)

In [None]:
# Ask GPT-3.5 to decompress every compressed text; results are keyed by source file name.
gpt35_file_name_to_lossless_decompressed_text: Dict[str, str] = {}
for file_name, compressed_text in gpt35_file_name_to_lossless_compressed_text.items():
    # The system prompt is sent as the system message AND repeated at the start
    # of the user message, because gpt-3.5-turbo may not pay strong attention to
    # system messages (see https://platform.openai.com/docs/guides/chat).
    decompression_messages = [
        {'role': 'system', 'content': lossless_decompression_system_prompt},
        {'role': 'user', 'content': f"{lossless_decompression_system_prompt} \n \n {lossless_decompression_user_prompt.format(TEXT_TO_DECOMPRESS=compressed_text)}"}]
    # Use the shared GPT_35_MODEL constant (as the compression cell does) so
    # both halves of the round trip always run against the same model.
    # Only the first element of the returned result is kept.
    gpt35_file_name_to_lossless_decompressed_text[file_name] = OpenAIHandler.get_chat_completion(
        messages=decompression_messages,
        model=GPT_35_MODEL)[0]

In [None]:
# Embed every GPT-3.5 lossless-decompressed text, keyed by source file name.
# An explicit loop (rather than a comprehension) keeps the entries computed so
# far in the dict if one embedding request fails midway.
gpt35_file_name_to_lossless_decompressed_embeddings: Dict[str, np.ndarray] = {}
for name in gpt35_file_name_to_lossless_decompressed_text:
    decompressed = gpt35_file_name_to_lossless_decompressed_text[name]
    gpt35_file_name_to_lossless_decompressed_embeddings[name] = OpenAIHandler.get_text_embedding(decompressed)

In [None]:
# Save the GPT-3.5 lossless decompressed text embeddings as JSON
# ({file name: embedding converted with .tolist()}).
GPT35_DECOMPRESSED_EMBEDDING_LOSSLESS_PATH = 'experiment_data/gpt35_file_name_to_lossless_decompressed_embeddings.json'
# Build the payload BEFORE opening the file: mode 'w' truncates on open, so a
# failure during conversion would otherwise wipe a previously saved file.
json_serializable = {
    file_name: embedding.tolist()
    for file_name, embedding in gpt35_file_name_to_lossless_decompressed_embeddings.items()}
# Create the output directory if it is missing so the embeddings computed above
# are not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(GPT35_DECOMPRESSED_EMBEDDING_LOSSLESS_PATH), exist_ok=True)
with open(GPT35_DECOMPRESSED_EMBEDDING_LOSSLESS_PATH, 'w') as f:
    json.dump(json_serializable, f)

# Optimizing for Semantic Compression

**Compression prompt**
> Please compress the following text into a latent representation that a different GPT4 model can decompress into the original text. The compression model should purely minimize  the number of characters in the compressed representation, while maintaining the semantics of the original text. The resulting compressed text does not need to be decompressed into the original text, but should capture the semantics of the original text. The compressed text should be able to be decompressed into a text that is semantically similar to the original text, but does not need to be identical.

**Decompression prompt**
> A different GPT4 model was given the following prompt:
"Please compress the following text into a latent representation that a different GPT4 model can decompress into the original text. The compression model should purely minimize  the number of characters in the compressed representation, while maintaining the semantics of the original text. The resulting compressed text does not need to be decompressed into the original text, but should capture the semantics of the original text. The compressed text should be able to be decompressed into a text that is semantically similar to the original text, but does not need to be identical."

> Please decompress the following text into semantically similar text.

### GPT4

In [None]:
# Semantically compressed text per source file name, hard-coded here rather
# than produced by an API call in this notebook.
# Presumably pasted from GPT-4 responses to the semantic compression prompt
# documented above - TODO confirm provenance.
gpt4_file_name_to_semantically_compressed_text: Dict[str, str] = {
    'a_good_man_is_hard_to_find.txt': "G'ma didn't want FL trip; aimed for E TN connections. Tried to change son Bailey's mind, pointing out Misfit criminal news. Bailey, unresponsive, g'ma turns to DIL, suggests new destination for variety.",
    'break_it_down.txt': "Together all day, her looks & smiles valuable. She sang, joked, touched me. Building anticipation, knowing night's outcome. Unspoken planning in body, leading to passionate performance. Drained, trembling, barely seeing bed.",
    'cat_person.txt': "Flirting for tips during barista days, continued at theater job. Found Robert cute, but not enough to approach at party. Imagined college crush, mid-20s, tall, tattoo glimpse. However, slightly heavy, long beard, slumped shoulders.",
    'cathedral.txt': "Blind man, wife's old friend, visits after his wife's death. Wife met him during Seattle job 10 yrs ago; they exchanged tapes. I was apprehensive. Wife got job reading to him; they became close. He touched her face, leaving lasting impression. She wrote poems about important events.",
    'flowers_for_algernon.txt': "Had a test, think I failed. Young man had inkblot cards, asked what I saw. Scared, but saw inkblot. He asked for more, but I saw no pictures. Tried with glasses, still couldn't see.",
    'sticks.txt': "Left home, had our own families. Dad's pole decorations got more complex. He added fur, painted rifts, honored Mom's death, displayed old items. Painted pole yellow, added sticks, wrote apologies. Dad died, house sold, new owners removed pole.",
    'symbols_and_signs.txt': "Fourth time in years, they pondered a birthday gift for their mentally ill son. Lacking desires, he saw objects as evil or useless. Avoiding gadgets, parents picked a simple gift—a basket with ten fruit jellies in jars.",
    'the_bogey_beast.txt': "An old, poor, and lonely woman, always cheerful, earned a living by helping neighbors. One summer evening, she found a big black pot in a ditch, thinking it'd be useful if she had something to put in it, but she didn't. She searched for the owner but found nobody.",
    'the_lottery.txt': "On a clear, sunny June 27th morning, the villagers gathered for the annual lottery. It took less than two hours due to the village's small population. Children arrived first, talking about school and teachers, while boys collected stones for the event. Girls stood apart, observing the boys, as younger kids played in the dust or stayed close to older siblings.",
    'the_veldt.txt': """ "George, please check the nursery or call a psychologist." George's wife was concerned about changes in the nursery. They walked through their soundproofed, automated Happylife Home to inspect the room, with lights activating and deactivating as they moved.""",
}

In [None]:
# Semantically decompressed text per source file name, hard-coded here rather
# than produced by an API call in this notebook; these strings are what gets
# embedded below.
# Presumably pasted from GPT-4 responses to the semantic decompression prompt
# documented above - TODO confirm provenance.
gpt4_file_name_to_semantically_decompressed_text: Dict[str, str] = {
    'a_good_man_is_hard_to_find.txt': "Grandma was not in favor of the Florida trip; she wanted to visit relatives in East Tennessee instead. She attempted to persuade her son Bailey by mentioning news about a criminal called the Misfit. When Bailey didn't respond, Grandma turned to her daughter-in-law and proposed a different destination for the sake of variety.",
    'break_it_down.txt': "Spending the entire day together, her glances and smiles were cherished. She sang, made jokes, and made physical contact. The anticipation built up, aware of what would transpire at night. There was an unspoken understanding between our bodies, culminating in an intense and passionate encounter. Exhausted, shaking, and hardly noticing the bed.",
    'cat_person.txt': "During her time as a barista, she flirted for tips and carried that habit to her theater job. She thought Robert was attractive, but not enough to initiate a conversation at a party. She pictured her college crush: a man in his mid-20s, tall, with a hint of a tattoo. However, he was slightly overweight, had a long beard, and slouched shoulders.",
    'cathedral.txt': "A blind man, who was an old friend of his wife, came to visit after the passing of his own wife. The wife had met him while working in Seattle a decade ago, and they used to exchange tapes. The husband felt uneasy about the situation. His wife took a job reading to the blind man, and their bond grew stronger. When the blind man touched her face, it left a profound impact on her. She began writing poetry about significant moments in their lives.",
    'flowers_for_algernon.txt': "I took an exam and believe I didn't do well. A young man presented inkblot cards and inquired about my interpretations. Feeling nervous, I could only see the inkblot itself. He prompted me to provide more details, but I couldn't identify any images. I attempted to use glasses, but it made no difference in my perception.",
    'sticks.txt': "We moved away and started our own families, while our father's decorative pole creations became increasingly intricate. He incorporated fur, painted cracks, commemorated our mother's passing, and showcased nostalgic items. He painted the pole yellow, attached sticks, and inscribed apologies. After our father passed away, the house was sold, and the new owners took down the pole.",
    'symbols_and_signs.txt': "For the fourth time in recent years, the parents contemplated an appropriate birthday present for their son, who struggled with mental illness. He viewed objects as either malevolent or worthless due to his lack of desires. Steering clear of electronic devices, they ultimately chose a modest gift—a basket containing ten jars of fruit jelly.",
    'the_bogey_beast.txt': "An elderly, impoverished, and isolated woman maintained her cheerfulness while making ends meet by assisting her neighbors. On a warm summer evening, she discovered a large black pot in a ditch and thought it could be useful if she had something to fill it with, but she didn't. She tried to locate the pot's owner, but her search was unsuccessful.",
    'the_lottery.txt': "On a bright and sunny morning of June 27th, the villagers assembled for their yearly lottery event. The process was relatively brief, lasting under two hours, due to the limited number of inhabitants. The children were the first to arrive, chatting about school and their instructors. Boys busied themselves by gathering stones for the occasion, while girls watched them from a distance. Meanwhile, the younger children either played in the dusty ground or remained close to their elder siblings.",
    'the_veldt.txt': """"George, either inspect the nursery or contact a psychologist," urged George's wife, worried about the alterations in their nursery. They strolled through their noiseless, fully automated Happylife Home to examine the room. As they progressed, lights turned on and off in response to their movements.""",
}

In [None]:
# UTF-8 encode each semantically compressed GPT-4 text so its size can be measured in bytes
gpt4_file_name_to_semantically_compressed_bytes = {
    name: bytes(gpt4_file_name_to_semantically_compressed_text[name], encoding='utf-8')
    for name in gpt4_file_name_to_semantically_compressed_text
}

In [None]:
# Embed every semantically decompressed GPT-4 text, keyed by its source file name.
# Filled incrementally so already-computed embeddings are kept if a later call fails.
gpt4_file_name_to_semantically_decompressed_embeddings: Dict[str, np.ndarray] = {}
for file_name in gpt4_file_name_to_semantically_decompressed_text:
    text = gpt4_file_name_to_semantically_decompressed_text[file_name]
    gpt4_file_name_to_semantically_decompressed_embeddings[file_name] = OpenAIHandler.get_text_embedding(text)

In [None]:
# Save the embeddings of the semantically decompressed GPT-4 texts as JSON
GPT4_DECOMPRESSED_EMBEDDING_SEMANTIC_PATH = 'experiment_data/gpt4_file_name_to_semantically_decompressed_embeddings.json'

# Convert the embeddings to plain lists first: opening the file in 'w' mode truncates it,
# so a failure during conversion must not leave an empty file behind.
json_serializable = {
    file_name: embedding.tolist()
    for file_name, embedding in gpt4_file_name_to_semantically_decompressed_embeddings.items()}
with open(GPT4_DECOMPRESSED_EMBEDDING_SEMANTIC_PATH, 'w') as f:
    json.dump(json_serializable, f)

### GPT-3.5

In [None]:
# Prompt constants for the GPT-3.5 semantic (lossy) compression experiment.
# NOTE(review): the prompt wording (including typos such as "perserve" and "semnatical")
# is sent to the model verbatim, so it is part of the experiment input; do not edit it
# without re-running the experiment.

# System prompt for the compression request.
semantical_compression_system_prompt = """ 
You are a ChatGPT LLM trained by OpenAI to compress text. 
The compression model should purely minimize the number of characters in the compressed text, while maintaining the semantics of the original text. 
The resulting compressed text does not need to be decompressed into exactly the original text, but should capture the semantics of the original text. 
The compressed text should be able to be decompressed into a text that is semantically similar to the original text, but does not need to be identical. """

# User prompt template for compression; {TEXT_TO_COMPRESS} is filled in with str.format.
semantical_compression_user_prompt = """ 
Compress the following text. Return only the compressed text with no additional text. Text to compress: {TEXT_TO_COMPRESS}
"""

# System prompt for the decompression request.
semantical_decompression_system_prompt = """ 
You are a ChatGPT LLM trained by OpenAI to decompress text. 
The compressed text you will be given was compressed by a different ChatGPT LLM that was instructed to maximize the compression rate and perserve semnatical meaning.
The decompressed text does not need to match the original exactly, but the decompressed text should have the same semantical meaning as the original text.
"""

# User prompt template for decompression; {TEXT_TO_DECOMPRESS} is filled in with str.format.
semantical_decompression_user_prompt = """ 
Decompress the following text. Return only the decompressed text with no additional text. Text to decompress: {TEXT_TO_DECOMPRESS}
"""

In [None]:
# Ask GPT-3.5 to semantically compress every original text, keyed by file name
gpt35_file_name_to_semantical_compressed_text: Dict[str, str] = {}
for file_name, original_text in file_name_to_original_text.items():

    # The system prompt is sent twice: as the system message and again at the start of the
    # user message, because GPT-3.5 Turbo does not always pay strong attention to the
    # system message, as outlined here:
    # https://platform.openai.com/docs/guides/chat
    user_prompt = semantical_compression_user_prompt.format(TEXT_TO_COMPRESS=original_text)
    compression_messages = [
        {'role': 'system', 'content': semantical_compression_system_prompt},
        {'role': 'user', 'content': f"{semantical_compression_system_prompt} \n \n {user_prompt}"}]
    # Use the shared model constant instead of repeating the model name literal.
    # Only the first element of the handler's return value is kept.
    gpt35_file_name_to_semantical_compressed_text[file_name] = OpenAIHandler.get_chat_completion(
        messages=compression_messages,
        model=GPT_35_MODEL)[0]

In [None]:
# UTF-8 encode each semantically compressed GPT-3.5 text so its size can be measured in bytes
gpt35_file_name_to_semantical_compressed_bytes = {
    name: bytes(gpt35_file_name_to_semantical_compressed_text[name], encoding='utf-8')
    for name in gpt35_file_name_to_semantical_compressed_text
}

In [None]:
# Ask GPT-3.5 to decompress every semantically compressed text, keyed by file name
gpt35_file_name_to_semantical_decompressed_text: Dict[str, str] = {}
for file_name, compressed_text in gpt35_file_name_to_semantical_compressed_text.items():

    # The system prompt is sent twice: as the system message and again at the start of the
    # user message, because GPT-3.5 Turbo does not always pay strong attention to the
    # system message, as outlined here:
    # https://platform.openai.com/docs/guides/chat
    user_prompt = semantical_decompression_user_prompt.format(TEXT_TO_DECOMPRESS=compressed_text)
    decompression_messages = [
        {'role': 'system', 'content': semantical_decompression_system_prompt},
        {'role': 'user', 'content': f"{semantical_decompression_system_prompt} \n \n {user_prompt}"}]
    # Use the shared model constant instead of repeating the model name literal.
    # Only the first element of the handler's return value is kept.
    gpt35_file_name_to_semantical_decompressed_text[file_name] = OpenAIHandler.get_chat_completion(
        messages=decompression_messages,
        model=GPT_35_MODEL)[0]

In [None]:
# Embed every semantically decompressed GPT-3.5 text, keyed by its source file name.
# Filled incrementally so already-computed embeddings are kept if a later call fails.
gpt35_file_name_to_semantical_decompressed_embeddings: Dict[str, np.ndarray] = {}
for file_name in gpt35_file_name_to_semantical_decompressed_text:
    decompressed_text = gpt35_file_name_to_semantical_decompressed_text[file_name]
    gpt35_file_name_to_semantical_decompressed_embeddings[file_name] = OpenAIHandler.get_text_embedding(decompressed_text)

In [None]:
# Save the embeddings of the semantically decompressed GPT-3.5 texts as JSON
GPT35_DECOMPRESSED_EMBEDDING_SEMANTIC_PATH = 'experiment_data/gpt35_file_name_to_semantically_decompressed_embeddings.json'

# Convert the embeddings to plain lists first: opening the file in 'w' mode truncates it,
# so a failure during conversion must not leave an empty file behind.
json_serializable = {
    file_name: embedding.tolist()
    for file_name, embedding in gpt35_file_name_to_semantical_decompressed_embeddings.items()}
with open(GPT35_DECOMPRESSED_EMBEDDING_SEMANTIC_PATH, 'w') as f:
    json.dump(json_serializable, f)

### Load Base Compression Analysis Results

In [None]:
# Load the embeddings for the original texts
GPT4_ORIGINAL_EMBEDDING_PATH = 'experiment_data/gpt4_embeddings.json'
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(GPT4_ORIGINAL_EMBEDDING_PATH) as f:
    gpt4_file_name_to_original_text_embeddings = json.load(f)

In [None]:
# Load the base compressed texts
GPT4_COMPRESSED_TEXT_PATH = 'experiment_data/gpt4_compressed_text.json'
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(GPT4_COMPRESSED_TEXT_PATH) as f:
    gpt4_file_name_to_base_compressed_text: Dict[str, str] = json.load(f)

In [None]:
# UTF-8 encode each base compressed GPT-4 text so its size can be measured in bytes
gpt4_file_name_to_base_compressed_bytes = {
    name: bytes(gpt4_file_name_to_base_compressed_text[name], encoding='utf-8')
    for name in gpt4_file_name_to_base_compressed_text
}

In [None]:
# Load the base decompressed texts
GPT4_DECOMPRESSED_TEXT_PATH = 'experiment_data/gpt4_decompressed_text.json'
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(GPT4_DECOMPRESSED_TEXT_PATH) as f:
    file_name_to_base_decompressed_text = json.load(f)

In [None]:
# Load the embeddings of the base decompressed texts
GPT4_DECOMPRESSED_EMBEDDING_PATH = 'experiment_data/gpt4_decompressed_embeddings.json'
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(GPT4_DECOMPRESSED_EMBEDDING_PATH) as f:
    gpt4_file_name_to_base_decompressed_embeddings = json.load(f)

### Computing Entropy

In [None]:
# Base Entropy: ChatGPT-4
# NOTE(review): entropy is computed over str(bytes), i.e. the "b'...'" repr of the
# payload rather than the raw bytes — confirm this matches how the baselines were computed.
gpt4_file_name_to_base_compressed_bytes_entropy = {
    name: functions.entropy(str(gpt4_file_name_to_base_compressed_bytes[name]))
    for name in gpt4_file_name_to_base_compressed_bytes
}

In [None]:
# Lossless Entropy: ChatGPT-4
# NOTE(review): entropy is computed over str(bytes), i.e. the "b'...'" repr of the
# payload rather than the raw bytes — confirm this matches how the baselines were computed.
gpt4_file_name_to_lossless_compressed_bytes_entropy = {
    name: functions.entropy(str(gpt4_file_name_to_lossless_compressed_bytes[name]))
    for name in gpt4_file_name_to_lossless_compressed_bytes
}

In [None]:
# Semantic Entropy: ChatGPT-4
# NOTE(review): entropy is computed over str(bytes), i.e. the "b'...'" repr of the
# payload rather than the raw bytes — confirm this matches how the baselines were computed.
gpt4_file_name_to_semantically_compressed_bytes_entropy = {
    name: functions.entropy(str(gpt4_file_name_to_semantically_compressed_bytes[name]))
    for name in gpt4_file_name_to_semantically_compressed_bytes
}

In [None]:
# Lossless Entropy: ChatGPT-3.5
# NOTE(review): entropy is computed over str(bytes), i.e. the "b'...'" repr of the
# payload rather than the raw bytes — confirm this matches how the baselines were computed.
gpt35_file_name_to_lossless_compressed_bytes_entropy = {
    name: functions.entropy(str(gpt35_file_name_to_lossless_compressed_bytes[name]))
    for name in gpt35_file_name_to_lossless_compressed_bytes
}

In [None]:
# Semantic Entropy: ChatGPT-3.5
# NOTE(review): entropy is computed over str(bytes), i.e. the "b'...'" repr of the
# payload rather than the raw bytes — confirm this matches how the baselines were computed.
gpt35_file_name_to_semantical_compressed_bytes_entropy = {
    name: functions.entropy(str(gpt35_file_name_to_semantical_compressed_bytes[name]))
    for name in gpt35_file_name_to_semantical_compressed_bytes
}

In [None]:
# Load the precomputed entropy values for the zlib "most compression" baseline
ZLIB_MOST_COMPRESSED_DATA_PATH = "experiment_data/zlib_most_compressed_bytes_entropy_sim.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_MOST_COMPRESSED_DATA_PATH) as f:
    zlib_most_compression_entropy = json.load(f)

In [None]:
# Load the precomputed entropy values for the zlib "least compression" baseline
ZLIB_LEAST_COMPRESSED_DATA_PATH = "experiment_data/zlib_least_compressed_bytes_entropy_sim.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_LEAST_COMPRESSED_DATA_PATH) as f:
    zlib_least_compression_entropy = json.load(f)

### Graphing Entropy

In [None]:
# Collect the per-file entropies, one column per compression method
entropy_by_method = {
    'Base Compression (ChatGPT-4)': gpt4_file_name_to_base_compressed_bytes_entropy,
    'Lossless Compression (ChatGPT-4)': gpt4_file_name_to_lossless_compressed_bytes_entropy,
    'Semantic Compression (ChatGPT-4)': gpt4_file_name_to_semantically_compressed_bytes_entropy,
    'Lossless Compression (ChatGPT-3.5)': gpt35_file_name_to_lossless_compressed_bytes_entropy,
    'Semantic Compression (ChatGPT-3.5)': gpt35_file_name_to_semantical_compressed_bytes_entropy,
    'Zlib Most Compression': zlib_most_compression_entropy,
    'Zlib Least Compression': zlib_least_compression_entropy,
}

# Scale by the single largest value in the whole table (not per column),
# so every entry is relative to the global maximum
combined_entropy_df = pd.DataFrame(entropy_by_method).pipe(
    lambda frame: frame / frame.max().max())

In [None]:
# Grouped bar chart: one group per text, one bar per compression method
px.bar(
    combined_entropy_df,
    barmode='group',
    title='Relative Entropy of Compressed Bytes',
    labels={
        'value': 'Entropy',
        'index': 'Text',
        # Wide-form input: the legend title defaults to "variable" unless relabelled,
        # so name it like the other grouped plots in this notebook
        'variable': 'Compression Method'})

In [None]:
# Mean relative entropy per compression method, averaged over all texts.
# The result is a Series indexed by compression method, despite the `_df` suffix.
transposed_entropy_df = combined_entropy_df.T.mean(axis=1)

In [None]:
# Bar chart of the mean relative entropy, one bar per compression method
averaged_entropy_labels = {
    'value': 'Relative Entropy',
    'index': 'Compression Method'}
fig = px.bar(
    transposed_entropy_df,
    labels=averaged_entropy_labels,
    text=transposed_entropy_df.values.round(3),
    color=transposed_entropy_df.index,
    title='Averaged Entropy of Compressed Bytes')
# Order the bars from the largest to the smallest value
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

### Computing Compression Ratio

In [None]:
# Base Compression Ratio: ChatGPT-4
# The ratio is 1 - compressed_size / original_size (both in UTF-8 bytes),
# so values closer to 1 mean stronger compression.
gpt4_file_name_to_base_compression_ratio: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_size = len(gpt4_file_name_to_base_compressed_bytes[file_name])
    original_size = len(original_text.encode('utf-8'))
    compression_ratio = 1 - (compressed_size / original_size)
    gpt4_file_name_to_base_compression_ratio[file_name] = compression_ratio

In [None]:
# Lossless Compression Ratio: ChatGPT-4
# The ratio is 1 - compressed_size / original_size (both in UTF-8 bytes),
# so values closer to 1 mean stronger compression.
gpt4_file_name_to_lossless_compression_ratio: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_size = len(gpt4_file_name_to_lossless_compressed_bytes[file_name])
    original_size = len(original_text.encode('utf-8'))
    compression_ratio = 1 - (compressed_size / original_size)
    gpt4_file_name_to_lossless_compression_ratio[file_name] = compression_ratio

In [None]:
# Semantic Compression Ratio: ChatGPT-4
# The ratio is 1 - compressed_size / original_size (both in UTF-8 bytes),
# so values closer to 1 mean stronger compression.
gpt4_file_name_to_semantic_compression_ratio: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_size = len(gpt4_file_name_to_semantically_compressed_bytes[file_name])
    original_size = len(original_text.encode('utf-8'))
    compression_ratio = 1 - (compressed_size / original_size)
    gpt4_file_name_to_semantic_compression_ratio[file_name] = compression_ratio

In [None]:
# Lossless Compression Ratio: ChatGPT-3.5
# The ratio is 1 - compressed_size / original_size (both in UTF-8 bytes),
# so values closer to 1 mean stronger compression.
gpt35_file_name_to_lossless_compression_ratio: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_size = len(gpt35_file_name_to_lossless_compressed_bytes[file_name])
    original_size = len(original_text.encode('utf-8'))
    compression_ratio = 1 - (compressed_size / original_size)
    gpt35_file_name_to_lossless_compression_ratio[file_name] = compression_ratio

In [None]:
# Semantic Compression Ratio: ChatGPT-3.5
# The ratio is 1 - compressed_size / original_size (both in UTF-8 bytes),
# so values closer to 1 mean stronger compression.
gpt35_file_name_to_semantic_compression_ratio: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_size = len(gpt35_file_name_to_semantical_compressed_bytes[file_name])
    original_size = len(original_text.encode('utf-8'))
    compression_ratio = 1 - (compressed_size / original_size)
    gpt35_file_name_to_semantic_compression_ratio[file_name] = compression_ratio

In [None]:
# Load the precomputed compression ratios for the zlib "most compression" baseline
ZLIB_MOST_COMPRESSION_RATIO_DATA_PATH = "experiment_data/zlib_most_compression_ratio.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_MOST_COMPRESSION_RATIO_DATA_PATH) as f:
    zlib_most_compression_ratio = json.load(f)

In [None]:
# Load the precomputed compression ratios for the zlib "least compression" baseline
ZLIB_LEAST_COMPRESSION_RATIO_DATA_PATH = "experiment_data/zlib_least_compression_ratio.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_LEAST_COMPRESSION_RATIO_DATA_PATH) as f:
    zlib_least_compression_ratio = json.load(f)

### Graphing Compression Ratio

In [None]:
# Collect the per-file compression ratios, one column per compression method
ratio_by_method = {
    'Base Compression (ChatGPT-4)': gpt4_file_name_to_base_compression_ratio,
    'Lossless Compression (ChatGPT-4)': gpt4_file_name_to_lossless_compression_ratio,
    'Semantic Compression (ChatGPT-4)': gpt4_file_name_to_semantic_compression_ratio,
    'Lossless Compression (ChatGPT-3.5)': gpt35_file_name_to_lossless_compression_ratio,
    'Semantic Compression (ChatGPT-3.5)': gpt35_file_name_to_semantic_compression_ratio,
    'Zlib Most Compression': zlib_most_compression_ratio,
    'Zlib Least Compression': zlib_least_compression_ratio,
}
combined_ratio_df = pd.DataFrame(ratio_by_method)

In [None]:
# Grouped bar chart of the compression ratio: one group per text, one bar per method
ratio_plot_labels = {
    'value': 'Compression Ratio',
    'index': 'Text',
    'variable': 'Compression Method'}
px.bar(
    combined_ratio_df,
    barmode='group',
    labels=ratio_plot_labels,
    title='Compression Ratio')

In [None]:
# Mean compression ratio per compression method, averaged over all texts.
# The result is a Series indexed by compression method, despite the `_df` suffix.
transposed_ratio_df = combined_ratio_df.T.mean(axis=1)

In [None]:
# Bar chart of the mean compression ratio, one bar per compression method
averaged_ratio_labels = {
    'value': 'Average Compression Ratio',
    'index': 'Compression Method'}
fig = px.bar(
    transposed_ratio_df,
    x=transposed_ratio_df.index,
    y=transposed_ratio_df.values,
    labels=averaged_ratio_labels,
    text=transposed_ratio_df.values.round(3),
    color=transposed_ratio_df.index,
    title='Averaged Compression Ratio')
# Order the bars from the largest to the smallest value
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

### Computing Edit Distance

In [None]:
# Base Compression Edit Distance: ChatGPT-4
# Edit distance between each decompressed text and its original, keyed by file name
gpt4_file_name_to_base_compression_edit_distance: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    decompressed_text = file_name_to_base_decompressed_text[file_name]
    compression_edit_distance = functions.edit_distance(decompressed_text, original_text)
    gpt4_file_name_to_base_compression_edit_distance[file_name] = compression_edit_distance

In [None]:
# Lossless Compression Edit Distance: ChatGPT-4
# Edit distance between each decompressed text and its original, keyed by file name
gpt4_file_name_to_lossless_compression_edit_distance: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    decompressed_text = gpt4_file_name_to_lossless_decompressed_text[file_name]
    compression_edit_distance = functions.edit_distance(decompressed_text, original_text)
    gpt4_file_name_to_lossless_compression_edit_distance[file_name] = compression_edit_distance

In [None]:
# Semantic Compression Edit Distance: ChatGPT-4
# Edit distance between each decompressed text and its original, keyed by file name
gpt4_file_name_to_semantic_compression_edit_distance: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    decompressed_text = gpt4_file_name_to_semantically_decompressed_text[file_name]
    compression_edit_distance = functions.edit_distance(decompressed_text, original_text)
    gpt4_file_name_to_semantic_compression_edit_distance[file_name] = compression_edit_distance

In [None]:
# Lossless Compression Edit Distance: ChatGPT-3.5
# Edit distance between each decompressed text and its original, keyed by file name
gpt35_file_name_to_lossless_compression_edit_distance: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    decompressed_text = gpt35_file_name_to_lossless_decompressed_text[file_name]
    compression_edit_distance = functions.edit_distance(decompressed_text, original_text)
    gpt35_file_name_to_lossless_compression_edit_distance[file_name] = compression_edit_distance

In [None]:
# Semantic Compression Edit Distance: ChatGPT-3.5
# Edit distance between each decompressed text and its original, keyed by file name
gpt35_file_name_to_semantic_compression_edit_distance: Dict[str, float] = {}
for file_name, original_text in file_name_to_original_text.items():
    decompressed_text = gpt35_file_name_to_semantical_decompressed_text[file_name]
    compression_edit_distance = functions.edit_distance(decompressed_text, original_text)
    gpt35_file_name_to_semantic_compression_edit_distance[file_name] = compression_edit_distance

In [None]:
# Load the precomputed edit distances for the zlib "most compression" baseline
ZLIB_MOST_COMPRESSION_EDIT_DISTANCE_DATA_PATH = "experiment_data/zlib_most_compression_edit_distance.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_MOST_COMPRESSION_EDIT_DISTANCE_DATA_PATH) as f:
    zlib_most_edit_distance = json.load(f)

In [None]:
# Load the precomputed edit distances for the zlib "least compression" baseline
ZLIB_LEAST_COMPRESSION_EDIT_DISTANCE_DATA_PATH = "experiment_data/zlib_least_compression_edit_distance.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_LEAST_COMPRESSION_EDIT_DISTANCE_DATA_PATH) as f:
    zlib_least_edit_distance = json.load(f)

### Graphing Edit Distance

In [None]:
# Collect the per-file edit distances, one column per compression method
edit_distance_by_method = {
    'Base Compression (ChatGPT-4)': gpt4_file_name_to_base_compression_edit_distance,
    'Lossless Compression (ChatGPT-4)': gpt4_file_name_to_lossless_compression_edit_distance,
    'Semantic Compression (ChatGPT-4)': gpt4_file_name_to_semantic_compression_edit_distance,
    'Lossless Compression (ChatGPT-3.5)': gpt35_file_name_to_lossless_compression_edit_distance,
    'Semantic Compression (ChatGPT-3.5)': gpt35_file_name_to_semantic_compression_edit_distance,
    'Zlib Most Compression': zlib_most_edit_distance,
    'Zlib Least Compression': zlib_least_edit_distance,
}
combined_edit_distance_df = pd.DataFrame(edit_distance_by_method)

In [None]:
# Scale by the single largest value in the whole table (not per column),
# so every entry is relative to the global maximum
largest_edit_distance = combined_edit_distance_df.max().max()
combined_edit_distance_df = combined_edit_distance_df / largest_edit_distance

In [None]:
# Grouped bar chart of the decompression edit distance: one group per text, one bar per method
px.bar(
    combined_edit_distance_df,
    # Title typo fixed ('Deompression' -> 'Decompression')
    title='Decompression Edit Distance',
    labels={
        'value': 'Edit Distance',
        'index': 'Text',
        'variable': 'Compression Method'},
    barmode='group')

In [None]:
# Mean normalised edit distance per compression method, averaged over all texts.
# The result is a Series indexed by compression method, despite the `_df` suffix.
transposed_edit_distance_df = combined_edit_distance_df.T.mean(axis=1)

In [None]:
# Bar chart of the mean normalised edit distance, one bar per compression method
averaged_edit_distance_labels = {
    'value': 'Edit Distance',
    'index': 'Compression Method'}
fig = px.bar(
    transposed_edit_distance_df,
    labels=averaged_edit_distance_labels,
    text=transposed_edit_distance_df.values.round(6),
    color=transposed_edit_distance_df.index,
    title='Decompression Edit Distance')
# Order the bars from the largest to the smallest value
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

### Computing Embedding Cosine Distance

In [None]:
# Base Decompression Cosine Similarity: ChatGPT-4
# NOTE(review): the variables say "similarity" but the helper is named cosine_distance —
# confirm which quantity functions.cosine_distance actually returns.
gpt4_file_name_to_base_decompression_cosine_similarity: Dict[str, float] = {}
for file_name in file_name_to_original_text:
    decompressed_embedding = gpt4_file_name_to_base_decompressed_embeddings[file_name]
    original_embedding = gpt4_file_name_to_original_text_embeddings[file_name]
    decompression_cosine_similarity = functions.cosine_distance(decompressed_embedding, original_embedding)
    gpt4_file_name_to_base_decompression_cosine_similarity[file_name] = decompression_cosine_similarity

In [None]:
# Lossless Decompression Cosine Similarity: ChatGPT-4
# NOTE(review): the variables say "similarity" but the helper is named cosine_distance —
# confirm which quantity functions.cosine_distance actually returns.
gpt4_file_name_to_lossless_decompression_cosine_similarity: Dict[str, float] = {}
for file_name in file_name_to_original_text:
    decompressed_embedding = gpt4_file_name_to_lossless_decompressed_embeddings[file_name]
    original_embedding = gpt4_file_name_to_original_text_embeddings[file_name]
    decompression_cosine_similarity = functions.cosine_distance(decompressed_embedding, original_embedding)
    gpt4_file_name_to_lossless_decompression_cosine_similarity[file_name] = decompression_cosine_similarity

In [None]:
# Semantic Decompression Cosine Similarity: ChatGPT-4
# NOTE(review): the variables say "similarity" but the helper is named cosine_distance —
# confirm which quantity functions.cosine_distance actually returns.
gpt4_file_name_to_semantic_decompression_cosine_similarity: Dict[str, float] = {}
for file_name in file_name_to_original_text:
    decompressed_embedding = gpt4_file_name_to_semantically_decompressed_embeddings[file_name]
    original_embedding = gpt4_file_name_to_original_text_embeddings[file_name]
    decompression_cosine_similarity = functions.cosine_distance(decompressed_embedding, original_embedding)
    gpt4_file_name_to_semantic_decompression_cosine_similarity[file_name] = decompression_cosine_similarity

In [None]:
# Lossless Decompression Cosine Similarity: ChatGPT-3.5
# NOTE(review): the variables say "similarity" but the helper is named cosine_distance —
# confirm which quantity functions.cosine_distance actually returns.
gpt35_file_name_to_lossless_decompression_cosine_similarity: Dict[str, float] = {}
for file_name in file_name_to_original_text:
    decompressed_embedding = gpt35_file_name_to_lossless_decompressed_embeddings[file_name]
    original_embedding = gpt4_file_name_to_original_text_embeddings[file_name]
    decompression_cosine_similarity = functions.cosine_distance(decompressed_embedding, original_embedding)
    gpt35_file_name_to_lossless_decompression_cosine_similarity[file_name] = decompression_cosine_similarity

In [None]:
# Semantic Decompression Cosine Similarity: ChatGPT-3.5
# NOTE(review): the variables say "similarity" but the helper is named cosine_distance —
# confirm which quantity functions.cosine_distance actually returns.
gpt35_file_name_to_semantic_decompression_cosine_similarity: Dict[str, float] = {}
for file_name in file_name_to_original_text:
    decompressed_embedding = gpt35_file_name_to_semantical_decompressed_embeddings[file_name]
    original_embedding = gpt4_file_name_to_original_text_embeddings[file_name]
    decompression_cosine_similarity = functions.cosine_distance(decompressed_embedding, original_embedding)
    gpt35_file_name_to_semantic_decompression_cosine_similarity[file_name] = decompression_cosine_similarity

In [None]:
# Load the precomputed cosine values for the zlib "most compression" baseline
ZLIB_MOST_COMPRESSION_COSINE_SIM_DATA_PATH = "experiment_data/zlib_most_cosine_sim.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_MOST_COMPRESSION_COSINE_SIM_DATA_PATH) as f:
    zlib_most_cosine_sim = json.load(f)

In [None]:
# Load the precomputed cosine values for the zlib "least compression" baseline
ZLIB_LEAST_COMPRESSION_COSINE_SIM_DATA_PATH = "experiment_data/zlib_least_cosine_sim.json"
# Context manager so the file handle is closed as soon as the JSON is parsed
with open(ZLIB_LEAST_COMPRESSION_COSINE_SIM_DATA_PATH) as f:
    zlib_least_cosine_sim = json.load(f)

### Graphing Embedding Cosine Distance

In [None]:
# Collect the per-file embedding cosine values, one column per compression method
cosine_similarity_by_method = {
    'Base Compression (ChatGPT-4)': gpt4_file_name_to_base_decompression_cosine_similarity,
    'Lossless Compression (ChatGPT-4)': gpt4_file_name_to_lossless_decompression_cosine_similarity,
    'Semantic Compression (ChatGPT-4)': gpt4_file_name_to_semantic_decompression_cosine_similarity,
    'Lossless Compression (ChatGPT-3.5)': gpt35_file_name_to_lossless_decompression_cosine_similarity,
    'Semantic Compression (ChatGPT-3.5)': gpt35_file_name_to_semantic_decompression_cosine_similarity,
    'Zlib Most Compression': zlib_most_cosine_sim,
    'Zlib Least Compression': zlib_least_cosine_sim,
}
combined_decompression_cosine_similarity_df = pd.DataFrame(cosine_similarity_by_method)

In [None]:
# Grouped bar chart of the decompression cosine similarity: one group per text, one bar per method
px.bar(
    combined_decompression_cosine_similarity_df,
    title='Decompression Embedding Cosine Similarity',
    labels={
        'value': 'Cosine Similarity',
        'index': 'Text',
        # Wide-form input: the legend title defaults to "variable" unless relabelled,
        # so name it like the other grouped plots in this notebook
        'variable': 'Compression Method'},
    barmode='group')

In [None]:
# Mean embedding cosine value per compression method, averaged over all texts.
# The result is a Series indexed by compression method, despite the `_df` suffix.
transposed_cosine_similarity_df = combined_decompression_cosine_similarity_df.T.mean(axis=1)

In [None]:
# Bar chart of the mean decompression cosine similarity, one bar per compression method
averaged_cosine_labels = {
    'value': 'Decompression Cosine Similarity',
    'index': 'Compression Method'}
fig = px.bar(
    transposed_cosine_similarity_df,
    labels=averaged_cosine_labels,
    text=transposed_cosine_similarity_df.values.round(3),
    color=transposed_cosine_similarity_df.index,
    title='Averaged Decompression Cosine Similarity')
# Order the bars from the largest to the smallest value
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

# Relational Analysis

### Exact Reconstruction Effectiveness

In [None]:
# Exact-reconstruction score per compression method:
#     1 - ln(mean compression ratio) * (1 - mean normalised edit distance)
# (1 - edit distance) equals 1 for an exact reconstruction and shrinks as the distance grows.
# ln(ratio) is negative for ratios in (0, 1), so subtracting the product from 1 keeps the score positive.
# NOTE(review): for a fixed edit distance the score gets smaller as the ratio approaches 1,
# i.e. stronger compression lowers the score — confirm this is the intended direction.
# NOTE(review): np.log yields -inf / NaN for ratios <= 0 — confirm no method has a
# non-positive mean compression ratio.
log_compression_ratio = np.log(transposed_ratio_df)
reconstruction_exactness = 1 - transposed_edit_distance_df
exact_effectiveness_df = 1 - (log_compression_ratio * reconstruction_exactness)

In [None]:
# Normalise by the largest score so the best-scoring method has value 1
best_exact_score = exact_effectiveness_df.max()
exact_effectiveness_df = exact_effectiveness_df / best_exact_score

In [None]:
# Bar chart of the relative exact-reconstruction score, one bar per compression method
exact_effectiveness_labels = {
    'value': 'Relative Exact Compression Effectiveness',
    'index': 'Compression Method'}
fig = px.bar(
    exact_effectiveness_df,
    labels=exact_effectiveness_labels,
    text=exact_effectiveness_df.values.round(3),
    color=exact_effectiveness_df.index,
    title='Relative Exact Compression Effectiveness')
# Order the bars from the largest to the smallest value
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

### Semantic Reconstruction Effectiveness

In [None]:
# Semantic-reconstruction score per compression method:
# mean compression ratio times mean embedding cosine value (aligned on the method index).
# Both factors are meant to be maximised.
semantic_effectiveness_df = transposed_ratio_df.mul(transposed_cosine_similarity_df)

In [None]:
# Normalise by the largest score so the best-scoring method has value 1
best_semantic_score = semantic_effectiveness_df.max()
semantic_effectiveness_df = semantic_effectiveness_df / best_semantic_score

In [None]:
# Bar chart of the relative semantic-reconstruction score, one bar per compression method
semantic_effectiveness_labels = {
    'value': 'Relative Semantic Compression Effectiveness',
    'index': 'Compression Method'}
fig = px.bar(
    semantic_effectiveness_df,
    labels=semantic_effectiveness_labels,
    text=semantic_effectiveness_df.values.round(3),
    color=semantic_effectiveness_df.index,
    title='Relative Semantic Compression Effectiveness')
# Order the bars from the largest to the smallest value
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()