# Setup

In [None]:
from dotenv import load_dotenv


# Load the variables defined in the local .env file into the process
# environment (this is where the OpenAI API key is expected to live).
# override=True lets values from the file replace variables that are already set.
ENV_FILE_PATH = ".env"
load_dotenv(dotenv_path=ENV_FILE_PATH, override=True)

In [None]:
import json
import numpy as np
from typing import Dict, Tuple
import os
import zlib
import plotly.express as px
import pandas as pd
import logging
from collections import defaultdict
import concurrent.futures as cf

%load_ext autoreload
%autoreload 2

import utils.functions as functions
from utils.openai_handler import OpenAIHandler

In [None]:
# Pass the OpenAI API key from the environment to the handler.
# NOTE(review): the lookup yields None when OPENAI_API_KEY is unset and nothing
# here validates it — confirm how OpenAIHandler.set_api_key treats None.
OpenAIHandler.set_api_key(os.environ.get("OPENAI_API_KEY"))

# Configure the root logger to emit everything from DEBUG level upwards.
logging.basicConfig(level=logging.DEBUG)

In [None]:
# Read every literary excerpt into memory, keyed by its file name.
# Entries that do not end in ".txt" are skipped with a warning.
LITERARY_TEXT_DIR = 'text_data/literary'
file_name_to_original_text: Dict[str, str] = {}
for f_name in os.listdir(LITERARY_TEXT_DIR):
    if not f_name.endswith('.txt'):
        logging.warning(f'File {f_name} is not a text file. Skipping.')
        continue
    # Decode explicitly as UTF-8 rather than with the platform default: the
    # excerpts contain curly quotes and dashes, which a non-UTF-8 locale
    # (e.g. cp1252 on Windows) would mis-decode.
    # NOTE(review): assumes the corpus files are stored as UTF-8 — confirm.
    with open(os.path.join(LITERARY_TEXT_DIR, f_name), 'r', encoding='utf-8') as f:
        file_name_to_original_text[f_name] = f.read()

# Compressing with ChatGPT-4

### Meta Compression Prompt

### Supplied Prompt

Note: this isn't the exact prompt, as the texts were broken up into multiple queries to accommodate the token limit.


**Prompt**
> Compress the following text into the smallest possible character representation. The resulting compressed text does not need to be human readable and only needs to be able to be reconstructed with a different ChatGPT-4 model. 



#### Text 1: a_good_man_is_hard_to_find.txt

**Original Text**

> THE GRANDMOTHER didn't want to go to Florida. She wanted to visit some of her connections in east Tennessee and she was seizing at every chance to change Bailey's mind. Bailey was the son she lived with, her only boy. He was sitting on the edge of his chair at the table, bent over the orange sports section of the Journal. "Now look here, Bailey," she said, "see here, read this," and she stood with one hand on her thin hip and the other rattling the newspaper at his bald head. "Here this fellow that calls himself The Misfit is aloose from the Federal Pen and headed toward Florida and you read here what it says he did to these people. Just you read it. I wouldn't take my children in any direction with a criminal like that aloose in it. I couldn't answer to my conscience if I did."
Bailey didn't look up from his reading so she wheeled around then and faced the children's mother, a young woman in slacks, whose face was as broad and innocent as a cabbage and was tied around with a green head-kerchief that had two points on the top like rabbit's ears. She was sitting on the sofa, feeding the baby his apricots out of a jar. "The children have been to Florida before," the old lady said. "You all ought to take them somewhere else for a change so they would see different parts of the world and be broad. They never have been to east Tennessee."

**Compressed Text**
> Gmdn'twtg2FL,wntdETNvst,chngeB'smnd.Bsn,livewth,prststionMisfit,escpdFedPen. Bnognr,tlks2kids'mom abtchng&brdnngxp.

**Decompressed Text**
> Godmother didn't want to go to Florida, she wanted to see the East Tennessee view and change Bailey's mind. But in the end, she had to live with the persistent Misfit, an escaped federal penitentiary prisoner. Back on the road again, she talks to the children's mother about change and the burden of experience.


#### Text 2: break_it_down.txt

**Original Text**

> Though of course that wasn’t all that went on, because we were together almost all day long. She would keep looking at me and every time she looked at me it was worth something, and she smiled at me and didn’t stop talking and singing, something I said, she would sail into it, a snatch, for me, she would be gone from me a little ways but smiling too, and tell me jokes, and I loved it but didn’t exactly know what to do about it and just smiled back at her and felt slow next to her, just not quick enough. So she talked and touched me on the shoulder and the arm, she kept touching and stayed close to me. You’re with each other all day long and it keeps happening, the touches and smiles, and it adds up, it builds up, and you know where you’ll be that night, you’re talking and every now and then you think about it, no, you don’t think, you just feel it as a kind of destination, what’s coming up after you leave wherever you are all evening and you’re happy about it and you’re planning it all, not in your head, really, somewhere inside your body, or all through your body, it’s all mounting up and coming together so that when you get in bed you can’t help it, it’s a real performance, it all pours out, but slowly, you go easy until you can’t anymore, or you hold back the whole time, you hold back and touch the edges of everything, you edge around until you have to plunge in and finish it off, and when you’re finished, you’re too weak to stand but after a while you have to go to the bathroom and you stand, your legs are trembling, you hold the doorways, there’s a little light coming in through the window, you can see your way in and out, but you can’t really see the bed.

**Compressed Text**
> Wtghralldylng,Lkng@me,smilng,singng,jkng,IsmileBck,slownxt2her.Shetlks,touchesme,clse2me.Tchs&smilesaddup,bildup,nightplns,bodyplns,prfrmnce,slweasy,holdbck,edgearnd,plunge,finish,weaknss,trmblnglegs,smllight,bthrm.

**Decompressed Text**
> Watching her all day long, looking at me, smiling, singing, joking, I smile back, slowly next to her. She talks, touches me, close to me. Touches and smiles add up, build up, night plans, body plans, performance, slow and easy, hold back, edge around, plunge, finish, weakness, trembling legs, small light, bathroom.


#### Text 3: cat_person.txt

**Original Text**

> Flirting with her customers was a habit she’d picked up back when she worked as a barista, and it helped with tips. She didn’t earn tips at the movie theatre, but the job was boring otherwise, and she did think that Robert was cute. Not so cute that she would have, say, gone up to him at a party, but cute enough that she could have drummed up an imaginary crush on him if he’d sat across from her during a dull class—though she was pretty sure that he was out of college, in his mid-twenties at least. He was tall, which she liked, and she could see the edge of a tattoo peeking out from beneath the rolled-up sleeve of his shirt. But he was on the heavy side, his beard was a little too long, and his shoulders slumped forward slightly, as though he were protecting something.

**Compressed Text**
> Flrtng4tips,barista,movieTh8r,noTips,RobertCute,talk@party,imaginaryCrush,dullClass,mid20s,tall,liked,tattooEdge,rolledSlv,heavySide,beard2long,slumpedShldrs,prtctngSthng.

**Decompressed Text**
>  Flirting for tips, the barista at the movie theater doesn't get tips. Robert is cute, we talk at a party, and I have an imaginary crush on him. In a dull class, he's in his mid-20s, tall, liked, with a tattoo edging out from his rolled-up sleeve. He's on the heavy side, with a beard that's too long, and slumped shoulders as if protecting something.


#### Text 4: cathedral.txt

**Original Text**

> This blind man, an old friend of my wife’s, he was on his way to
spend the night. His wife had died. So he was visiting the dead wife’s
relatives in Connecticut. He called my wife from his in-law’s. Arrangements
were made. He would come by train, a five-hour trip, and my wife would
meet him at the station. She hadn’t seen him since she worked for him one
summer in Seattle ten years ago. But she and the blind man had kept in
touch. They made tapes and mailed them back and forth. I wasn’t
enthusiastic about his visit. He was no one I knew. And his being blind
bothered me. My idea of blindness came from the movies. In the movies, the
blind moved slowly and never laughed. Sometimes they were led by seeingeye dogs. A blind man in my house was not something I looked forward to.
That summer in Seattle she had needed a job. She didn’t have any
money. The man she was going to marry at the end of the summer was in
officers’ training school. He didn’t have any money, either. But she was in
love with the guy, and he was in love with her, etc. She’d seen something in
the paper: HELP WANTED—Reading to Blind Man, and a telephone
number. She phoned and went over, was hired on the spot. She worked with
this blind man all summer. She read stuff to him, case studies, reports, that
sort of thing. She helped him organize his little office in the county socialservice department. They’d become good friends, my wife and the blind
man. On her last day in the office, the blind man asked if he could touch her
face. She agreed to this. She told me he touched his fingers to every part of
her face, her nose—even her neck! She never forgot it. She even tried to
write a poem about it. She was always trying to write a poem. She wrote a
poem or two every year, usually after something really important had happened to her.

**Compressed Text**
> BlndMan,wifeFrnd,visit,wifeDied,CTrelatives,5hrTrain,arrangements,10yrsAgo,Seattle,keptInTouch,tapes,notEnthusiastic,blindness,movies,noLaugh,seeingeyeDogs,uncomfortable,neededJob,summerLove,officersTraining,noMoney,jobAd,reading2blind,hired,workSocialService,goodFrnds,lastDay,touchFace,neck,memorable,poemAttempt,importantEvents.

**Decompressed Text**
> A blind man, who was my wife's friend, came to visit after his wife died. He was visiting Connecticut relatives and took a 5-hour train to make arrangements. We met 10 years ago in Seattle and kept in touch through tapes. He was not enthusiastic about his blindness, and we talked about movies, not laughing, and seeing-eye dogs. It was uncomfortable, and I needed a job. We reminisced about our summer love, my time in officer's training, having no money, and a job ad for reading to the blind. He got hired and worked in social service. We became good friends, and on the last day, he touched my face and neck in a memorable moment. I attempted to write a poem about the important events in our lives.


#### Text 5: flowers_for_algernon.txt

**Original Text**

> I had a test today. I think I faled it. and I think that maybe now they wont use me. What happind is a nice young man was in the room and he had some white cards with ink spillled all over them. He sed Charlie what do you see on this card. I was very skared even tho I had my rabits foot in my pockit because when I was a kid I always faled tests in school and I spillled ink to.
I told him I saw a inkblot. He said yes and it made me feel good. I thot
that was all but when I got up to go he stopped me. He said now sit down
Charlie we are not thru yet. Then I dont remember so good but he wan tid me
to say what was in the ink. I dint see nuthing in the ink but he said there was
picturs there other pepul saw some picturs. I coudnt see any picturs. I reely
tryed to see. I held the card close up and then far away. Then I said if 1 had
my glases I coud see better I usally only ware my glases in the movies or TV
but 1 said they are in the closh in the hall. I got them. Then I said let me see
that card agen I bet III find it now. 

**Compressed Text**
> IAtst2dy.thnkFald&nwntUs.WhtHppndNceYngMnRmWhtCardsInkSpld.SdCrliCWht?SkrdRabtFtPkt.KdFldTsts&SpilInk2.SwInkblot.YsFeltGd.ThotAlGtUpStpd.SdNtThru.~RmbrWntdCInk.DntCnPic.OthrsCpics.CntCpics.TryedClsUp/Far.GlsesMovTV.CloshHall.GtGlses.SdCrdAgnBtFndNw.

**Decompressed Text**
> I attempted to study but I think I failed and now want to understand. What happened to the once young man in the room with the cards and ink spilled? He said, "Charlie, can you see what?" Scared, I grabbed a rabbit's foot from my pocket. The kid failed the tests and spilled ink too. I saw an inkblot. Yes, I felt good. I thought we all got up and stopped. Said not through yet. I remember wanting to see the ink. I don't see any pictures. Others can see pictures. I can't see pictures. I tried to clear up close and far. Glasses move on the TV. I closed the hall and got my glasses. I saw the card again but couldn't find anything new.


#### Text 6: sticks.txt

**Original Text**

> We left home, married,  had children of our own, found the seeds of meanness blooming also within us. Dad began dressing the pole with more complexity and less discernible logic. He draped some kind of fur over it on Groundhog Day and lugged out a floodlight to ensure a shadow. When an earthquake struck Chile he lay the pole on its side and spray painted a rift in the earth. Mom died and he dressed the pole as Death and hung from the crossbar photos of Mom as a baby. We'd stop by and find odd talismans from his youth arranged around the base: army medals, theater tickets, old sweatshirts, tubes of Mom's makeup. One autumn he painted the pole bright yellow. He covered it with cotton swabs that winter for warmth and provided offspring by hammering in six crossed sticks around the yard. He ran lengths of string between the pole and the sticks, and taped to the string letters of apology, admissions of error, pleas for understanding, all written in a frantic hand on index cards. He painted a sign saying LOVE and hung it from the pole and another that said FORGIVE? and then he died in the hall with the radio on and we sold the house to a young couple who yanked out the pole and the sticks and left them by the road on garbage day.

**Compressed Text**
> LftHome,maried,kids,meanessInUs.DadDressPoleCmplx&LessLogic.FurGroundhogDay,light4shadow.EarthquakeChile,poleSide,sprayRift.MomDied,poleDeath,photosMomBaby.OddTalismansBase:armyMedals,tickets,sweatshirts,MomMkup.AutumnYellowPole,CottonSwabsWntr,6crossedSticksYard,String&lettersApology,Error,PleasFranticIdxCards.SignLOVE&FORGIVE?DadDiedHallRadioOn,HouseSold,NewCoupleYankPoleSticks,GarbageDay.

**Decompressed Text**
> I left home, got married, had kids, and there was meanness in us. My dad dressed the pole with complexity and less logic. For Groundhog Day, he added a light for the shadow. There was an earthquake in Chile, so he attached something to the pole's side and sprayed the rift. When my mom died, he added a symbol for death and photos of my mom with a baby. Odd talismans at the base included army medals, tickets, sweatshirts, and my mom's makeup. The pole had an autumn yellow theme and cotton swabs for winter. He placed six crossed sticks in the yard and attached string and letters of apology, error, and frantic pleas on index cards. Did he want a sign of love and forgiveness? My dad died with the hall radio on, the house was sold, and the new couple yanked the pole and sticks for garbage day.


#### Text 7: symbols_and_signs.txt

**Original Text**

> For the fourth time in as many years, they were confronted with the problem of what birthday present to take to a young man who was incurably deranged in his mind. Desires he had none. Man-made objects were to him either hives of evil, vibrant with a malignant activity that he alone could perceive, or gross comforts for which no use could be found in his abstract world. After eliminating a number of articles that might offend him or frighten him (anything in the gadget line, for instance, was taboo), his parents chose a dainty and innocent trifle—a basket with ten different fruit jellies in ten little jars.

**Compressed Text**
> 4thTimeInYrs,BdayPrsnt4YoungDerangedMan.DesiresNone.ManMadeObjEvilOrNoUseAbstrctWrld.EliminatGadget&Offensive,FrightenItems.ParentsPickInnocentTrifle:Basket10FruitJellies10Jars.

**Decompressed Text**
> For the fourth time in years, it was a birthday present for a young, deranged man who desired nothing. Man-made objects were either evil or had no use in his abstract world. He sought to eliminate gadgets and offensive, frightening items. His parents chose an innocent trifle as a gift: a basket of 10 fruit jellies in 10 jars.


#### Text 8: the_bogey_beast.txt - TODO: Redo

**Original Text**

> There was once a woman who was very, very cheerful, though she had little to make her so; for she was old, and poor, and lonely. She lived in a little bit of a cottage and earned a scant living by running errands for her neighbours, getting a bite here, a sup there, as reward for her services. So she made shift to get on, and always looked as spry and cheery as if she had not a want in the world.
Now one summer evening, as she was trotting, full of smiles as ever, along the high road to her hovel, what should she see but a big black pot lying in the ditch!
"Goodness me!" she cried, "that would be just the very thing for me if I only had something to put in it! But I haven't! Now who could have left it in the ditch?"
And she looked about her expecting the owner would not be far off; but she could see nobody.

**Compressed Text**
> WmnCheerful,OldPoorLonely.SmCottage,Errands4Neighbors,Scraps2Live.HighRoad,BlackPotDitch,ThinksUseful.NoOwner,Hole?TakesHome,FullGoldCoins.Amazed,FeelsRich.Heavy,DragsShawl.Go-cart.Ideas2SpendGold.Tired,Rests,SilverNow!SilverLuck.LessTrouble,StillRich.Plans2SpendSilver.Tired,Rests,IronNow!IronLuck.Sells4PennyPieces,StillRich.Plans4PennyPieces.Tired,Rests,StoneNow!StoneLuck.GateStopper.Happy. StoneJumps,Grows,Bogey-BeastLaughs.Disappears.WmnLaughs,FeelsLucky&GRAND.ChucklesGoodLuck.

**Decompressed Text**
> A cheerful woman, old, poor, and lonely, lived in a small cottage. She did errands for neighbors and survived on scraps. One day, on the high road, she found a black pot in a ditch and thought it might be useful. Seeing no owner and noticing a hole, she took it home, only to discover it was full of gold coins. Amazed and feeling rich, she dragged the heavy pot with her shawl, eventually using a go-cart to help. She had ideas on how to spend the gold, but feeling tired, she rested. When she awoke, the gold had turned to silver. Thinking it was silver luck, she felt it was less trouble and still made her rich. She made plans to spend the silver but got tired and rested again. This time, the silver turned into iron. Deciding it was iron luck, she sold it for penny pieces, still feeling rich. She made plans for the penny pieces, but once again got tired and rested. Upon waking, she found the pot filled with stones. She thought it was stone luck and used a stone as a gate stopper. Happy, the stone suddenly jumped and grew into a bogey-beast that laughed and disappeared. The woman laughed too, feeling lucky and grand, and chuckled at her good luck.


#### Text 9: the_lottery.txt - TODO: Redo this one

**Original Text**

> The morning of June 27th was clear and sunny, with the fresh warmth of a full-summer day; the flowers were blossoming profusely and the grass was richly green. The people of the village began to gather in the square, between the post office and the bank, around ten o’clock; in some towns there were so many people that the lottery took two days and had to be started on June 20th, but in this village, where there were only about three hundred people, the whole lottery took less than two hours, so it could begin at ten o’clock in the morning and still be through in time to allow the villagers to get home for noon dinner.
The children assembled first, of course. School was recently over for the summer, and the feeling of liberty sat uneasily on most of them; they tended to gather together quietly for a while before they broke into boisterous play, and their talk was still of the classroom and the teacher, of books and reprimands. Bobby Martin had already stuffed his pockets full of stones, and the other boys soon followed his example, selecting the smoothest and roundest stones; Bobby and Harry Jones and Dickie Delacroix—the villagers pronounced this name “Dellacroy”—eventually made a great pile of stones in one corner of the square and guarded it against the raids of the other boys. The girls stood aside, talking among themselves, looking over their shoulders at the boys, and the very small children rolled in the dust or clung to the hands of their older brothers or sisters.


**Compressed Text**
> June27MornClearSunny,FullSummer,VillageGatherSquare10am,300Ppl,Lottery2hrs,Home4NoonDinner.KidsAssemble1st,SchoolOut,DiscussClass&Teacher,BoysCollectStones,Pile&Guard,GirlsTalk,SmallKidsPlay.MenGather,DiscussFarm&Taxes,WomenJoin,Greet&Gossip,CallKids,BobbyMartinResists,JoinsFamily.MrSummersConductsLottery,CoalBiz,Childless,ScoldWife,BlackBoxArrives,MrGraves3LeggedStool,BoxOnStool,DistanceKept,MrMartin&BaxterHelpHold,MrSummersStirsPapers.BlackBoxOld,Shabby,Tradition,NotReplaced,OrigWoodFaded&Stained.

**Decompressed Text**
> On June 27th, the morning was clear and sunny, with the full summer in effect. The village gathered in the square at 10 am, with 300 people attending the lottery, which lasted two hours before they went home for a noon dinner. The kids assembled first, as school was out, discussing their classes and teachers. Boys collected stones and piled them up, guarding their collection, while the girls talked and the small kids played. Men gathered, discussing farming and taxes, and the women joined in, greeting each other and gossiping. They called the kids, and Bobby Martin resisted at first but eventually joined his family. Mr. Summers conducted the lottery, owning a coal business and being childless, often scolding his wife. The black box arrived, placed on Mr. Graves' three-legged stool. People kept their distance as Mr. Martin and Mr. Baxter helped hold the box. Mr. Summers stirred the papers inside. The black box was old and shabby, a tradition not replaced, with its original wood faded and stained.


#### Text 10: the_veldt.txt

**Original Text**

> “George, I wish you’d look at the nursery.”
“What’s wrong with it?”
“I don’t know.”
“Well, then.”
“I just want you to look at it, is all, or call a psychologist in to look at it.”
“What would a psychologist want with a nursery?”
“You know very well what he’d want.” His wife paused in the middle of the kitchen
and watched the stove busy humming to itself, making supper for four.
“It’s just that the nursery is different now than it was.”
“All right, let’s have a look.”
They walked down the hall of their soundproofed Happylife Home, which had cost
them thirty thousand dollars installed, this house which clothed and fed and rocked
them to sleep and played and sang and was good to them. Their approach sensitized a
switch somewhere and the nursery light flicked on when they came within ten feet of
it. Similarly, behind them, in the halls, lights went on and off as they left them behind,
with a soft automaticity.
“Well,” said George Hadley. 

**Compressed Text**
> George,LookNursery,WhatsWrong,DontKnow,WifeWantsCheckOrPsychologist,StoveMakingSupper,HappylifeHome,30k,NurseryLightOnApproach,HallLightsAutoOnOff.

**Decompressed Text**
> George was asked to look at the nursery, but he didn't know what was wrong. His wife wanted him to check it out or consult a psychologist. Meanwhile, the stove was making supper in their HappyLife Home, which cost them $30,000. The nursery light turned on as they approached, and the hall lights automatically turned on and off.


# Data Analysis

### Loading text data

In [None]:
# Directory whose files are read as the original (uncompressed) texts.
LITERARY_TEXT_DIR_PATH: str = "text_data/literary"

In [None]:
# Load every literary excerpt into memory, keyed by its file name.
# Entries that do not end in ".txt" (hidden files, sub-directories, ...) are
# skipped with a warning so they cannot crash the loop or pollute the mapping.
file_name_to_original_text: Dict[str, str] = {}
for file_name in os.listdir(LITERARY_TEXT_DIR_PATH):
    if not file_name.endswith('.txt'):
        logging.warning(f'File {file_name} is not a text file. Skipping.')
        continue
    # Decode explicitly as UTF-8 instead of the platform default; the excerpts
    # contain curly quotes and dashes that a non-UTF-8 locale would mis-decode.
    # NOTE(review): assumes the corpus files are stored as UTF-8 — confirm.
    with open(os.path.join(LITERARY_TEXT_DIR_PATH, file_name), 'r', encoding='utf-8') as f:
        file_name_to_original_text[file_name] = f.read()

In [None]:
# ChatGPT-4 "compressed" representation of each excerpt, keyed by source file name.
# The strings are experiment data and must be kept exactly as the model produced them.
file_name_to_chatGPT4_compressed_text: Dict[str, str] = {
    "a_good_man_is_hard_to_find.txt": "Gmdn'twtg2FL,wntdETNvst,chngeB'smnd.Bsn,livewth,prststionMisfit,escpdFedPen. Bnognr,tlks2kids'mom abtchng&brdnngxp.",
    "break_it_down.txt": "Wtghralldylng,Lkng@me,smilng,singng,jkng,IsmileBck,slownxt2her.Shetlks,touchesme,clse2me.Tchs&smilesaddup,bildup,nightplns,bodyplns,prfrmnce,slweasy,holdbck,edgearnd,plunge,finish,weaknss,trmblnglegs,smllight,bthrm.",
    "cat_person.txt": "Flrtng4tips,barista,movieTh8r,noTips,RobertCute,talk@party,imaginaryCrush,dullClass,mid20s,tall,liked,tattooEdge,rolledSlv,heavySide,beard2long,slumpedShldrs,prtctngSthng.",
    "cathedral.txt": "BlndMan,wifeFrnd,visit,wifeDied,CTrelatives,5hrTrain,arrangements,10yrsAgo,Seattle,keptInTouch,tapes,notEnthusiastic,blindness,movies,noLaugh,seeingeyeDogs,uncomfortable,neededJob,summerLove,officersTraining,noMoney,jobAd,reading2blind,hired,workSocialService,goodFrnds,lastDay,touchFace,neck,memorable,poemAttempt,importantEvents.",
    "flowers_for_algernon.txt": "IAtst2dy.thnkFald&nwntUs.WhtHppndNceYngMnRmWhtCardsInkSpld.SdCrliCWht?SkrdRabtFtPkt.KdFldTsts&SpilInk2.SwInkblot.YsFeltGd.ThotAlGtUpStpd.SdNtThru.~RmbrWntdCInk.DntCnPic.OthrsCpics.CntCpics.TryedClsUp/Far.GlsesMovTV.CloshHall.GtGlses.SdCrdAgnBtFndNw.",
    "sticks.txt": "LftHome,maried,kids,meanessInUs.DadDressPoleCmplx&LessLogic.FurGroundhogDay,light4shadow.EarthquakeChile,poleSide,sprayRift.MomDied,poleDeath,photosMomBaby.OddTalismansBase:armyMedals,tickets,sweatshirts,MomMkup.AutumnYellowPole,CottonSwabsWntr,6crossedSticksYard,String&lettersApology,Error,PleasFranticIdxCards.SignLOVE&FORGIVE?DadDiedHallRadioOn,HouseSold,NewCoupleYankPoleSticks,GarbageDay.",
    "symbols_and_signs.txt": "4thTimeInYrs,BdayPrsnt4YoungDerangedMan.DesiresNone.ManMadeObjEvilOrNoUseAbstrctWrld.EliminatGadget&Offensive,FrightenItems.ParentsPickInnocentTrifle:Basket10FruitJellies10Jars.",
    "the_bogey_beast.txt": "Old poor cheerful woman finds pot in ditch, wonders who left it. ",
    "the_lottery.txt": "June 27, sunny, villagers gather for lottery, takes <2hr. Children assemble, stones collected, girls watch",
    "the_veldt.txt": "George,LookNursery,WhatsWrong,DontKnow,WifeWantsCheckOrPsychologist,StoveMakingSupper,HappylifeHome,30k,NurseryLightOnApproach,HallLightsAutoOnOff.",
}

In [None]:
# ChatGPT-4 reconstruction ("decompression") of each compressed string, keyed by
# source file name. Each entry must correspond to the compressed string stored
# under the same key in file_name_to_chatGPT4_compressed_text.
#
# Fix: the 'the_veldt.txt' entry previously held a reconstruction of The Lottery,
# and 'the_lottery.txt' still held the reconstruction of the earlier, longer
# compressed string. The short lottery reconstruction now sits under
# 'the_lottery.txt' (matching the redone compressed string) and 'the_veldt.txt'
# holds the veldt reconstruction recorded in the notebook's markdown section.
file_name_to_chatGPT4_decompressed_text: Dict[str, str] = {
    'a_good_man_is_hard_to_find.txt': "Godmother didn't want to go to Florida, she wanted to see the East Tennessee view and change Bailey's mind. But in the end, she had to live with the persistent Misfit, an escaped federal penitentiary prisoner. Back on the road again, she talks to the children's mother about change and the burden of experience.",
    'break_it_down.txt': "Watching her all day long, looking at me, smiling, singing, joking, I smile back, slowly next to her. She talks, touches me, close to me. Touches and smiles add up, build up, night plans, body plans, performance, slow and easy, hold back, edge around, plunge, finish, weakness, trembling legs, small light, bathroom.",
    'cat_person.txt': "Flirting for tips, the barista at the movie theater doesn't get tips. Robert is cute, we talk at a party, and I have an imaginary crush on him. In a dull class, he's in his mid-20s, tall, liked, with a tattoo edging out from his rolled-up sleeve. He's on the heavy side, with a beard that's too long, and slumped shoulders as if protecting something.",
    'cathedral.txt': "A blind man, who was my wife's friend, came to visit after his wife died. He was visiting Connecticut relatives and took a 5-hour train to make arrangements. We met 10 years ago in Seattle and kept in touch through tapes. He was not enthusiastic about his blindness, and we talked about movies, not laughing, and seeing-eye dogs. It was uncomfortable, and I needed a job. We reminisced about our summer love, my time in officer's training, having no money, and a job ad for reading to the blind. He got hired and worked in social service. We became good friends, and on the last day, he touched my face and neck in a memorable moment. I attempted to write a poem about the important events in our lives.",
    'flowers_for_algernon.txt': """I attempted to study but I think I failed and now want to understand. What happened to the once young man in the room with the cards and ink spilled? He said, "Charlie, can you see what?" Scared, I grabbed a rabbit's foot from my pocket. The kid failed the tests and spilled ink too. I saw an inkblot. Yes, I felt good. I thought we all got up and stopped. Said not through yet. I remember wanting to see the ink. I don't see any pictures. Others can see pictures. I can't see pictures. I tried to clear up close and far. Glasses move on the TV. I closed the hall and got my glasses. I saw the card again but couldn't find anything new.""",
    'sticks.txt': "I left home, got married, had kids, and there was meanness in us. My dad dressed the pole with complexity and less logic. For Groundhog Day, he added a light for the shadow. There was an earthquake in Chile, so he attached something to the pole's side and sprayed the rift. When my mom died, he added a symbol for death and photos of my mom with a baby. Odd talismans at the base included army medals, tickets, sweatshirts, and my mom's makeup. The pole had an autumn yellow theme and cotton swabs for winter. He placed six crossed sticks in the yard and attached string and letters of apology, error, and frantic pleas on index cards. Did he want a sign of love and forgiveness? My dad died with the hall radio on, the house was sold, and the new couple yanked the pole and sticks for garbage day.",
    'symbols_and_signs.txt': "For the fourth time in years, it was a birthday present for a young, deranged man who desired nothing. Man-made objects were either evil or had no use in his abstract world. He sought to eliminate gadgets and offensive, frightening items. His parents chose an innocent trifle as a gift: a basket of 10 fruit jellies in 10 jars.",
    'the_bogey_beast.txt': "An elderly woman who is poor yet cheerful discovers a pot in a ditch and is curious about who might have left it there.",
    'the_lottery.txt': "On June 27th, during a sunny day, the villagers come together to participate in the lottery, which takes less than two hours to complete. The children gather and collect stones, while the girls observe the event.",
    'the_veldt.txt': "George was asked to look at the nursery, but he didn't know what was wrong. His wife wanted him to check it out or consult a psychologist. Meanwhile, the stove was making supper in their HappyLife Home, which cost them $30,000. The nursery light turned on as they approached, and the hall lights automatically turned on and off.",
}

### Getting Embeddings

In [None]:
# Embed each original text with OpenAIHandler.get_text_embedding, keyed by file name.
# The dict is filled one entry at a time, so entries computed before a failing
# call remain available.
file_name_to_original_text_embeddings: Dict[str, np.ndarray] = {}
for file_name in file_name_to_original_text:
    original_text = file_name_to_original_text[file_name]
    file_name_to_original_text_embeddings[file_name] = OpenAIHandler.get_text_embedding(original_text)

In [None]:
# Embed each ChatGPT-4 decompressed text with OpenAIHandler.get_text_embedding,
# keyed by file name, so it can be compared with the embedding of the original.
file_name_to_chatGPT4_decompressed_text_embeddings: Dict[str, np.ndarray] = {}
for file_name in file_name_to_chatGPT4_decompressed_text:
    decompressed_text = file_name_to_chatGPT4_decompressed_text[file_name]
    file_name_to_chatGPT4_decompressed_text_embeddings[file_name] = OpenAIHandler.get_text_embedding(decompressed_text)

### Saving GPT-4 Data

In [12]:
# Template for the GPT-4 experiment output files; the `{data_type}` placeholder
# is filled in with str.format (e.g. data_type="compressed_text").
BASE_FILE_PATH: str = "experiment_data/gpt4_{data_type}.json"

In [13]:
# Persist the ChatGPT-4 compressed texts as a JSON object (file name -> compressed string).
GPT4_COMPRESSED_DATA_PATH = BASE_FILE_PATH.format(data_type="compressed_text")
with open(GPT4_COMPRESSED_DATA_PATH, mode='w') as f:
    f.write(json.dumps(file_name_to_chatGPT4_compressed_text))

In [14]:
# Persist the ChatGPT-4 decompressed texts as a JSON object (file name -> decompressed string).
GPT4_DECOMPRESSED_DATA_PATH = BASE_FILE_PATH.format(data_type="decompressed_text")
with open(GPT4_DECOMPRESSED_DATA_PATH, mode='w') as f:
    f.write(json.dumps(file_name_to_chatGPT4_decompressed_text))

In [15]:
# Persist the original-text embeddings as JSON
GPT4_EMBEDDINGS_DATA_PATH = BASE_FILE_PATH.format(data_type="embeddings")
with open(GPT4_EMBEDDINGS_DATA_PATH, mode='w') as out_file:
    # The embeddings are converted with .tolist() so json can serialize them
    json_serializable = {}
    for name, vector in file_name_to_original_text_embeddings.items():
        json_serializable[name] = vector.tolist()
    out_file.write(json.dumps(json_serializable))

In [17]:
# Persist the decompressed-text embeddings as JSON
GPT4_DECOMPRESSED_EMBEDDINGS_DATA_PATH = BASE_FILE_PATH.format(data_type="decompressed_embeddings")
with open(GPT4_DECOMPRESSED_EMBEDDINGS_DATA_PATH, mode='w') as out_file:
    # The embeddings are converted with .tolist() so json can serialize them
    json_serializable = {}
    for name, vector in file_name_to_chatGPT4_decompressed_text_embeddings.items():
        json_serializable[name] = vector.tolist()
    out_file.write(json.dumps(json_serializable))

### Applying zlib (DEFLATE) Compression

In [18]:
# Round-trip every original text through zlib at level=9.
# zlib.compress returns bytes, so the compressed mapping is annotated Dict[str, bytes].
file_name_to_zlib_most_compressed_bytes: Dict[str, bytes] = {}
file_name_to_zlib_most_decompressed_text: Dict[str, str] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_bytes = zlib.compress(original_text.encode('utf-8'), level=9)
    file_name_to_zlib_most_compressed_bytes[file_name] = compressed_bytes
    # Decompress immediately so the reconstructed text can be compared with the original
    file_name_to_zlib_most_decompressed_text[file_name] = zlib.decompress(compressed_bytes).decode('utf-8')

In [19]:
# Round-trip every original text through zlib at level=1.
# zlib.compress returns bytes, so the compressed mapping is annotated Dict[str, bytes].
file_name_to_zlib_least_compressed_bytes: Dict[str, bytes] = {}
file_name_to_zlib_least_decompressed_text: Dict[str, str] = {}
for file_name, original_text in file_name_to_original_text.items():
    compressed_bytes = zlib.compress(original_text.encode('utf-8'), level=1)
    file_name_to_zlib_least_compressed_bytes[file_name] = compressed_bytes
    # Decompress immediately so the reconstructed text can be compared with the original
    file_name_to_zlib_least_decompressed_text[file_name] = zlib.decompress(compressed_bytes).decode('utf-8')

### Computing Entropy

In [20]:
# UTF-8 encode each ChatGPT-4 compressed text so it can be handled as bytes
file_name_to_chatGPT4_compressed_bytes = {}
for name, compressed_text in file_name_to_chatGPT4_compressed_text.items():
    file_name_to_chatGPT4_compressed_bytes[name] = compressed_text.encode('utf-8')

In [21]:
# ChatGPT4 Entropy
# Decode with latin-1 so every byte maps to exactly one character. str(bytes)
# returns the Python repr ("b'...'" with "\x.." escapes), which would make the
# entropy describe the repr text instead of the compressed payload.
# NOTE(review): assumes functions.entropy operates on the characters of a str - confirm.
file_name_to_chatGPT4_compressed_bytes_entropy = {
    file_name: functions.entropy(compressed_bytes.decode('latin-1'))
    for file_name, compressed_bytes in file_name_to_chatGPT4_compressed_bytes.items()}

In [22]:
# zlib Most Compressed Entropy
# Decode with latin-1 so every byte maps to exactly one character. str(bytes)
# returns the Python repr ("b'...'" with "\x.." escapes), which would make the
# entropy describe the repr text instead of the compressed payload.
# NOTE(review): assumes functions.entropy operates on the characters of a str - confirm.
file_name_to_zlib_most_compressed_bytes_entropy = {
    file_name: functions.entropy(compressed_bytes.decode('latin-1'))
    for file_name, compressed_bytes in file_name_to_zlib_most_compressed_bytes.items()}

In [23]:
# zlib Least Compressed Entropy
# Decode with latin-1 so every byte maps to exactly one character. str(bytes)
# returns the Python repr ("b'...'" with "\x.." escapes), which would make the
# entropy describe the repr text instead of the compressed payload.
# NOTE(review): assumes functions.entropy operates on the characters of a str - confirm.
file_name_to_zlib_least_compressed_bytes_entropy = {
    file_name: functions.entropy(compressed_bytes.decode('latin-1'))
    for file_name, compressed_bytes in file_name_to_zlib_least_compressed_bytes.items()}

### Saving Entropy Data

In [25]:
# Persist the entropy values of the zlib level-9 output as JSON
ZLIB_MOST_COMPRESSED_DATA_PATH = "experiment_data/zlib_most_compressed_bytes_entropy_sim.json"
with open(ZLIB_MOST_COMPRESSED_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_most_compressed_bytes_entropy))

In [26]:
# Persist the entropy values of the zlib level-1 output as JSON
ZLIB_LEAST_COMPRESSED_DATA_PATH = "experiment_data/zlib_least_compressed_bytes_entropy_sim.json"
with open(ZLIB_LEAST_COMPRESSED_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_least_compressed_bytes_entropy))

### Graphing Entropy

In [27]:
# Collect the entropy results into one table: one row per text, one column per method
combined_df = pd.DataFrame(data={
    'ChatGPT4': file_name_to_chatGPT4_compressed_bytes_entropy,
    'zlib Most Compressed': file_name_to_zlib_most_compressed_bytes_entropy,
    'zlib Least Compressed': file_name_to_zlib_least_compressed_bytes_entropy,
})

# Scale by the single largest value in the whole table (not column by column)
combined_df = combined_df.div(combined_df.max().max())

In [29]:
# Grouped bar chart of the normalized entropy table
entropy_axis_labels = {
    'value': 'Entropy',
    'index': 'Text',
    'variable': 'Compression Method',
}
px.bar(
    combined_df,
    title='Relative Entropy of Compressed Bytes',
    labels=entropy_axis_labels,
    barmode='group',
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Mean of the normalized entropy over all texts, one value per compression method
transposed_df = combined_df.transpose().mean(axis='columns')

In [None]:
# Bar chart of the per-method averaged entropy, annotated with the rounded values
averaged_entropy_labels = {
    'y': 'Relative Entropy',
    'index': 'Compression Method',
}
px.bar(
    transposed_df,
    x=transposed_df.index,
    y=transposed_df.values,
    title='Averaged Entropy of Compressed Bytes',
    labels=averaged_entropy_labels,
    color=transposed_df.index,
    text=transposed_df.values.round(3),
)

### Computing Compression Ratio

In [None]:
# ChatGPT-4: 1 - (compressed byte count / UTF-8 byte count of the original text)
file_name_to_chatGPT4_compression_ratio: Dict[str, float] = {
    name: 1 - (len(file_name_to_chatGPT4_compressed_bytes[name]) / len(text.encode('utf-8')))
    for name, text in file_name_to_original_text.items()
}

In [None]:
# zlib level 9: 1 - (compressed byte count / UTF-8 byte count of the original text)
file_name_to_zlib_most_compression_ratio: Dict[str, float] = {
    name: 1 - (len(file_name_to_zlib_most_compressed_bytes[name]) / len(text.encode('utf-8')))
    for name, text in file_name_to_original_text.items()
}

In [None]:
# zlib level 1: 1 - (compressed byte count / UTF-8 byte count of the original text)
file_name_to_zlib_least_compression_ratio: Dict[str, float] = {
    name: 1 - (len(file_name_to_zlib_least_compressed_bytes[name]) / len(text.encode('utf-8')))
    for name, text in file_name_to_original_text.items()
}

In [None]:
# Persist the zlib level-9 compression ratios as JSON
ZLIB_MOST_COMPRESSION_RATIO_DATA_PATH = "experiment_data/zlib_most_compression_ratio.json"
with open(ZLIB_MOST_COMPRESSION_RATIO_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_most_compression_ratio))

In [None]:
# Persist the zlib level-1 compression ratios as JSON
ZLIB_LEAST_COMPRESSION_RATIO_DATA_PATH = "experiment_data/zlib_least_compression_ratio.json"
with open(ZLIB_LEAST_COMPRESSION_RATIO_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_least_compression_ratio))

### Graphing Compression Ratio

In [None]:
# One row per text, one column per compression method
combined_ratio_df = pd.DataFrame(data={
    'ChatGPT4': file_name_to_chatGPT4_compression_ratio,
    'zlib Most Compressed': file_name_to_zlib_most_compression_ratio,
    'zlib Least Compressed': file_name_to_zlib_least_compression_ratio,
})

In [None]:
# Grouped bar chart of the compression ratio table
ratio_axis_labels = {
    'value': 'Compression Ratio',
    'index': 'Text',
    'variable': 'Compression Method',
}
px.bar(
    combined_ratio_df,
    barmode='group',
    title='Compression Ratio',
    labels=ratio_axis_labels,
)

In [None]:
# Mean compression ratio over all texts, one value per compression method
transposed_ratio_df = combined_ratio_df.transpose().mean(axis='columns')

In [None]:

# Plot the averaged compression ratio, annotated with the rounded values.
# y is passed as a raw array, so its label key is 'y' (the averaged entropy
# chart in this notebook uses the same key); the previous 'value' key did not
# match any plotted column, leaving the y axis without its intended title.
px.bar(
    transposed_ratio_df,
    x=transposed_ratio_df.index,
    y=transposed_ratio_df.values,
    color=transposed_ratio_df.index,
    text=transposed_ratio_df.values.round(3),
    title='Averaged Compression Ratio',
    labels={
        'y': 'Average Compression Ratio',
        'index': 'Compression Method'})

### Computing Edit Distance

In [None]:
# Edit distance between each ChatGPT-4 decompressed text and its original
file_name_to_chatGPT4_compression_edit_distance: Dict[str, float] = {
    name: functions.edit_distance(file_name_to_chatGPT4_decompressed_text[name], text)
    for name, text in file_name_to_original_text.items()
}

In [None]:
# Edit distance between each zlib level-9 decompressed text and its original
file_name_to_zlib_most_compression_edit_distance: Dict[str, float] = {
    name: functions.edit_distance(file_name_to_zlib_most_decompressed_text[name], text)
    for name, text in file_name_to_original_text.items()
}

In [None]:
# Edit distance between each zlib level-1 decompressed text and its original
file_name_to_zlib_least_compression_edit_distance: Dict[str, float] = {
    name: functions.edit_distance(file_name_to_zlib_least_decompressed_text[name], text)
    for name, text in file_name_to_original_text.items()
}

In [None]:
# Persist the zlib level-9 edit distances as JSON
ZLIB_MOST_COMPRESSION_EDIT_DISTANCE_DATA_PATH = "experiment_data/zlib_most_compression_edit_distance.json"
with open(ZLIB_MOST_COMPRESSION_EDIT_DISTANCE_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_most_compression_edit_distance))

In [None]:
# Persist the zlib level-1 edit distances as JSON
ZLIB_LEAST_COMPRESSION_EDIT_DISTANCE_DATA_PATH = "experiment_data/zlib_least_compression_edit_distance.json"
with open(ZLIB_LEAST_COMPRESSION_EDIT_DISTANCE_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_least_compression_edit_distance))

### Graphing Edit Distance

In [None]:
# One row per text, one column per compression method
combined_edit_distance_df = pd.DataFrame(data={
    'ChatGPT4': file_name_to_chatGPT4_compression_edit_distance,
    'zlib Most Compressed': file_name_to_zlib_most_compression_edit_distance,
    'zlib Least Compressed': file_name_to_zlib_least_compression_edit_distance,
})

In [None]:
# Per-method mean of the edit distances, taken before the table is normalized
transposed_edit_distance_df_2 = combined_edit_distance_df.transpose().mean(axis='columns')
# Scale every cell by the single largest value in the whole table
combined_edit_distance_df = combined_edit_distance_df.div(combined_edit_distance_df.max().max())

In [None]:
# Grouped bar chart of the normalized edit distance table
edit_distance_axis_labels = {
    'value': 'Compression Edit Distance',
    'index': 'Text',
}
px.bar(
    combined_edit_distance_df,
    barmode='group',
    title='Compression Edit Distance',
    labels=edit_distance_axis_labels,
)

In [None]:
# Mean of the normalized edit distance over all texts, one value per compression method
transposed_edit_distance_df = combined_edit_distance_df.transpose().mean(axis='columns')

In [None]:
# Plot the per-method averaged edit distance, annotated with the rounded values.
# The title is prefixed with "Averaged" so this chart is distinguishable from
# the per-text edit distance chart and matches the other averaged charts.
px.bar(
    transposed_edit_distance_df,
    title='Averaged Compression Edit Distance',
    color=transposed_edit_distance_df.index,
    text=transposed_edit_distance_df.values.round(3),
    labels={
        'value': 'Edit Distance',
        'index': 'Compression Method'})

### Computing Embedding Cosine Distance

In [None]:
# Compare each ChatGPT-4 decompressed-text embedding with its original-text embedding.
# NOTE(review): the helper is named cosine_distance while the result is stored
# as a similarity - confirm which quantity it returns.
file_name_to_chatGPT4_decompression_cosine_similarity: Dict[str, float] = {
    name: functions.cosine_distance(
        file_name_to_chatGPT4_decompressed_text_embeddings[name],
        file_name_to_original_text_embeddings[name],
    )
    for name in file_name_to_original_text
}

In [None]:
# zlib (most compressed): the round trip is lossless, so the original-text
# embedding is used for both arguments.
# NOTE(review): the helper is named cosine_distance while the result is stored
# as a similarity - confirm which quantity it returns.
file_name_to_zlib_most_decompression_cosine_similarity: Dict[str, float] = {}
for name in file_name_to_original_text:
    original_embedding = file_name_to_original_text_embeddings[name]
    file_name_to_zlib_most_decompression_cosine_similarity[name] = functions.cosine_distance(
        original_embedding, original_embedding)

In [None]:
# zlib (least compressed): the round trip is lossless, so the original-text
# embedding is used for both arguments.
# NOTE(review): the helper is named cosine_distance while the result is stored
# as a similarity - confirm which quantity it returns.
file_name_to_zlib_least_decompression_cosine_similarity: Dict[str, float] = {}
for name in file_name_to_original_text:
    original_embedding = file_name_to_original_text_embeddings[name]
    file_name_to_zlib_least_decompression_cosine_similarity[name] = functions.cosine_distance(
        original_embedding, original_embedding)

In [None]:
# Persist the zlib level-9 cosine results as JSON
ZLIB_MOST_COMPRESSION_COSINE_SIM_DATA_PATH = "experiment_data/zlib_most_cosine_sim.json"
with open(ZLIB_MOST_COMPRESSION_COSINE_SIM_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_most_decompression_cosine_similarity))

In [None]:
# Persist the zlib level-1 cosine results as JSON
ZLIB_LEAST_COMPRESSION_COSINE_SIM_DATA_PATH = "experiment_data/zlib_least_cosine_sim.json"
with open(ZLIB_LEAST_COMPRESSION_COSINE_SIM_DATA_PATH, mode='w') as out_file:
    out_file.write(json.dumps(file_name_to_zlib_least_decompression_cosine_similarity))

### Graphing Embedding Cosine Distance

In [None]:
# One row per text, one column per compression method
combined_cosine_similarity_df = pd.DataFrame(data={
    'ChatGPT4': file_name_to_chatGPT4_decompression_cosine_similarity,
    'zlib Most Compressed': file_name_to_zlib_most_decompression_cosine_similarity,
    'zlib Least Compressed': file_name_to_zlib_least_decompression_cosine_similarity,
})

In [None]:
# Grouped bar chart of the cosine results table
cosine_axis_labels = {
    'value': 'Cosine Similarity',
    'index': 'Text',
}
px.bar(
    combined_cosine_similarity_df,
    barmode='group',
    title='Decompression Embedding Cosine Similarity',
    labels=cosine_axis_labels,
)

In [None]:
# Mean cosine result over all texts, one value per compression method
transposed_cosine_similarity_df = combined_cosine_similarity_df.transpose().mean(axis='columns')

In [None]:
# Bar chart of the per-method averaged cosine result, annotated with the rounded values
averaged_cosine_labels = {
    'value': 'Decompression Cosine Similarity',
    'index': 'Compression Method',
}
px.bar(
    transposed_cosine_similarity_df,
    labels=averaged_cosine_labels,
    title='Averaged Decompression Cosine Similarity',
    text=transposed_cosine_similarity_df.values.round(3),
    color=transposed_cosine_similarity_df.index,
)