In [4]:
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
def generate_ai_essays():
    import openai

    openai.organization = os.getenv("OPENAI_ORG")
    openai.api_key = os.getenv("OPENAI_API_KEY")

    prompt_dir = Path("./data/prompts/").absolute()
    essay_dir = Path("./data/essays/").absolute()

    generated_essays = {}

    for prompt_file_path in prompt_dir.iterdir():
        prompt_number = int(
            prompt_file_path.name.removeprefix("prompt_").removesuffix(".txt")
        )

        if (essay_dir / f"gpt_{prompt_number}.txt").exists():
            essay_file_path = essay_dir / f"gpt_{prompt_number}.txt"

            with essay_file_path.open("r") as essay_file:
                essay = essay_file.read().replace("\n", "")
                generated_essays[prompt_number] = essay

        else:
            with prompt_file_path.open("r") as prompt_file:
                prompt = prompt_file.read().replace("\n", "")

                generated_essays[prompt_number] = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo", 
                    # max_tokens=2048, # defaults to infinity
                    messages=[
                        {"role": "system", "content":"You are an essay writer."},
                        {"role": "user", "content": prompt},
                    ]
                )

    return generated_essays


generated_essays = generate_ai_essays()

essay_dir = Path("./data/essays/").absolute()

for number, message in generated_essays.items():
    essay_file = essay_dir / f"gpt_{number}.txt"
    essay_file.write_text(message.choices[0].message.content)

In [59]:
import openai, asyncio

sem = asyncio.Semaphore(30)


async def create_chat_completion(prompt: str):
    async with sem:
        completion =  await openai.ChatCompletion.acreate(
            model="gpt-3.5-turbo",
            # max_tokens=2048, # defaults to infinity
            messages=[
                {"role": "system", "content": "You are an essay writer."},
                {"role": "user", "content": prompt},
            ],
        )

        await asyncio.sleep(1)
        return completion


async def generate_essays_async():
    openai.organization = os.getenv("OPENAI_ORG")
    openai.api_key = os.getenv("OPENAI_API_KEY")

    prompt_dir = Path("./data/prompts/").absolute()
    essay_dir = Path("./data/essays/").absolute()

    prompts = {}

    for prompt_file_path in prompt_dir.iterdir():
        prompt_number = int(
            prompt_file_path.name.removeprefix("prompt_").removesuffix(".txt")
        )

        with prompt_file_path.open("r") as prompt_file:
            prompt = prompt_file.read().replace("\n", "")

            prompts[prompt_number] = prompt

    generated_essays = await asyncio.gather(
        *[
            asyncio.ensure_future(create_chat_completion(prompt))
            for prompt in prompts.values()
        ]
    )

    for number, message in enumerate(generated_essays):
        essay_file = essay_dir / f"gpt_{number}.txt"
        essay_file.write_text(message.choices[0].message.content)


await generate_essays_async()

RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6130fcb82071ae0e945256866b952593 in your message.)

In [9]:
import pandas as pd

dataset = pd.read_csv("./data/dataset.csv")


def preproccess(dataset: pd.DataFrame):
    wrong_indexes_train = []

    for index, row in dataset.iterrows():
        if row["words"] < 100:
            wrong_indexes_train.append(index)

    if wrong_indexes_train != []:
        dataset.loc[wrong_indexes_train].to_csv('./data/dropped_rows.csv')
        dataset.drop(index=wrong_indexes_train, inplace=True)
    
    return dataset


def dataset_to_csv():
    dataset = pd.DataFrame()

    essay_dir = Path("./data/essays/").absolute()
    essay_name = []
    essays = []
    words = []
    chars = []
    is_gpt = []

    for essay_path in essay_dir.iterdir():
        essay_name.append(essay_path.name)

        if essay_path.name.split("_")[0] == "gpt":
            is_gpt.append(1)
        else:
            is_gpt.append(0)

        with essay_path.open("r") as essay:
            essay = essay.read().replace("\n", "")
            essays.append(essay)
            words.append(len(essay.split(" ")))
            chars.append(len(list(essay.replace(" ", ""))))

    dataset = pd.DataFrame(
        {
            "essay_name": essay_name,
            "essay": essays,
            "words": words,
            "chars": chars,
            "is_gpt": is_gpt,
        }
    )

    dataset = preproccess(dataset)

    dataset.to_csv(Path("./data/dataset.csv"), index=False)

    return dataset


dataset_to_csv()

Unnamed: 0,essay_name,essay,words,chars,is_gpt
0,human_1.txt,The protagonist Yossarian in Joseph Heller's C...,916,4451,0
1,human_2.txt,Steven Pinker’s “A History of Violence: Edge M...,577,3049,0
2,human_3.txt,"In the play “Death of a Salesman”, the four ma...",581,2765,0
3,human_4.txt,In the novel A Portrait of the Artist as a You...,420,2099,0
4,human_5.txt,Fashion is a puzzling topic. Fashion appeals t...,377,1959,0
...,...,...,...,...,...
111,gpt_54.txt,"In Jon Krakauer’s novel, Into The Wild, Christ...",416,2300,1
112,gpt_55.txt,"Umberto Eco's essay ""Ur-Fascism"" defines fasci...",125,651,1
113,gpt_56.txt,"In Ray Bradbury's ""The Pedestrian,"" the light ...",115,589,1
114,gpt_57.txt,"In the dimly lit streets of Victorian London, ...",154,608,1
