# Python Script for Text OCR using OpenAI API
---
[OpenAI API documentation](https://platform.openai.com/docs/api-reference/introduction)

In [64]:
from dotenv import load_dotenv # load .env file for obtaining api key
from openai import OpenAI      # text generation
import pandas as pd            # save generated text as csv
import random      

load_dotenv()  # config .env file
MAX_WORDS = 12 # max words in generated text

In [65]:
client = OpenAI()


def generate_OCR_text(prompt, max_tokens = MAX_WORDS):
    '''
    Generate text using OpenAI\'s GPT-3.5 model from a given prompt.
    '''
    response = client.completions.create(
        model = "gpt-3.5-turbo-instruct",
        prompt = prompt,
        max_tokens = max_tokens,
        temperature = 0.975
    )
    return response.choices[0].text.strip()

# functions used to clean up the generated text (aka output)
def remove_quotation_marks(response_output):
    return response_output.replace('"', '')

def remove_delimiters(response_output):
    response_output = response_output.replace('\n', '')
    response_output = response_output.replace('\\', '')
    return response_output

# generation of 10,000 entries
entries = list()
duplicates = 0
random_word_prompts = ['Give me any random word (DO NOT define it):', 'Give me the name of a random molecule (DO NOT describe it):',
                       'Give me the name of a random molecule (DO NOT describe it):', 'Give me the name of a random city (just the city name):',
                       'Give me any random complex vocabulary word (DO NOT define it):', 'Give me the scientific name of a random animal (just the genus and epithet):',
                       'Give me a random food (just the name):', 'Give me a random plant (just the name):', 
                       'Give me a random item that you\'d find in a convenient store (just the name):', 'Give me a random type of material or fabric (just the name):']

random_phrase_prompts = ['Give me a random short title for a book:', 'Make up a random short title for a movie:',
                         'Make up a random short title for a TV show:', 'Make up a random short title for a video game:',
                         'Make up a random short title for a song:', 'Make up a fictional character\'s name:',
                         'Make up a random short title for a podcast:', 'Make up a random short title for a play:',
                         'Make up a random short title for a cartoon:', 'Make up a random short title for a comic book:']
while len(entries) < 10_000:
    # randomly choose a prompt (50/50 chance)
    p = random.uniform(0, 1)
    if p < 0.5:
        r_i = random.randint(0, 9)
        prompt = random_word_prompts[r_i]

    else:
        r_i = random.randint(0, 9)
        prompt = random_phrase_prompts[r_i]

    output = generate_OCR_text(prompt)

    # clean up the output
    output = remove_quotation_marks(output)
    output = remove_delimiters(output)

    # make sure we don't have duplicates
    if output not in entries:
        entries.append(output)
    else:
        duplicates += 1

In [68]:
print(f"Entries (size: {len(entries)}):")
print(entries)
print('\n')
print(f"Number of duplicates: {duplicates}")

Entries (size: 10000):
['Aspirin', 'Chasing Chaos: A Journey of Discovery', 'Midnight Solitude', 'Hexadecane', 'Evelyn Thornwood', 'Lost in the Moment', 'Sulfur hexafluoride', 'Spider plant', 'The Electric Adventures of Zephyr: A Tale of', 'Midnight Mirage', 'Zehara Naxos', 'Equanimity', 'Chiffon', 'The Wacky Adventures of Zippy and Flick', 'Crazy Adventures of the Zany Zoos', 'Charleston', 'Meliorism', 'Denim', 'Snack Pack Pudding', 'Pothos', 'Canis lupus (gray wolf)', 'Echoes in the Night', 'Perfluorooctane sulfonate', 'Midnight Mirage: The Lost Oasis', 'Midnight Moonbeams', 'Cosmic Chaos: The Intergalactic Adventures of', 'Midnight Fantasy', 'Unexpected Conversations', 'Whispers in the Wind', 'Echoes of the Past', 'Midnight Wanderlust', 'Uncharted Musings', 'Serendipitous Stories', 'Galactic Quest: The Search for the Cosmic Crystal', 'Luna Blackwood', 'Velvet', 'Midnight Whispers', 'Kara Devereaux', 'The Midnight Masquerade', 'Euphoria', 'Cairo', 'Silk', 'Evelina Fairchild', 'Chaos 

In [70]:
# store in a pandas dataframe
df = pd.DataFrame(entries, columns=['Words/Short Phrase'])

# save as a csv file
df.to_csv('OCR_text_dataset.csv', index=False)