In [1]:
import getpass
import os

# Ask for the key only when the environment does not already provide one, so a
# pre-configured key is not overwritten and a re-run does not prompt again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

from langchain_openai import ChatOpenAI

# Temperature is passed explicitly instead of relying on the library default
# (0.7, per the original note on this line), so sampling behaviour does not
# silently change if that default differs in another installed version.
model = ChatOpenAI(model="gpt-4o", temperature=0.7)

In [4]:
from langchain_core.output_parsers import StrOutputParser
from prompt import sundanese_overgeneration_prompt, topics

# Pipeline: few-shot prompt -> chat model -> plain string output.
chain = (
    sundanese_overgeneration_prompt
    | model
    | StrOutputParser()
)


def generate_25_examples(topic):
    """Run the generation chain five times for one topic in a single batch.

    Args:
        topic: Value substituted for the prompt's ``topic`` input variable.

    Returns:
        The result of ``chain.batch`` for five identical requests (one parsed
        string per request).

    Note:
        The "25" in the name presumably assumes roughly five triplets per
        response; nothing here enforces that count.
    """
    requests = [{"topic": topic} for _ in range(5)]
    return chain.batch(requests)

In [12]:
# Display the few-shot prompt template so its prefix, suffix and example format can be inspected.
sundanese_overgeneration_prompt

FewShotPromptTemplate(input_variables=['topic'], example_selector=<prompt.CustomExampleSelector object at 0x109af4160>, example_prompt=PromptTemplate(input_variables=['correct_ending', 'story_premise', 'wrong_ending'], template='Story Premise: {story_premise}\nCorrect Ending: {correct_ending}\nIncorrect Ending: {wrong_ending}'), suffix='Please generate several triplets, strictly following the format in the examples, do not add bullets or any additional response.', prefix='Your task is to write severals triplets of story premises consists of four sentences, wrong ending, and correct ending in Sundanese. Include sundanese cultural value in the story with the topic "{topic}". Here are some examples of the story format:')

In [6]:
# Smoke test: generate one batch for the last entry in `topics` and print the
# first raw response to check the output format before the full run.
res = generate_25_examples(topics[-1])
print(res[0])

Story Premise: Mang Darman sok ngadamel jamu tradisional unggal isuk-isuk. Manéhna percaya yén jamu tiasa ngajaga kasehatan. Unggal isuk, tanggana sok meuli jamu ti Mang Darman. Jamu éta dijieun tina campuran jahe, kunyit, sareng bahan alami séjénna.
Correct Ending: Tanggana ngarasa langkung séhat sanggeus rutin nginum jamu ti Mang Darman
Incorrect Ending: Mang Darman mutuskeun pikeun ngeureunkeun ngadamel jamu

Story Premise: Engkus sok nginum jamu peuyem saméméh sare. Manéhna yakin yén jamu éta tiasa ngusir setrés. Engkus sering nyaritakeun ka babaturanana ngeunaan khasiat jamu peuyem. Sababaraha babaturanana mimiti nginum jamu éta ogé.
Correct Ending: Babaturanana ogé ngaraos langkung santai sanggeus nginum jamu peuyem
Incorrect Ending: Babaturanana jadi teu hayang ngobrol deui jeung Engkus

Story Premise: Dina upacara adat Seren Taun, masarakat ngumpul di balé desa. Arak-arakan hasil tani disayogikeun ka karuhun. Aya tari-tarian tradisional sareng musik gamelan pikeun ngiringan upa

In [None]:
from tqdm.auto import tqdm

# Each element appended below is one batch (a list of raw response strings).
responses = []

# generate_25_examples requires a topic; calling it with no argument raises
# TypeError. Cycle through `topics` so each of the 40 rounds gets one.
# NOTE(review): cycling through topics is an assumption -- confirm the intended
# topic selection (and that `topics` is a non-empty sequence).
for i in tqdm(range(40)):
    res = generate_25_examples(topics[i % len(topics)])
    responses.append(res)

In [None]:
import re
def parse_generated_stories(res):
    """Extract story triplets from one raw model response.

    The response is scanned line by line. A line containing "story premise",
    "correct ending" or "incorrect ending" (case-insensitive) supplies the
    corresponding field; all other lines are ignored.

    Args:
        res: Raw response text containing zero or more labelled triplets.

    Returns:
        A list of dicts with keys ``premise``, ``correct_ending`` and
        ``incorrect_ending``. Only complete triplets are returned; a partial
        triplet (e.g. a truncated response) is discarded. The two endings may
        appear in either order after the premise.
    """
    # Remove list numbering ("1. ", "2. ") only at the start of a line, so that
    # numbers inside the story text (years, quantities) are left intact.
    res = re.sub(r'^[ \t]*\d+\.[ \t]*', '', res, flags=re.MULTILINE)

    stories = []
    premise, correct_ending, incorrect_ending = None, None, None
    for line in res.split('\n'):
        lowered = line.lower()
        # Keep everything after the FIRST colon: the field text itself may
        # contain colons (e.g. quoted speech).
        value = line.split(':', 1)[-1].strip()

        if 'story premise' in lowered:
            # A new premise starts a new triplet; drop any stale partial fields
            # so they cannot be paired with this premise.
            premise, correct_ending, incorrect_ending = value, None, None
        elif 'incorrect ending' in lowered:
            # Checked before 'correct ending', which is a substring of it.
            incorrect_ending = value
        elif 'correct ending' in lowered:
            correct_ending = value
        else:
            continue

        if None not in (premise, correct_ending, incorrect_ending):
            stories.append({"premise": premise, "correct_ending": correct_ending, "incorrect_ending": incorrect_ending})
            premise, correct_ending, incorrect_ending = None, None, None

    return stories

In [None]:
# Step 1: collapse the per-round batches into one flat list of raw responses.
responses_flattened = []
for batch in responses:
    responses_flattened.extend(batch)

# Step 2: parse every raw response into its list of story dicts.
responses_formatted = list(map(parse_generated_stories, responses_flattened))

# Step 3: collapse the per-response story lists into one flat list of stories.
responses_final = []
for parsed_stories in responses_formatted:
    responses_final.extend(parsed_stories)


In [None]:
import pickle as pkl

# Write through a context manager so the file handle is flushed and closed
# even if pickling fails part-way.
with open("generated_stories.pkl", 'wb') as pkl_file:
    pkl.dump(responses_final, pkl_file)

In [None]:
import pandas as pd 
# Tabulate the parsed stories (one row per dict) and write them to CSV
# without the index column.
pd.DataFrame(responses_final).to_csv(
    "train_gpt4o.csv",
    index=False,
)