In [1]:
import getpass
import os

# Ask for the key only when it is not already configured, so re-running this
# cell (or running with the variable exported in the shell) does not prompt again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI

# The temperature is pinned explicitly instead of relying on the library
# default, which is not guaranteed to stay at 0.7 across langchain-openai versions.
model = ChatOpenAI(model="gpt-4o", temperature=0.7)



In [2]:
from langchain_core.output_parsers import StrOutputParser
from prompt import sundanese_overgeneration_prompt, topics, javanese_overgeneration_prompt

def create_chain(prompt_template, model):
    """Compose ``prompt_template``, ``model`` and a ``StrOutputParser`` with ``|``.

    Args:
        prompt_template: Left-most stage of the pipeline.
        model: Stage that receives the output of ``prompt_template``.

    Returns:
        The object produced by piping the three stages together.
    """
    prompt_to_model = prompt_template | model
    return prompt_to_model | StrOutputParser()

# One chain per target language; both share the same model instance.
chain_sundanese = create_chain(prompt_template=sundanese_overgeneration_prompt, model=model)
chain_javanese = create_chain(prompt_template=javanese_overgeneration_prompt, model=model)

In [22]:
def generate_examples(language_chain, topic, n, batch_size=5):
    """Collect ``n`` outputs for ``topic`` by calling ``language_chain.batch`` in chunks.

    Args:
        language_chain: Object exposing ``batch(inputs)`` that returns one output
            per input dict.
        topic: Value sent to the chain under the ``"topic"`` key.
        n: Total number of outputs wanted; values <= 0 produce an empty list.
        batch_size: Maximum number of inputs sent per ``batch`` call (>= 1).

    Returns:
        list: At most ``n`` outputs, in the order the batches returned them.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    examples = []
    remaining = n
    while remaining > 0:
        # The final batch only asks for what is still missing, so no
        # completions are generated just to be thrown away.
        current_size = min(batch_size, remaining)
        batch_input = [{"topic": topic} for _ in range(current_size)]
        examples.extend(language_chain.batch(batch_input))
        remaining -= current_size

    # Guard against a chain that returns more items than inputs.
    return examples[:n]

In [23]:
def generate_examples_javanese(topic, n):
    """Return ``n`` generated examples for ``topic`` using the Javanese chain."""
    return generate_examples(language_chain=chain_javanese, topic=topic, n=n)

def generate_examples_sundanese(topic, n):
    """Return ``n`` generated examples for ``topic`` using the Sundanese chain."""
    return generate_examples(language_chain=chain_sundanese, topic=topic, n=n)


Topics:

0. Food
1. Wedding
2. Family Relationship
3. Pregnancy and kids
4. Death
5. Religious Holiday
6. Agriculture
7. Fishers and trade
8. Art
9. Traditional Games
10. Daily Activities
11. Socio-religious

In [24]:
# Example usage: take the first entry of `topics` and request 25 Javanese
# completions for it (no Sundanese examples are generated in this cell).
selected_topic = topics[0]
javanese_examples = generate_examples_javanese(selected_topic, 25)

In [25]:
# Inspect the first generated completion; print() renders its embedded newlines.
print(javanese_examples[0])

Story Premise: Kinan tuku oleh-oleh gethuk goreng saka Sokaraja kanggo keluargane. Kinan seneng banget ngetokake panganan tradisional iki marang kulawargane. Gethuk goreng iku panganan sing digawe saka telo sing digoreng lan dipangan nganggo gula aren cair. Kulawargane Kinan ora sabar pengin nyicipi.
Correct Ending: Kabeh kulawargane Kinan seneng banget karo rasa manis lan gurih gethuk goreng.
Incorrect Ending: Kinan lali nggawa gethuk goreng iku bali menyang omah.

Story Premise: Keluarga Sinta mangan bareng saben malem Minggu. Saben malem Minggu, ibune masak rawon sing enak banget. Rawon iku sup daging sapi khas Jawa Timur sing ngandhut kluwek. Saben anggota kulawarga nggawa cerito lan nuduhake kabar anyar.
Correct Ending: Mangan bareng dadi wektu sing spesial kanggo kulawarga Sinta.
Incorrect Ending: Ibune Sinta lali carane masak rawon.

Story Premise: Dimas lan kanca-kancane nggawe es dawet ireng kanggo dijual ing pasar. Es dawet ireng iku minuman tradisional saka cendol ireng lan 

In [5]:
# Display the topic descriptions imported from the `prompt` module.
topics

['Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
 'Wedding (e.g.: traditions before, during, and after marriage, bride & groom wedding clothes, invited guests, wedding location, food and gifts)',
 'Family relationships (e.g.: relationship with main and extended family, relation with society/neighbours, clan/descendant system)',
 'Pregnancy and Kids (e.g.: traditions during pregnancy, traditions after birth, how to care for a newborn baby, how to care for toddlers, how to care for children, teenagers, parents and childrens interactions as adults)',
 'Death (e.g.: tradition when death occurs, taking care of corpse, tradition after the body is buried, clothes of the mourners, inheritance matters)',
 'Religious holidays (e.g.: traditions before religious holidays, traditions leading up to religious holidays, traditions during religious holidays, traditions after religious holidays)',
 'Agriculture (e.g.:what to 

In [27]:
from tqdm.auto import tqdm

# Number of completions requested per topic.
EXAMPLES_PER_TOPIC = 85

# Collected as (topic, completions) tuples, in the same order as `topics`.
responses = []

# Iterate over the topics directly; tqdm still shows one tick per topic.
for topic in tqdm(topics):
    print('Generating for topic:', topic)
    res = generate_examples_javanese(topic=topic, n=EXAMPLES_PER_TOPIC)
    responses.append((topic, res))

  0%|          | 0/12 [00:00<?, ?it/s]

Generating for topic: Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)
Generating for topic: Wedding (e.g.: traditions before, during, and after marriage, bride & groom wedding clothes, invited guests, wedding location, food and gifts)
Generating for topic: Family relationships (e.g.: relationship with main and extended family, relation with society/neighbours, clan/descendant system)
Generating for topic: Pregnancy and Kids (e.g.: traditions during pregnancy, traditions after birth, how to care for a newborn baby, how to care for toddlers, how to care for children, teenagers, parents and childrens interactions as adults)
Generating for topic: Death (e.g.: tradition when death occurs, taking care of corpse, tradition after the body is buried, clothes of the mourners, inheritance matters)
Generating for topic: Religious holidays (e.g.: traditions before religious holidays, traditions leading up to religious holid

In [15]:
# NOTE(review): echoes every collected completion for every topic; consider
# slicing (e.g. responses[:1]) to keep the saved output small.
responses

[('Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  ['Story Premise: Laras seneng banget karo wedang uwuh, minuman tradisional saka Yogyakarta. Saben sore, dheweke mesthi ngombe wedang uwuh kanggo njaga kesehatan lan semangat. Wedang uwuh digawe saka campuran rempah-rempah alami sing wangi. Laras percaya yen wedang uwuh bisa ngusir rasa kesel lan lemes.\nCorrect Ending: Saben sore, dheweke seneng bareng-bareng karo kanca-kanca ngombe wedang uwuh.\nIncorrect Ending: Dheweke ngindhari ngombe wedang uwuh saben sore.\n\nStory Premise: Budi lan keluargane asring lelungan menyang Solo kanggo golek oleh-oleh khas. Salah siji oleh-oleh sing paling disenengi yaiku serabi. Serabi Solo duwe rasa manis lan gurih sing khas. Budi mesthi nggawa pulang serabi akeh-akeh kanggo sedulure.\nCorrect Ending: Oleh-oleh serabi Solo nggawe Budi tambah akrab karo keluargane.\nIncorrect Ending: Budi ora seneng karo serabi Solo.\n\nSto

In [28]:
# Sanity check: the generation loop appends one (topic, completions) tuple per topic.
len(responses)

12

In [29]:
import re
def _label_value(line):
    """Return the text after the first colon of ``line`` (the whole line if none)."""
    _, sep, value = line.partition(':')
    return (value if sep else line).strip()


def parse_generated_stories(topic, res):
    """Parse one raw completion into a list of story records.

    The text is scanned line by line. A line containing "story premise" starts a
    story, a line containing "correct ending" sets its correct ending, and a
    line containing "incorrect ending" closes the story and emits a record.
    Matching is case-insensitive; all other lines are ignored.

    Args:
        topic: Value stored under the ``"topic"`` key of every record.
        res: Raw completion text (str) holding zero or more stories.

    Returns:
        list[dict]: Records with keys ``topic``, ``premise``, ``correct_ending``
        and ``incorrect_ending``. A field is ``None`` when its line was missing
        before the "incorrect ending" line that closed the story.
    """
    stories = []
    premise, correct_ending = None, None
    for raw_line in res.split('\n'):
        # Strip list numbering ("1. ", "12. ") only at the start of a line, so
        # numbers inside the story text ("2020.", "5.000") are left intact.
        line = re.sub(r'^\s*\d+\.\s*', '', raw_line)
        lowered = line.lower()
        if 'story premise' in lowered:
            premise = _label_value(line)
            # A new story must not inherit an ending from an unfinished one.
            correct_ending = None
        elif 'incorrect ending' in lowered:
            # Checked before 'correct ending', which is a substring of this label.
            stories.append({"topic": topic, "premise": premise,
                            "correct_ending": correct_ending,
                            "incorrect_ending": _label_value(line)})
            premise, correct_ending = None, None
        elif 'correct ending' in lowered:
            correct_ending = _label_value(line)

    return stories

In [30]:
# Pair every raw completion with the topic it was generated for.
responses_flattened = [
    (topic_label, completion)
    for topic_label, completions in responses
    for completion in completions
]

# Parse each completion into its list of story records.
responses_formatted = [
    parse_generated_stories(topic_label, completion)
    for topic_label, completion in responses_flattened
]

# Flatten the per-completion lists into a single list of records.
responses_final = [
    record
    for records in responses_formatted
    for record in records
]

In [32]:
# Preview the first five parsed story records.
responses_final[:5]

[{'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Siti ngirim jenang dodol marang sedulure ing Jakarta. Jenang dodol yaiku panganan tradisional saka daerah Banyumas sing digawe saka ketan lan gula Jawa. Siti ngarep-arep sedulure seneng lan kelingan kampung halaman. Nalika sedulure nampa paket kasebut, dheweke langsung mbukak lan nyoba jenang dodol kuwi.',
  'correct_ending': 'Sedulure Siti seneng banget lan kelingan masa kecil ing Banyumas.',
  'incorrect_ending': 'Sedulure Siti ora suka lan mung nyimpen jenang dodol ing lemari.'},
 {'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Budi lan keluargane seneng mangan sega liwet saben Minggu. Sega liwet iku panganan khas Jawa sing dimasak nganggo kaldu lan rempah-rempah. Biasane, sega liwet dipangan bareng-bareng ing ngarep omah nganggo ta

In [34]:
import pickle as pkl

# Use a context manager so the file handle is flushed and closed deterministically.
with open("train_gpt4o_jv.pkl", 'wb') as pkl_file:
    pkl.dump(responses_final, pkl_file)

In [35]:
import pandas as pd 
# Export the parsed records as CSV (one row per record, no index column).
# NOTE(review): a later cell writes this same path again after shortening the topic labels.
pd.DataFrame(responses_final).to_csv("train_gpt4o_jv.csv", index=False)

In [37]:
# Short display labels, listed in the same order as `topics`.
short_topics = [
    'Food', 'Wedding', 'Family Relationship', 'Pregnancy and Kids', 'Death',
    'Religious Holiday', 'Agriculture', 'Fishers and Trade', 'Art',
    'Traditional Games', 'Daily Activities', 'Socio-religious'
]

# zip() stops at the shorter input, so a length mismatch would silently drop labels.
assert len(short_topics) == len(topics), "short_topics must align one-to-one with topics"

# Map each detailed topic description to its short label.
topic_mapping = dict(zip(topics, short_topics))

# Build the dataset as a DataFrame from the parsed records.
df = pd.DataFrame(responses_final)

# Replace the detailed 'topic' values with the short labels; .map() yields NaN
# for values missing from the mapping, so fail loudly instead of exporting them.
df['topic'] = df['topic'].map(topic_mapping)
assert df['topic'].notna().all(), "found topic values without a short label"

df.to_csv("train_gpt4o_jv.csv", index=False)