In [1]:
import getpass
import os
import cohere
from langchain_cohere import ChatCohere

# Prompt for the key only when it is not already configured, so that re-running
# this cell (or running with the variable exported beforehand) neither blocks on
# input nor overwrites an existing key.
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass.getpass("Enter your Cohere API Key: ")

# Chat model shared by every generation chain built below.
# NOTE(review): presumably ChatCohere picks the key up from COHERE_API_KEY — confirm.
model = ChatCohere(model="command-r-plus-08-2024")

In [2]:
from langchain_core.output_parsers import StrOutputParser
from prompt import topics, javanese_overgeneration_prompt_cohere, sundanese_overgeneration_prompt_cohere

def create_chain(prompt_template, model):
    """Compose ``prompt_template``, ``model`` and a ``StrOutputParser`` into one pipeline.

    The three stages are joined with the ``|`` operator, in that order, and the
    composed object is returned.
    """
    prompt_and_model = prompt_template | model
    return prompt_and_model | StrOutputParser()

# One generation pipeline per language; both share the same chat model.
chain_sundanese = create_chain(prompt_template=sundanese_overgeneration_prompt_cohere, model=model)
chain_javanese = create_chain(prompt_template=javanese_overgeneration_prompt_cohere, model=model)

In [3]:
# Display the composed Javanese chain to check how its stages are wired together.
chain_javanese

FewShotPromptTemplate(input_variables=['topic'], example_selector=<prompt.CustomExampleSelector object at 0x107375550>, example_prompt=PromptTemplate(input_variables=['correct_ending', 'story_premise', 'wrong_ending'], template='Story Premise: {story_premise}\nCorrect Ending: {correct_ending}\nIncorrect Ending: {wrong_ending}'), suffix='Please generate three different examples, strictly following the format in the examples, do not add bullets or any additional response.', prefix='Your task is to write three different examples of story premises consisting of four sentences, wrong ending, and correct ending in Javanese. Include Javanese cultural values in the story with the topic "{topic}". Here are some examples of the story format:')
| ChatCohere(client=<cohere.client.Client object at 0x121b9de50>, async_client=<cohere.client.AsyncClient object at 0x126ffba30>, model='command-r-plus-08-2024', cohere_api_key=SecretStr('**********'))
| StrOutputParser()

In [4]:
def generate_examples(language_chain, topic, n, batch_size=5):
    """Collect ``n`` raw generations for ``topic`` from ``language_chain``.

    The chain is invoked through ``language_chain.batch`` with a list of
    ``{"topic": topic}`` inputs, at most ``batch_size`` per call. The final
    call is sized to the number of generations still needed, so no request is
    issued for an output that would be thrown away.

    Args:
        language_chain: Object exposing ``batch(list_of_inputs)`` that returns
            one output per input.
        topic: Value supplied under the ``"topic"`` key of every input.
        n: Number of generations wanted; ``n <= 0`` returns an empty list
            without calling the chain.
        batch_size: Maximum number of inputs per ``batch`` call.

    Returns:
        A list with up to ``n`` outputs, in the order they were returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    examples = []
    remaining = n
    while remaining > 0:
        current_size = min(batch_size, remaining)
        # A fresh dict per input, so the inputs never alias each other.
        batch_input = [{"topic": topic} for _ in range(current_size)]
        examples.extend(language_chain.batch(batch_input))
        remaining -= current_size

    # Slice defensively in case a batch call returned more outputs than inputs.
    return examples[:n]

In [5]:
def generate_examples_javanese(topic, n):
    """Return ``n`` raw generations for ``topic`` produced by ``chain_javanese``."""
    return generate_examples(language_chain=chain_javanese, topic=topic, n=n)

def generate_examples_sundanese(topic, n):
    """Return ``n`` raw generations for ``topic`` produced by ``chain_sundanese``."""
    return generate_examples(language_chain=chain_sundanese, topic=topic, n=n)

In [6]:
# Fine-grained categories grouped under each broad topic name.
topics_cat = {
    "Food": [
        "Breakfast",
        "Lunch",
        "Dinner",
        "Snacks",
        "Food souvenir",
        "Traditional foods and beverages",
        "Eating habit",
        "Cutlery",
        "Cooking ware",
        "Fruits",
    ],
    "Wedding": [
        "Traditions before marriage",
        "Traditions when getting married",
        "Traditions after marriage",
        "Men's wedding clothes",
        "Women's wedding clothes",
        "Invited guests",
        "Wedding location",
        "Foods at a wedding",
        "Gifts brought to wedding",
    ],
    "Family relationship": [
        "Relationships within the main family",
        "Relationships in the extended family",
        "Relations with society or neighbors",
        "Clan or descendant system",
    ],
    "Pregnancy and kids": [
        "Traditions during pregnancy",
        "Traditions after birth",
        "How to care for a newborn baby",
        "How to care for toddlers",
        "How to care for children",
        "How to care for teenagers",
        "Parents and children interactions as adults",
    ],
    "Death": [
        "When death occurs",
        "The process of caring for a corpse",
        "Traditions after the body is buried",
        "The clothes of the mourners",
        "Inheritance matters",
    ],
    "Religious holiday": [
        "Traditions before religious holidays",
        "Traditions leading up to religious holidays",
        "Traditions during holidays",
        "Traditions after religious holidays",
    ],
    "Agriculture": [
        "What to plant",
        "Traditions when planting",
        "Harvest",
    ],
    "Fisheries and trade": [
        "Traditions of taking care of livestock or fish",
        "Buying and selling traditions",
    ],
    "Art": [
        "Musical instruments",
        "Folk songs",
        "Traditional dances",
        "Use of art at certain events",
        "Poetry or similar literature",
    ],
    "Traditional games": [
        "Game types",
        "Location played",
    ],
    "Daily activities": [
        "Morning activities",
        "Afternoon activities",
        "Evening activities",
        "Leisure activities",
        "Household chores",
        "Transportation",
    ],
    "Socio-religious aspects of life": [
        "Regular religious activities",
        "Mystical things",
        "Traditional ceremonies",
        "Lifestyle",
        "Self-care",
        "Traditional medicine",
        "Traditional sayings",
    ],
}

In [9]:
# Look at the first entry of the imported `topics` list.
selected_topic = topics[0]
print(selected_topic)

# Smoke test: request a single generation from the Javanese chain.
# NOTE(review): the topic passed here is the hard-coded string 'food souvenir',
# not `selected_topic` — confirm that this is intended.
javanese_examples = generate_examples_javanese(topic = 'food souvenir', n=1)

Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)


In [10]:
# Print the raw text of the first generation so its line breaks render.
print(javanese_examples[0])

Story Premise: "Lho, kok ra sah mangan, Mas?" tanyaku maring Mas Joko sing lagi ngeliatin wedang uwuh ing ndalemku. "Aku kudu nggoleki oleh-oleh panganan khas Jogja," jawab Mas Joko. Aku langsung ngerti, soale Mas Joko mau mudik nang omahane nang Solo. Mas Joko ora tau panganan khas Jogja.
Correct Ending: Aku langsung ngajeni Mas Joko nang pasar Bringharjo kanggo golek oleh-oleh.
Incorrect Ending: Aku ngajeni Mas Joko nang pasar Beringharjo kanggo golek wedang uwuh.

Story Premise: Ning wis kumpul karo kanca-kancane nang angkringan. "Aku pengin mangan gorengan," katane Ning. "Lho, kok ora mangan bakso?" tanyaku. Ning nggandhekake kancane sing liyane. "Aku pengin mangan gorengan, tapi ora duwe duwit," jawab Ning.
Correct Ending: Aku langsung ngajeni Ning nang warung gorengan kanggo mangan.
Incorrect Ending: Aku ngajeni Ning nang warung bakso kanggo mangan.

Story Premise: "Lho, kok ra mangan, Mbak?" tanyaku maring Mbak Ratih sing lagi ngeliatin jajanan pasar ing ndalemku. "Aku kudu nggo

In [9]:
# Display the list of raw generations returned by the smoke test.
javanese_examples

['Story Premise: "Kapan saiki nggih, Mas? Aku arep mangan sing endhog, nanging aku ora duwe duit." Pitutur bocah wadon marang penjual sing nunggu ing warung tenda. "Aku arep ngutang, nanging aku ora tau ngutang, aku ora duwe duit." "Ya wis, nggih, aku ngerti, aku ora bakal ngutangke, nanging aku arep nggawe sing endhog, aku arep nggawe sing endhog kanggo kowe."\nCorrect Ending: Bocah wadon iku ora duwe duit, nanging penjual iku ora ngutangke, nanging nggawe sing endhog kanggo kersane.\nIncorrect Ending: Penjual iku ngutangke bocah wadon iku, nanging bocah wadon iku ora duwe duit kanggo ngbayar.\n\nStory Premise: "Aku arep mangan sing endhog, nanging aku ora duwe duit." Pitutur bocah wadon marang penjual sing nunggu ing warung tenda. "Ya wis, nggih, aku ngerti, aku ora bakal ngutangke, nanging aku arep nggawe sing endhog kanggo kowe." "Aku ora duwe duit, nanging aku arep mangan sing endhog."\nCorrect Ending: Penjual iku nggawe sing endhog kanggo bocah wadon iku, nanging bocah wadon iku 

In [10]:
# Total number of fine-grained categories, summed over every topic in topics_cat.
total_categories = sum(map(len, topics_cat.values()))

print(f"Total number of categories: {total_categories}")

Total number of categories: 64


In [11]:
# To generate for every category of every topic instead, remove the leading "# # " from each line of the block below
# # from tqdm.auto import tqdm

# # responses = []

# # for topic_name in tqdm(topics_cat.keys()):
# #     print('Generating for topic:', topic_name)
# #     categories = topics_cat[topic_name]  
# #     for category in categories:
# #         print('Generating for category:', category)
# #         res = generate_examples_javanese(n=85, topic=category) 
# #         responses.append((topic_name, category, res))

In [6]:
from tqdm.auto import tqdm

responses = []

# Visit every entry of `topics` in order, with a progress bar. Iterating over
# the list directly replaces the range(len(topics)) index bookkeeping.
for topic in tqdm(topics):
    print('Generating for topic:', topic)
    # Request 30 raw generations per topic.
    res = generate_examples_javanese(n=30, topic=topic)
    # Keep the topic next to its generations so it can be attached when parsing.
    responses.append((topic, res))

  0%|          | 0/12 [00:00<?, ?it/s]

Generating for topic: Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)
Generating for topic: Wedding (e.g.: traditions before, during, and after marriage, bride & groom wedding clothes, invited guests, wedding location, food and gifts)
Generating for topic: Family relationships (e.g.: relationship with main and extended family, relation with society/neighbours, clan/descendant system)
Generating for topic: Pregnancy and Kids (e.g.: traditions during pregnancy, traditions after birth, how to care for a newborn baby, how to care for toddlers, how to care for children, teenagers, parents and childrens interactions as adults)
Generating for topic: Death (e.g.: tradition when death occurs, taking care of corpse, tradition after the body is buried, clothes of the mourners, inheritance matters)
Generating for topic: Religious holidays (e.g.: traditions before religious holidays, traditions leading up to religious holid

In [7]:
import re
def _text_after_label(line, label):
    """Return the stripped text after ``label`` and the first ':' that follows it."""
    # Cut at the label first so that a colon appearing before it is ignored, then
    # split on the FIRST colon only, so colons inside the story text are kept.
    after_label = re.split(re.escape(label), line, maxsplit=1, flags=re.IGNORECASE)[-1]
    return after_label.split(':', 1)[-1].strip()


def parse_generated_stories(topic, res):
    """Parse one raw model generation into a list of story records.

    The text is scanned line by line. A line containing "story premise" sets
    the premise, a line containing "correct ending" sets the correct ending,
    and a line containing "incorrect ending" closes the current story and
    appends a record. All matching is case-insensitive; other lines are ignored.

    Args:
        topic: Value stored under the ``"topic"`` key of every record.
        res: Raw generated text holding zero or more stories.

    Returns:
        A list of dicts with the keys ``"topic"``, ``"premise"``,
        ``"correct_ending"`` and ``"incorrect_ending"``. ``"premise"`` or
        ``"correct_ending"`` is ``None`` when no such line preceded the
        "incorrect ending" line of that story.
    """
    # Strip list numbering ("1. ", "2. ") only at the start of a line. A number
    # followed by a period inside the story text is content and must survive.
    res = re.sub(r'^[ \t]*\d+\.[ \t]*', '', res, flags=re.MULTILINE)

    stories = []
    premise, correct_ending = None, None
    for line in res.split('\n'):
        lowered = line.lower()
        if 'story premise' in lowered:
            premise = _text_after_label(line, 'story premise')
        # "incorrect ending" is tested before "correct ending" because the
        # latter is a substring of the former.
        elif 'incorrect ending' in lowered:
            stories.append({
                "topic": topic,
                "premise": premise,
                "correct_ending": correct_ending,
                "incorrect_ending": _text_after_label(line, 'incorrect ending'),
            })
            # Start the next story from a clean slate.
            premise, correct_ending = None, None
        elif 'correct ending' in lowered:
            correct_ending = _text_after_label(line, 'correct ending')

    return stories

In [8]:
# One (topic, raw generation) pair per model response.
responses_flattened = [
    (topic, generation)
    for topic, generations in responses
    for generation in generations
]
# Parse every raw generation into its list of story records.
responses_formatted = [
    parse_generated_stories(topic, generation)
    for topic, generation in responses_flattened
]
# Flatten the per-generation lists into a single list of story records.
responses_final = [story for stories in responses_formatted for story in stories]

In [9]:
# Preview the first five parsed story records.
responses_final[:5]

[{'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Joko lan Yanto arep tuku oleh-oleh. Dheweke arep mangan ing warung. Warunge wis tutup. Dheweke ora entuk oleh-oleh.',
  'correct_ending': 'Dheweke arep tuku oleh-oleh lan mangan ing warung liyane.',
  'incorrect_ending': 'Dheweke ora entuk oleh-oleh lan mangan ing warung.'},
 {'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Sinta seneng mangan tempe. Dheweke arep tuku tempe. Tempe sing diwarunge wis habis. Sinta ora entuk tempe.',
  'correct_ending': 'Sinta tuku tempe ing pasar.',
  'incorrect_ending': 'Sinta ora mangan tempe.'},
 {'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Bayu lan Bagas arep tuku wedang ronde. Dheweke arep mangan

In [10]:
# Number of parsed story records.
len(responses_final)

1077

In [11]:
import pickle as pkl

# Persist the parsed stories as a pickle. The context manager ensures the file
# handle is flushed and closed even if dumping raises.
with open("train_cohere_rplus_jv.pkl", 'wb') as pickle_file:
    pkl.dump(responses_final, pickle_file)

In [12]:
import pandas as pd 
# Write the parsed stories to CSV without the index column. A later cell writes
# to this same path again after replacing the detailed topic text with short labels.
pd.DataFrame(responses_final).to_csv("train_cohere_rplus_jv.csv", index=False)

In [13]:
# Short label for each entry of `topics`. The two lists are paired positionally
# below, so the order here must match the order of `topics`.
# NOTE(review): 'Fishers and Trade' may be meant as 'Fisheries and Trade'
# (topics_cat uses "Fisheries and trade"); left unchanged because the label is
# written to the dataset — confirm before renaming.
short_topics = [
    'Food', 'Wedding', 'Family Relationship', 'Pregnancy and Kids', 'Death',
    'Religious Holiday', 'Agriculture', 'Fishers and Trade', 'Art',
    'Traditional Games', 'Daily Activities', 'Socio-religious'
]

# zip() silently stops at the shorter sequence, which would leave some topics
# without a label; fail fast instead.
assert len(short_topics) == len(topics), "short_topics and topics must have the same length"

# Map each detailed topic string to its short label.
topic_mapping = dict(zip(topics, short_topics))

# `responses_final` is a list of story dicts; build the DataFrame from it.
df = pd.DataFrame(responses_final)

# Replace the detailed topic text with the short label. Series.map yields NaN
# for values missing from the mapping, so verify that every row was mapped.
df['topic'] = df['topic'].map(topic_mapping)
assert df['topic'].notna().all(), "some rows have a topic that is missing from topic_mapping"
df.to_csv("train_cohere_rplus_jv.csv", index=False)

In [14]:
# Preview the first three records. The topic relabelling was applied to `df`,
# not to `responses_final`, so these records still carry the detailed topic text.
responses_final[:3]

[{'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Joko lan Yanto arep tuku oleh-oleh. Dheweke arep mangan ing warung. Warunge wis tutup. Dheweke ora entuk oleh-oleh.',
  'correct_ending': 'Dheweke arep tuku oleh-oleh lan mangan ing warung liyane.',
  'incorrect_ending': 'Dheweke ora entuk oleh-oleh lan mangan ing warung.'},
 {'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Sinta seneng mangan tempe. Dheweke arep tuku tempe. Tempe sing diwarunge wis habis. Sinta ora entuk tempe.',
  'correct_ending': 'Sinta tuku tempe ing pasar.',
  'incorrect_ending': 'Sinta ora mangan tempe.'},
 {'topic': 'Food (e.g.: food souvenir, traditional foods and beverages, eating habit, traditional cutlery or cooking ware, local fruit)',
  'premise': 'Bayu lan Bagas arep tuku wedang ronde. Dheweke arep mangan

In [15]:
def remove_duplicates(input_list):
    """Return the dicts of ``input_list`` with exact duplicates removed.

    Two dicts are duplicates when they hold the same key/value pairs, regardless
    of key order. The first occurrence is kept and the original order is
    preserved. ``input_list`` is not modified.

    Args:
        input_list: Iterable of dicts whose values are hashable.

    Returns:
        A new list containing the first occurrence of each distinct dict.
    """
    # A set gives constant-time membership checks; the previous list made the
    # whole pass quadratic in the number of records.
    seen = set()
    unique_list = []

    for item in input_list:
        # A frozenset of the items is hashable and ignores key order.
        fingerprint = frozenset(item.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_list.append(item)

    return unique_list

# De-duplicate the parsed stories into a new list; responses_final is left untouched.
responses_final_unique = remove_duplicates(responses_final)

In [17]:
# Number of story records remaining after de-duplication.
len(responses_final_unique)

1076