In [2]:
# csv format
columns = ["kural_number", "kural_in_tamil", "meaning_in_tamil", "kural_in_english", "meaning_in_english", "category", "tags", "q1", "q2", "q3"]
import pandas as pd

df = pd.DataFrame(columns=columns)
df

Unnamed: 0,kural_number,kural_in_tamil,meaning_in_tamil,kural_in_english,meaning_in_english,category,tags,q1,q2,q3


In [3]:
tamil_vs_eng_url = "https://www.projectmadurai.org/pm_etexts/utf8/pmuni0017.html"

tamil_meaning = ["https://www.projectmadurai.org/pm_etexts/utf8/pmuni0450_01.html", "https://www.projectmadurai.org/pm_etexts/utf8/pmuni0450_02.html", "https://www.projectmadurai.org/pm_etexts/utf8/pmuni0450_03.html", "https://www.projectmadurai.org/pm_etexts/utf8/pmuni0450_04.html"]

tamil_vs_eng_url_html = "tamil_vs_eng_url.html"

In [4]:
import os
from bs4 import BeautifulSoup

if not os.path.exists(tamil_vs_eng_url_html):
	!wget -O {tamil_vs_eng_url_html} {tamil_vs_eng_url}

with open(tamil_vs_eng_url_html, "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html5lib")
with open(tamil_vs_eng_url_html, "w", encoding="utf-8") as file:
    file.write(soup.prettify())

print("HTML fetched and saved successfully!")

HTML fetched and saved successfully!


In [5]:
import re
import json
all_text = [tag.get_text(strip=True) for tag in soup.find_all("center")]
main_topic = []
temp_dup = set()
for i in range(1, 1330):
	for content in all_text:
		if "Part " in content:
			if content not in temp_dup:
				temp_dup.add(content)
				split_text = re.split(r"- Part .* -", content)
				main_topic.append({
					"main_topic": {
						"tamil": split_text[0].split(".")[-1].strip(),
						"english": split_text[-1].strip()
					}
				})

print(json.dumps(main_topic, indent=4, ensure_ascii=False))

[
    {
        "main_topic": {
            "tamil": "அறத்துப்பால்",
            "english": "VIRTUE"
        }
    },
    {
        "main_topic": {
            "tamil": "பொருட்பால்",
            "english": "WEALTH"
        }
    },
    {
        "main_topic": {
            "tamil": "இன்பத்துப்பால்",
            "english": "NATURE OF LOVE"
        }
    }
]


In [6]:
def remove_leading_number(text):
	# Remove leading patterns like "1.1", "1.2", etc.
	text = re.sub(r"^\s*\d+\.\d+\s*", "", text)
	# Remove leading patterns like "1.", "2.", etc.
	text = re.sub(r"^\s*\d+\.\s*", "", text)
	# Remove leading patterns like "1", "2", etc.
	text = re.sub(r"^\s*\d+\s*", "", text)
	# Remove trailing numbers
	text = re.sub(r"\s*\d+$", "", text)
	# Replace multiple spaces with single space
	text = re.sub(r"\s+", " ", text)
	return text.strip()

def clean_text(text):
	"""Remove multiple spaces, replace newlines with spaces, and trim start & end spaces."""
	text = re.sub(r"\s+", " ", text)  # Replace multiple spaces & newlines with single space
	return text.strip()  # Remove spaces from start & end

In [7]:
all_ul_tags = soup.find_all("ul")
for tag in all_ul_tags:
	prev_elem = tag.previous_element
	while remove_leading_number(prev_elem.get_text()) == "":
		prev_elem = prev_elem.previous_element

	index = int(prev_elem.get_text().strip()[0]) - 1
	sub_topic = remove_leading_number(prev_elem.get_text())
	if "sub_topic" not in main_topic[index]:
		main_topic[index]["sub_topic"] = []
	
	parts = sub_topic.split("-", 1)
	main_topic[index]["sub_topic"].append({
		"tamil": remove_leading_number(parts[0]),
		"english": remove_leading_number(parts[1])
	})
	main_topic[index]["sub_topic"][-1]["category"] = []
	for row in tag.find_all("tr"):
		columns = row.find_all("td")
		if len(columns) >= 2:
			if "category" not in main_topic[index]["sub_topic"][-1]:
				main_topic[index]["sub_topic"][-1]["category"] = []

			main_topic[index]["sub_topic"][-1]["category"].append({
				"tamil": remove_leading_number(columns[0].get_text()),
				"english": remove_leading_number(columns[1].get_text()),
				"kurals": []
			})

print(json.dumps(main_topic, indent=4, ensure_ascii=False))

[
    {
        "main_topic": {
            "tamil": "அறத்துப்பால்",
            "english": "VIRTUE"
        },
        "sub_topic": [
            {
                "tamil": "பாயிரம்",
                "english": "PREFACE",
                "category": [
                    {
                        "tamil": "கடவுள் வாழ்த்து",
                        "english": "The praise of God",
                        "kurals": []
                    },
                    {
                        "tamil": "வான் சிறப்பு",
                        "english": "The blessing of Rain",
                        "kurals": []
                    },
                    {
                        "tamil": "நீத்தார் பெருமை",
                        "english": "The merit of Ascetics",
                        "kurals": []
                    },
                    {
                        "tamil": "அறன் வலியுறுத்தல்",
                        "english": "The power of virtue",
                        "kurals": []
  

In [8]:
def empty_kural_category():
	for i in range(len(main_topic)):
		for j in range(len(main_topic[i]["sub_topic"])):
			for k in range(len(main_topic[i]["sub_topic"][j]["category"])):
				if len(main_topic[i]["sub_topic"][j]["category"][k]["kurals"]) < 10:
					return i, j, k

	return -1, -1, -1


In [9]:
first_element = soup.find_next(string=lambda text: text and "------" in text)
kural_id = 0
i, j, k = empty_kural_category()
current_kural = {"tamil": "", "english": ""}
line_count = -1
while first_element and kural_id <= 1330:
	text = clean_text(first_element.get_text())
	first_element = first_element.next_element
	if text:
		if text[0].isdigit():
			line_count = 0
			current_kural = {"tamil": "", "english": ""}

		if line_count > -1:
			if line_count < 2:
				line_count += 1
				current_kural['tamil'] += "\n"
				current_kural['tamil'] += remove_leading_number(text)

			elif line_count >= 2 and line_count <= 6:
				line_count += 1
				current_kural['english'] += "\n"
				current_kural['english'] += remove_leading_number(text)

			if text[-1].isdigit():
				kural_id += 1
				main_topic[i]["sub_topic"][j]["category"][k]["kurals"].append({
					"id": kural_id,
					"tamil": current_kural['tamil'].strip(),
					"english": current_kural['english'].strip()
				})
				current_kural = {"tamil": "", "english": ""}
				line_count = -1
				i, j, k = empty_kural_category()

print(json.dumps(main_topic, indent=4, ensure_ascii=False))

[
    {
        "main_topic": {
            "tamil": "அறத்துப்பால்",
            "english": "VIRTUE"
        },
        "sub_topic": [
            {
                "tamil": "பாயிரம்",
                "english": "PREFACE",
                "category": [
                    {
                        "tamil": "கடவுள் வாழ்த்து",
                        "english": "The praise of God",
                        "kurals": [
                            {
                                "id": 1,
                                "tamil": "அகர முதல எழுத்தெல்லாம் ஆதி\nபகவன் முதற்றே உலகு",
                                "english": "'A' leads letters; the Ancient Lord\nLeads and lords the entire world."
                            },
                            {
                                "id": 2,
                                "tamil": "கற்றதனா லாய பயனென்கொல் வாலறிவன்\nநற்றாள் தொழாஅர் எனின்",
                                "english": "That lore is vain which does not fall\nAt His good feet wh

In [10]:
converted_kurals = []

def load_data():
	try:
		if os.path.exists("thirukural.json"):
			with open("thirukural.json", "r", encoding="utf-8") as file:
				converted_kurals = json.load(file)
				return converted_kurals
	except:
		pass

	return []

In [11]:
converted_kurals = load_data()
if len(converted_kurals) == 0:
	for main_topic_obj in main_topic:
		main_topic_tamil = main_topic_obj["main_topic"]["tamil"]
		main_topic_english = main_topic_obj["main_topic"]["english"]
		
		# Process each sub_topic
		for sub_topic in main_topic_obj["sub_topic"]:
			sub_topic_tamil = sub_topic["tamil"]
			sub_topic_english = sub_topic["english"]
			
			# Process each category in the sub_topic
			for category in sub_topic["category"]:
				category_tamil = category["tamil"]
				category_english = category["english"]
				
				# Process each kural in the category
				for kural in category["kurals"]:
					# Create a new kural object in the desired format
					converted_kural = {
						"id": kural["id"],
						"kural_in_tamil": kural["tamil"],
						"kural_in_english": kural["english"],
						"category_in_tamil": category_tamil,
						"category_in_english": category_english,
						"sub_topic_in_tamil": sub_topic_tamil,
						"sub_topic_in_english": sub_topic_english,
						"main_topic_in_tamil": main_topic_tamil,
						"main_topic_in_english": main_topic_english
					}
					
					# Add the converted kural to our list
					converted_kurals.append(converted_kural)

	print(converted_kurals)

In [12]:
def save_json():
	with open("thirukural.json", "w", encoding="utf-8") as file:
		json.dump(converted_kurals, file, indent=4, ensure_ascii=False)

	print("File saved as thirukural.json")

save_json()

File saved as thirukural.json


In [13]:
import csv

def convert_tocsv():
	with open("thirukural.csv", 'w', newline='', encoding='utf-8') as file:
		writer = csv.DictWriter(file, fieldnames=converted_kurals[0].keys())
		writer.writeheader()
		writer.writerows(converted_kurals)

	print("File saved as thirukural.csv")

convert_tocsv()

ValueError: dict contains fields not in fieldnames: 'query10_in_english'

In [14]:
for i in range(len(tamil_meaning)):
	if not os.path.exists(tamil_meaning[i].split("/")[-1]):
		!wget -O {tamil_meaning[i].split("/")[-1]} {tamil_meaning[i]}
		file_name = tamil_meaning[i].split("/")[-1]
		with open(file_name, "r", encoding="utf-8") as file:
			html_content = file.read()

		soup = BeautifulSoup(html_content, "html5lib")
		with open(file_name, "w", encoding="utf-8") as file:
			file.write(soup.prettify())

In [16]:
def clean_explanation_string(text):
	text = re.sub(r"^[\(\)\[\]-]+|[\(\)\[\]-]+$", "", text).strip()
	text = re.sub(r"^[\(\)\[\]-]+|[\(\)\[\]-]+$", "", text).strip()
	return re.sub(r"^[\(\)\[\]-]+|[\(\)\[\]-]+$", "", text).strip()

In [17]:
def get_startinng_number(text):
	import re

	# Extract number only if it appears at the start
	match = re.match(r'^\d+', text)

	# Convert to integer if a match is found
	number = int(match.group()) if match else None

	return number

def remove_first_n_words(text, n):
    words = text.split()  # Split text into words
    return ' '.join(words[n:])  # Join after skipping first n words

In [18]:
totalid = 0
for i in range(len(tamil_meaning)):
	file_name = tamil_meaning[i].split("/")[-1]
	with open(file_name, "r", encoding="utf-8") as file:
		html_content = file.read()

	soup = BeautifulSoup(html_content, "html5lib")
	for row in soup.find_all("tr"):
		if totalid >= len(converted_kurals):
			break

		row_text = remove_leading_number(row.get_text())
		if "விளக்கம்" in row_text:
			row_text = clean_explanation_string(row_text.split("விளக்கம்")[-1])
		else:
			row_text = clean_explanation_string(remove_first_n_words(row_text, 7))

		if row_text:
			index = get_startinng_number(clean_text(row.get_text())) - 1
			converted_kurals[index]["explanation_in_tamil"] = row_text
			totalid += 1

save_json()
convert_tocsv()

File saved as thirukural.json


ValueError: dict contains fields not in fieldnames: 'query10_in_english'

In [21]:
from gemiwrap import GeminiWrapper
from google import genai
import time

converted_kurals = load_data()

system_prompt = """# Tamil to English Explanation Conversion

You are a Tamil-English bilingual assistant. Convert Tamil explanations to clear, culturally appropriate English while:

1. Preserving original meaning and cultural context
2. Explaining Tamil-specific concepts briefly when needed
3. Maintaining the same tone and register as the original
4. Using natural-sounding English rather than literal translations
5. Keeping specialized terminology intact with brief explanations

For culturally-specific terms:
- Retain Tamil words in italics with brief explanations
- Example: "*Kolam* (geometric floor design)"

Prioritize clarity and cultural accuracy over literal translation."""

schema = genai.types.Schema(
			type = genai.types.Type.OBJECT,
			required = ["explanation_in_english"],
			properties = {
				"explanation_in_english": genai.types.Schema(
					type = genai.types.Type.STRING,
				),
			},
		)
gemini = GeminiWrapper(system_instruction=system_prompt, schema=schema)

for i, _ in enumerate(converted_kurals):
	try:
		if "explanation_in_english" not in converted_kurals[i]:
			response = gemini.send_message(converted_kurals[i]["explanation_in_tamil"])
			print(response)
			converted_kurals[i]["explanation_in_english"] = json.loads(response[0])["explanation_in_english"]
			time.sleep(5)
			save_json()
			convert_tocsv()
	except:
		pass

[48;5;22m[1;37m2025-03-29T18:58:20.367283[0m [48;5;22m[1;37msystem_instruction:: # Tamil to English Explanation Conversion

You are a Tamil-English bilingual assistant. Convert Tamil explanations to clear, culturally appropriate English while:

1. Preserving original meaning and cultural context
2. Explaining Tamil-specific concepts briefly when needed
3. Maintaining the same tone and register as the original
4. Using natural-sounding English rather than literal translations
5. Keeping specialized terminology intact with brief explanations

For culturally-specific terms:
- Retain Tamil words in italics with brief explanations
- Example: "*Kolam* (geometric floor design)"

Prioritize clarity and cultural accuracy over literal translation.[0m
---------------------------------------------------------------------------------------------------
[48;5;22m[1;37m2025-03-29T18:58:20.367425[0m [48;5;22m[1;37mhistory:: [][0m
-------------------------------------------------------------

In [19]:
def convertjson_tostr(data_obj):
	str_data = ""
	for key in data_obj.keys():
		str_data += f"{key}: {data_obj[key]}\n"

	return str_data

In [None]:
converted_kurals = load_data()

system_prompt_question = """You are a specialized assistant for creating LLM training data based on Thirukkural content. For each input containing Thirukkural information, generate 10 different types of queries in both Tamil and English that would logically lead to retrieving this specific Thirukkural as the response.

When processing a Thirukkural input, analyze:
- The couplet text in Tamil and English
- The category and topic classifications
- The explanations and meanings
- Key phrases and concepts

Then create 10 varied query types for each language that someone might use when looking for this specific content, such as:
- Direct quote searches (e.g., "கற்றதனால் ஆய பயன் என்ன?")
- Concept-based queries (e.g., "What does Thirukkural say about knowledge without worship?")
- Category searches (e.g., "Thirukkural on praise of God")
- Topic-specific questions (e.g., "Thirukkural about the value of learning")
- Meaning-based searches (e.g., "Is knowledge without devotion useful according to Thirukkural?")
- Keyword queries (e.g., "Thirukkural knowledge devotion feet")
- Identification queries (e.g., "Which Thirukkural talks about worshipping God's feet?")
- Moral lesson queries (e.g., "Thirukkural teaching about the purpose of knowledge")
- Reference queries (e.g., "Thirukkural from Virtue section about knowledge")
- Application searches (e.g., "How to use knowledge properly according to Thirukkural")

Format your response as:
1. Tamil Queries (numbered 1-10)
2. English Queries (numbered 1-10)

Each query should naturally lead to the specific Thirukkural in the provided input being the appropriate response in a retrieval or Q&A system."""

schema_question = genai.types.Schema(
			type = genai.types.Type.OBJECT,
			required = ["questions_in_tamil", "questions_in_english"],
			properties = {
				"questions_in_tamil": genai.types.Schema(
					type = genai.types.Type.ARRAY,
					items = genai.types.Schema(
						type = genai.types.Type.STRING,
					),
				),
				"questions_in_english": genai.types.Schema(
					type = genai.types.Type.ARRAY,
					items = genai.types.Schema(
						type = genai.types.Type.STRING,
					),
				),
			},
		)

gemini = GeminiWrapper(system_instruction=system_prompt_question, schema=schema_question)

for i, _ in enumerate(converted_kurals):
	if "query0_in_tamil" not in converted_kurals[i]:
		try:
			response = gemini.send_message(convertjson_tostr(converted_kurals[i]))[0]
			for j, question in enumerate(json.loads(response)["questions_in_tamil"]):
				converted_kurals[i][f'query{j}_in_tamil'] = question

			for j, question in enumerate(json.loads(response)["questions_in_english"]):
				converted_kurals[i][f'query{j}_in_english'] = question

			time.sleep(5)
			save_json()
			convert_tocsv()

		except:
			pass

[48;5;22m[1;37m2025-03-29T18:58:25.212785[0m [48;5;22m[1;37msystem_instruction:: You are a specialized assistant for creating LLM training data based on Thirukkural content. For each input containing Thirukkural information, generate 10 different types of queries in both Tamil and English that would logically lead to retrieving this specific Thirukkural as the response.

When processing a Thirukkural input, analyze:
- The couplet text in Tamil and English
- The category and topic classifications
- The explanations and meanings
- Key phrases and concepts

Then create 10 varied query types for each language that someone might use when looking for this specific content, such as:
- Direct quote searches (e.g., "கற்றதனால் ஆய பயன் என்ன?")
- Concept-based queries (e.g., "What does Thirukkural say about knowledge without worship?")
- Category searches (e.g., "Thirukkural on praise of God")
- Topic-specific questions (e.g., "Thirukkural about the value of learning")
- Meaning-based searche