# On the final Datasets

The cells below run a quick check on each final dataset: the number of entries it contains and a sample of its first few records.

In [10]:
import json

def count_entries(json_file_path):
    """
    Report how many top-level entries a JSON file holds.

    Parameters:
    json_file_path (str): Location of the JSON file to inspect.

    Returns:
    int: Number of keys (for an object) or items (for an array) at the top
         level. Any other top-level value prints a warning and yields 0.
    """
    with open(json_file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Scalars (numbers, strings, null, booleans) have no entries to count.
    if not isinstance(payload, (dict, list)):
        print("Warning: The JSON file does not contain a dictionary or a list at the top level.")
        return 0

    return len(payload)

In [15]:
# Report the number of top-level entries in each of the four final datasets.
# Path to the JSON files
json_first_dt = "Outputs/JSON/dt1.json"
json_dictionaries = "Outputs/JSON/DictionaryEntries_SHUFFLED.json"
json_texts = "Outputs/JSON/KichwaTexts_SHUFFLED.json"
json_spanish = "Outputs/JSON/spanish_HF_F.json"

# One count per dataset, kept in separate names so each can be reported below
entry_count_first_dt = count_entries(json_first_dt)
entry_count_dictionaries = count_entries(json_dictionaries)
entry_count_texts = count_entries(json_texts)
entry_count_spanish = count_entries(json_spanish)

print(f"The first dictionary has {entry_count_first_dt} entries.")
print(f"The dictionary entries have {entry_count_dictionaries} entries.")
print(f"The kichwa texts dataset has {entry_count_texts} entries.")  
print(f"The Spanish hf dataset has {entry_count_spanish} entries.")


The first dictionary has 2066 entries.
The dictionary entries have 8838 entries.
The kichwa texts dataset has 2271 entries.
The Spanish hf dataset has 51942 entries.


In [22]:
import json

def get_first_10_elements(json_file_path, n=5):
    """
    Read a JSON file and return the first `n` elements of its top-level array.

    Despite the historical name, the default sample size is 5 (this is what
    the function has always returned and what the callers report).

    Parameters:
    json_file_path (str): The path to the JSON file.
    n (int): Maximum number of leading elements to return. Defaults to 5.

    Returns:
    list: The first `n` elements of the JSON array, or the entire array if it
          contains fewer than `n` elements.

    Raises:
    ValueError: If `n` is negative, or if the top-level JSON value is not an array.
    """
    # A negative bound would silently slice from the end instead of the start.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}.")

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Ensure the data is a list
    if not isinstance(data, list):
        raise ValueError(f"The JSON file at {json_file_path} does not contain a JSON array.")

    # Slicing past the end is safe: shorter arrays are returned whole.
    return data[:n]

In [24]:
# Print a small sample from the start of each final dataset.
# Path to the JSON files
json_first_dt = "Outputs/JSON/dt1.json"
json_dictionaries = "Outputs/JSON/DictionaryEntries_SHUFFLED.json"
json_texts = "Outputs/JSON/KichwaTexts_SHUFFLED.json"
json_spanish = "Outputs/JSON/spanish_HF_F.json"

# Call the function and print the result
# NOTE(review): the variable names say "10" but the messages below report 5 elements.
first_10_elements_dt = get_first_10_elements(json_first_dt)
first_10_elements_dictionaries = get_first_10_elements(json_dictionaries)   
first_10_elements_texts = get_first_10_elements(json_texts)
first_10_elements_spanish = get_first_10_elements(json_spanish)

print(f"The first 5 elements of the first dictionary are:\n{first_10_elements_dt}\n")
print(f"The first 5 elements of the dictionary entries are:\n{first_10_elements_dictionaries}\n")
print(f"The first 5 elements of the kichwa texts dataset are:\n{first_10_elements_texts}\n")
print(f"The first 5 elements of the Spanish hf dataset are:\n{first_10_elements_spanish}\n")


The first 5 elements of the first dictionary are:
[{'Instrucción': '¿Cuál es la traducción de Kichwa a Español, las pronunciaciones y ejemplos de uso de la palabra achachay?', 'Entrada': 'achachay', 'Salida': 'Palabra: achachay, Pronunciaciones: ačačay, ačačaw, atsatsay, Definición: interj. expresión de frío. Yapa chiri kakpi rimay. Wawakunaka achachay nishpa chayamun. Sin. Chiri chiri.'}, {'Instrucción': '¿Cómo se pronuncia la palabra achachaw y qué significa?', 'Entrada': 'achachaw', 'Salida': 'Palabra: achachaw, Pronunciaciones: ačačaw, ačučuy, Definición: interj. amz. expresión de calor. Yapa rupay tiyakpi rimay. achachaw, mikunaka rupakmi kashka. Sin. Araray, rupakuk.'}, {'Instrucción': 'Proporciona la traducción, las diferentes pronunciaciones y ejemplos de uso para achka en español.', 'Entrada': 'achka', 'Salida': 'Palabra: achka, Pronunciaciones: ačka, ačika, aška, Definición: adv. bastante, harto, mucho. Imatapash tawkata, tawka tiyakta, mana ashallata rikuchik. Chakramantaka 

## Code to Process different types of data into a single dataset file

### PDF to Alpaca

This section converts every data source that is flexible enough into a single dataset with the following (Alpaca-style) format:

[
    {
    "Instruction": "Some instruction", 
    "Input": "Some input",
    "Output": "Some output"},

    {
    "Instruction": "some instruction",
    "Input": "some input",
    "Output": "some output"}
]

### First attempt to read a PDF

In [None]:
import PyPDF2

# Read a PDF and return its extracted text (line breaks from the extractor are kept)
def read_pdf(file):
    """
    Extract the text of every page of a PDF and return it as one string.

    Parameters:
    file (str): Path to the PDF file.

    Returns:
    str: The concatenated text of all pages. If PyPDF2 extracts no text at all,
         the file is passed to textract with the tesseract method as a fallback.
    """
    # `with` guarantees the handle is closed even if parsing raises.
    with open(file, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` guards against a page for which no text is returned.
        text = "".join((page.extract_text() or "") for page in pdf_reader.pages)

    if text:
        return text

    # Fallback for PDFs with no extractable text layer. textract is imported
    # here because it is only needed on this path.
    import textract
    ocr_text = textract.process(file, method='tesseract', language='eng')
    # NOTE(review): textract is expected to return bytes; decode so callers can
    # keep calling str methods such as .split("\n") - confirm the encoding.
    if isinstance(ocr_text, bytes):
        ocr_text = ocr_text.decode('utf-8')
    return ocr_text

# Extract the dictionary PDF and preview it as one DataFrame row per text line.
file = "Data/Kichwa-Español-www.forosecuador.ec.pdf"

text = read_pdf(file)

# NOTE(review): mid-cell import; pandas is only used for the preview below.
import pandas as pd

# One row per line of extracted text, in a single "text" column
textdf = pd.DataFrame(text.split("\n"), columns = ["text"])

print(textdf.head())

In [None]:
# Preview the first 100 extracted lines
print(textdf.head(100))

# Save text to txt file.
# The encoding is explicit because the text contains non-ASCII characters and
# this file is read back as UTF-8 by a later cell.
with open("Outputs/Clean txt/Diccionario1.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)

### A higher level approach towards PDF mining

In [None]:
#Only if needed
# !pip install pdfminer.six

In [16]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

# Initialize variables to hold text
left_column = []
right_column = []
pages_text = []  # Holds the combined text for each page

# Text blocks whose x-coordinate (bbox[0]) is below this value go to the left
# column; everything else goes to the right column.
# (You may need to adjust the threshold value for other page layouts)
COLUMN_SPLIT_X = 300

# Loop through each page
for page_layout in extract_pages("Data/Tests/sample.pdf"):
    # Temporary lists to hold the text for the current page's
    # left and right columns
    left_page = []
    right_page = []

    for element in page_layout:
        # Only text containers carry text; skip everything else
        if not isinstance(element, LTTextContainer):
            continue

        block_text = element.get_text()

        # Check the x-coordinate to determine the column
        if element.bbox[0] < COLUMN_SPLIT_X:
            left_page.append(block_text)
        else:
            right_page.append(block_text)

    # Combine text blocks for the current page (left column first) and append to pages_text
    combined_page_text = ' '.join(left_page) + ' ' + ' '.join(right_page)
    pages_text.append(combined_page_text)

# Print the texts
for i, page_text in enumerate(pages_text):
    print(f"Page {i+1} Text:")
    print(page_text)
    print("="*40)

# The extracted text was previously written to "Data/sample.pdf", i.e. plain
# text under a .pdf name. The next cell reads "Outputs/Clean txt/sample.txt",
# so the text is written there instead.
first_output_path = "Outputs/Clean txt/sample.txt"

# Save to file (explicit UTF-8: the text contains non-ASCII characters)
with open(first_output_path, "w", encoding="utf-8") as text_file:
    for page_text in pages_text:
        text_file.write(page_text + '\n\n')


Page 1 Text:
A
 achachay  [ačačay, ačačaw, atsatsay] in-
terj. expre sión de frío. Yapa chiri kakpi rimay.
Wawakunaka achachay nishpa chayamun. 
Sin. Chiri chiri.
achachaw  [ačačaw,  ačučuy]  interj.  amz.
expresión de calor. Yapa rupay tiyakpi rimay.
achachaw, mikunaka rupakmi kashka. 
Sin. Araray, rupakuk.
achka  [ačka,  ačika,  aška]  adv.  bastante,
harto, mucho. Imatapash tawkata, tawka  ti-
yakta, mana ashallata rikuchik.
Chakramantaka achka saratami pallarkani.
Sin. Ashtaka,  hatunta, pachan, tawka, llas-
hak.
achik [ačix, ačig, ači] s. luz, claridad, claro.
Killamanta, intimanta, kuyllur manta llukshik
llipyarik; imatapash rikunkapak kak. 
Tamya punchapika achikka mana rikurinchu.
achiklla [ačixlya] adj. lúcido, claro, nítido.
Killamanta, intimanta, kuyllur manta llukshik
ancha llipyarik; imatapash rikunkapaklla kak. 
Tamya punchaka  mana achikllachu kan.
Sin. chuya.
achikmama  [ačikmama,  ačimama]  s.  S.
madrina. Wawa shutirikukpi markak warmi.
Achikmamaka achikwawamanka tant

In [None]:
#This is necessary only when testing the code, it helps you work with a smaller file

# Read sample.txt and keep only its first 20,000 characters (roughly 20 kB).
# Explicit UTF-8 keeps the non-ASCII characters intact on every platform.
with open("Outputs/Clean txt/sample.txt", "r", encoding="utf-8") as text_file:
    text = text_file.read()[:20000]

print(text)

# Write the truncated sample once the source file has been closed
with open("Outputs/Clean txt/sample2.txt", "w", encoding="utf-8") as text_file2:
    text_file2.write(text)

### Now that we have the text, let's try formatting it

In [None]:
import re

def segment_text(text, output_file_path):
    """
    Regroup raw dictionary text so that each entry occupies one output line.

    A line that looks like "word [variations...]" opens a new entry; every
    other line is treated as a continuation of the entry currently being built.
    When an entry is written, its lines are joined with spaces, every "- "
    sequence is removed (this re-joins words hyphenated across lines) and runs
    of spaces are collapsed to one.

    Parameters:
    text (str): The raw text to segment.
    output_file_path (str): Where to write the result, one entry per line (UTF-8).
    """
    entry_start = re.compile(r"\s*\w+\s*\[.*?\]")

    def _flush(parts, handle):
        # Write the accumulated lines as a single cleaned-up entry, if any.
        if not parts:
            return
        joined = ' '.join(parts).replace("- ", "")
        handle.write(re.sub(r" +", " ", joined) + '\n')

    pending = []
    with open(output_file_path, 'w', encoding='utf-8') as out:
        for raw_line in text.split('\n'):
            stripped = raw_line.strip()
            if entry_start.match(raw_line):
                # A new headword: emit what we have and start over
                _flush(pending, out)
                pending = [stripped]
            else:
                pending.append(stripped)

        # The final entry has no following headword to trigger its flush
        _flush(pending, out)


In [None]:
# Segment the text extracted from the dictionary PDF into one entry per line.
# Read the previously saved extraction to use as input
file_path = 'Outputs/Clean txt/Diccionario1.txt'

with open(file_path, 'r', encoding='utf-8') as f:
    almost_raw_text = f.read()

# Define the output file path
output_file_path = 'Outputs/Clean txt/segmentedA.txt'

segment_text(almost_raw_text, output_file_path)

In [None]:
#Final check of the file
import re

def re_check(input_file_path: str, output_file_path: str) -> None:
    """
    First attempt at splitting lines that contain more than one "word [variation]" entry.

    NOTE(review): a function with the same name is defined again in a later
    cell; once that cell runs, this definition is shadowed.

    Parameters:
    input_file_path (str): Text file to read (UTF-8).
    output_file_path (str): Text file to write the processed lines to (UTF-8).
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Initialize a list to hold the processed lines
    processed_lines = []
    
    # Regular expression to match the format: "word [variation]"
    # (the bracket content must be a single run of word characters: no commas or spaces)
    pattern = re.compile(r'(\w+ \[\w+\])')
    
    for line in lines:
        # Find all matches of the pattern in the line
        matches = pattern.findall(line)
        
        # If more than one match is found, split the line
        if len(matches) > 1:
            for match in matches:
                # The cut is placed at the END of the match, so each emitted piece
                # ends with a "word [variation]" header rather than starting with it
                index = line.find(match) + len(match)
                
                # Append each word and its corresponding content to the processed lines
                processed_lines.append(line[:index].strip() + '\n')
                line = line[index:].strip()
        
        # Lines with zero or one match are kept as is. After a split, `line` is the
        # stripped remainder and is appended WITHOUT a trailing newline.
        # NOTE(review): that remainder will run into the next line in the output.
        processed_lines.append(line)
    
    # Write the processed lines to the output file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.writelines(processed_lines)

# Define the input and output file paths
input_file_path = 'Outputs/Clean txt/segmentedA copy.txt'
output_file_path = 'Outputs/Clean txt/segmentedAF.txt'

# Run the function
re_check(input_file_path, output_file_path)

In [None]:
import re

def re_check(input_file_path, output_file_path):
    """
    Split lines that hold several "word [variation]" entries into one line per entry.

    Every line with more than one "word [...]" header is cut immediately before
    each header; the pieces are stripped and written on separate lines. Lines
    with at most one header are copied unchanged. Blank lines are dropped.

    Parameters:
    input_file_path (str): Text file to read (UTF-8).
    output_file_path (str): Text file to write (UTF-8).
    """
    # "word [anything except a closing bracket]"
    header = re.compile(r'(\w+ \[[^\]]+\])')

    with open(input_file_path, 'r', encoding='utf-8') as src:
        raw_lines = src.readlines()

    pieces = []
    for raw in raw_lines:
        starts = [found.start() for found in header.finditer(raw)]

        if len(starts) <= 1:
            # Nothing to split
            pieces.append(raw)
            continue

        # Consecutive cut points delimit: the text before the first header,
        # then one segment per header (the last one runs to the end of the line).
        cuts = [0] + starts + [len(raw)]
        for begin, end in zip(cuts, cuts[1:]):
            pieces.append(raw[begin:end].strip() + '\n')

    # Drop whitespace-only results (e.g. the empty prefix before a leading header)
    kept = [piece for piece in pieces if piece.strip()]

    with open(output_file_path, 'w', encoding='utf-8') as dst:
        dst.writelines(kept)

# Define the input and output file paths
# NOTE(review): the input is a file named "segmentedA copy.txt", not the
# "segmentedA.txt" written by segment_text above - confirm how the copy is produced.
input_file_path = 'Outputs/Clean txt/segmentedA copy.txt'
output_file_path = 'Outputs/Clean txt/segmentedAF.txt'

# Run the function
re_check(input_file_path, output_file_path)

In [None]:
import re

def find_multiple_brackets_in_line(file_path):
    """
    Return the 1-based numbers of the lines that contain more than one
    all-uppercase (A-Z) word or standalone run of digits.

    NOTE(review): the function name and the caller's message talk about
    '[...]' structures, but the pattern used here does not look at brackets.
    Confirm which behaviour is intended.

    Parameters:
    file_path (str): Text file to scan (UTF-8).

    Returns:
    list: Line numbers, in file order, with two or more matches.
    """
    token = re.compile(r'\b([A-Z]+|\d+)\b')

    with open(file_path, 'r', encoding='utf-8') as handle:
        return [
            number
            for number, content in enumerate(handle, 1)  # line numbering starts at 1
            if len(token.findall(content)) > 1
        ]

# Path to the text file
file_path = 'Outputs/Clean txt/segmentedAF.txt'

# Find and print the line numbers flagged by find_multiple_brackets_in_line.
# NOTE(review): the message below mentions '[...words.]' structures; check that
# this matches what the function's pattern actually detects.
line_numbers = find_multiple_brackets_in_line(file_path)
print(f"Multiple '[...words.]' structures found on lines: {line_numbers}")


In [None]:
# Read a file and write it to a different, smaller, file
def write_to_file(input_file_path, output_file_path, max_lines=500):
    """
    Copy the leading lines of one text file into another.

    Parameters:
    input_file_path (str): File to read (UTF-8).
    output_file_path (str): File to write (UTF-8).
    max_lines (int): Upper bound used to slice the list of input lines
                     (default 500).
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        head = source.readlines()[:max_lines]

    with open(output_file_path, 'w', encoding='utf-8') as target:
        for kept_line in head:
            target.write(kept_line)

# Create a smaller working copy: the first 500 lines of the segmented file.
# Define the input and output file paths
input_file_path = 'Outputs/Clean txt/segmentedAF.txt'
output_file_path = 'Outputs/Clean txt/segmentedAF2.txt'

write_to_file(input_file_path, output_file_path, max_lines=500)

### Formatting for the Castellano-Kichwa file

In [None]:
import re

# Define function to format text 
def format_text(input_text):
    """
    Re-join words hyphenated across line breaks and drop blank lines.

    Each line is left-stripped. A line ending in a hyphen (optionally followed
    by whitespace) loses that hyphen and absorbs the following line; this is
    repeated while the merged line still ends in a hyphen, so a word broken
    over several consecutive lines is fully re-joined.

    Parameters:
    input_text (str): The text to format.

    Returns:
    str: The formatted text, with every run of consecutive line breaks
         collapsed to a single one.
    """
    trailing_hyphen = re.compile(r'[-]\s*$')

    lines = input_text.split('\n')
    formatted_lines = []

    i = 0
    while i < len(lines):
        line = lines[i].lstrip()
        i += 1

        # Keep merging: the absorbed line may itself end with a hyphen
        while trailing_hyphen.search(line):
            line = trailing_hyphen.sub('', line)  # Remove trailing hyphen and whitespace
            if i >= len(lines):
                break  # Hyphen on the very last line: nothing left to join
            line += lines[i].lstrip()  # Concatenate with the next line
            i += 1  # The next line has been consumed

        formatted_lines.append(line)

    formatted_text = '\n'.join(formatted_lines)

    # Remove blank lines. A single replace('\n\n', '\n') would leave a blank
    # line behind whenever three or more line breaks are adjacent.
    formatted_text = re.sub(r'\n{2,}', '\n', formatted_text)

    return formatted_text

# Read all text from a file and return it as a single string
def read_text_from_file(file_path):
    """
    Return the full contents of a text file.

    The file is decoded as UTF-8 explicitly, like the other readers in this
    notebook, so the result does not depend on the platform's default encoding.

    Parameters:
    file_path (str): Path of the file to read.

    Returns:
    str: The file contents.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Read text from file
input_file_path = "Outputs/Clean txt/cas_kich.txt"  # Replace with the path to your input text file
input_text = read_text_from_file(input_file_path)

# Apply the formatting function
formatted_text = format_text(input_text)

# Print the formatted text for inspection
print(formatted_text)


Castellano - Kichwa

A
a cambio de, adv. ranti, rantimpa.
a continuación, adv. kipa; chaymanta.
a diario, adv. punchanta punchanta, punchantin.
a gusto, adv. ninantak.
a tiempo, adv. llikchalla, kachka.
ábaco qichwa (objeto), s. yupana.
abajo, adv. uray, wayku.
abandonado, adj. sapalla, sakishka, shitashka, hichushka.
abandonar, v. sakina, hichuna, shitana.
abdómen, s. wiksa.
abeja,  s.  wayrunku,  chullumpi,  putan
chuspi.
abismo, s. kaka.
ablandarse, v. llampuna, apiyana.
abonanzar, v. kasiyachina.
abonar, el terreno, s.  wanuna.
abono, s.  wanu, isma.
aborrecer, v. chiknina, millana.
abortar, v. shulluna.
aborto,  s.  shullu,  shullushka,  shullushka
wawa.
abrazar, s. ukllana.
abrigado, adj. kunuk.
abril, s. <*ayriwa.
abrir, v. paskana, chiktana; a presión manual: chillpina.
absolutamente  todo,  adj.  tukuypacha,
illakta, tukuymashna.
absor ber, v. chumkana, tsumkana.
abuela, s. hatuku mama, hatunmama.
abuelo, s. hatuku yaya, hatunyaya.
abultado, adj. raku.
abundar, v. kamana, nana

In [None]:
# Save the formatted text to a file.
# Explicit UTF-8: the text contains accented characters, and the parsers in
# this notebook open their input as UTF-8.
output_file_path = "Outputs/Clean txt/cas_kich_F.txt"  # Replace with the path to your output text file
with open(output_file_path, "w", encoding="utf-8") as f:
    f.write(formatted_text)

In [None]:
import re

# Read text from file
def read_text_from_file(file_path):
    """
    Return the full contents of a text file.

    The file is decoded as UTF-8 explicitly, like the other readers in this
    notebook, so the result does not depend on the platform's default encoding.

    Parameters:
    file_path (str): Path of the file to read.

    Returns:
    str: The file contents.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Format text based on uppercase words followed by a hyphen
def format_text_based_on_uppercase(input_text):
    """
    Merge continuation lines into the entry that precedes them.

    An entry begins on a line whose first whitespace-separated token is
    uppercase and ends with a hyphen. All other lines are appended, separated
    by a space, to the entry being built.

    Parameters:
    input_text (str): The text to regroup.

    Returns:
    str: One entry per line.
    """
    merged = []
    buffer = ""

    for raw in input_text.split('\n'):
        stripped = raw.strip()
        tokens = stripped.split()
        opens_entry = bool(tokens) and tokens[0].isupper() and tokens[0].endswith('-')

        if not opens_entry:
            # Continuation (or blank) line: glue it onto the current entry
            buffer += " " + stripped
            continue

        # A new entry starts: store the one collected so far
        if buffer:
            merged.append(buffer.strip())
        buffer = stripped

    # Store whatever is left after the last line
    if buffer:
        merged.append(buffer.strip())

    return '\n'.join(merged)

# Replace uppercase words with word + line break
def replace_uppercase_words(input_text):
    """
    Insert a line break before every run of uppercase letters/whitespace that
    is immediately followed by '.-'.

    Parameters:
    input_text (str): The text to process.

    Returns:
    str: The text with a newline placed in front of each match.
    """
    return re.sub(
        r'([A-Z\s]+[.]-)',
        lambda found: '\n' + found.group(1),  # keep the match, prefixed by a line break
        input_text,
    )

# Function to remove initial and trailing blank spaces from every line in a text file
def remove_spaces_from_file(input_file_path, output_file_path):
    """
    Strip leading and trailing whitespace from every line of a text file.

    The whole input is read before the output is opened, so the same path can
    be passed for both arguments. Both files are handled as UTF-8, matching
    how the later cells of this notebook read the cleaned files.

    Parameters:
    input_file_path (str): File to read.
    output_file_path (str): File to write; lines are joined with '\\n' and no
                            trailing newline is added.
    """
    # Read the file line by line and strip initial and trailing spaces
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        stripped_lines = [line.strip() for line in infile]

    # Write the new content to the output file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        outfile.write('\n'.join(stripped_lines))

In [None]:
# Strip leading/trailing whitespace from every line of segmentedAF.txt, in place.
# Replace with the path to your text file
input_file_path = "Outputs/Clean txt/segmentedAF.txt"

# Read text from file
#input_text = read_text_from_file(input_file_path)

# Apply the formatting function
#formatted_text = replace_uppercase_words(input_text)

# Output the formatted text (or save it to a file)
#print(formatted_text)

# The output path is the same as the input path, so the file is overwritten
output_file_path = "Outputs/Clean txt/segmentedAF.txt"

remove_spaces_from_file(input_file_path, output_file_path) 

# with open(output_file_path, "w") as f:
#     f.write(formatted_text)

In [42]:
# Define the function to remove all double spaces (i.e., empty lines) from a text file
def remove_double_spaces(input_file_path, output_file_path):
    """
    Remove empty (whitespace-only) lines from a text file.

    Remaining lines are also stripped of leading and trailing whitespace. The
    whole input is read before the output is opened, so the same path can be
    passed for both arguments. Both files are handled as UTF-8, matching how
    the later cells of this notebook read the cleaned files.

    Parameters:
    input_file_path (str): File to read.
    output_file_path (str): File to write; lines are joined with '\\n' and no
                            trailing newline is added.
    """
    # Keep only the lines that still have content after stripping
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        non_empty_lines = [line.strip() for line in infile if line.strip()]

    # Write the non-empty lines back to the file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        outfile.write('\n'.join(non_empty_lines))

In [38]:
# As a last processing step, we need to split the DINEIB file into the sentences and the words

# Function to split a file into two different files based on the presence of '=' symbol in each line
def split_file_based_on_symbol(input_file_path, file1_path, file2_path):
    """
    Split a text file in two according to whether each line contains '='.

    All files are handled as UTF-8, matching how the later cells of this
    notebook read the resulting files.

    Parameters:
    input_file_path (str): File to read.
    file1_path (str): Receives the (stripped) lines that contain '='.
    file2_path (str): Receives the (stripped) lines that do not, including
                      blank lines.
    """
    # Initialize lists to hold lines for each output file
    lines_file1 = []
    lines_file2 = []

    # Route every line to one of the two lists
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            target = lines_file1 if '=' in line else lines_file2
            target.append(line.strip())

    # Write each list to its file, joined with '\n' (no trailing newline)
    for path, lines in ((file1_path, lines_file1), (file2_path, lines_file2)):
        with open(path, 'w', encoding='utf-8') as outfile:
            outfile.write('\n'.join(lines))

In [43]:
# Split the DINEIB file: lines containing '=' go to file1, the rest to file2.
# (`input_path` is used instead of `input` so the builtin input() is not shadowed.)
input_path = "Outputs/Clean txt/DINEIB_F.txt"
file1 = "Outputs/Clean txt/DINEIB_F_1.txt"
file2 = "Outputs/Clean txt/DINEIB_F_2.txt"

split_file_based_on_symbol(input_path, file1, file2)

In [44]:
# Drop the empty lines from the second DINEIB file, overwriting it in place
remove_double_spaces(file2, file2)

In [45]:
# Clean the general grammar text: strip every line, then drop the empty ones.
# (`input_path` is used instead of `input` so the builtin input() is not shadowed.)
input_path = "Data/General_grammar.txt"
output = "Outputs/Clean txt/General_grammar_F.txt"
remove_spaces_from_file(input_path, output)
remove_double_spaces(output, output)

In [92]:
# Clean the Kichwa texts: strip every line, then drop the empty ones.
# (`input_path` is used instead of `input` so the builtin input() is not shadowed.)
input_path = 'Data/textosKICHWA.txt'
output = 'Outputs/Clean txt/textosKICHWA_F.txt'

remove_spaces_from_file(input_path, output)
remove_double_spaces(output, output)

In [115]:
import json

def parse_text_to_json(text):
    """
    Parse "Instrucción:/Entrada:/Salida:" formatted text into a list of dicts.

    A line starting with one of the three field names opens that field; its
    value is whatever follows the colon plus every following line up to the
    next field line (joined with spaces). An entry is complete, and added to
    the result, once its "Salida" field has been fully read, i.e. when the
    next field line or the end of the text is reached.

    Parameters:
    text (str): The text to parse.

    Returns:
    list: One dict per entry, keyed by the field names found for that entry.
          Fields seen before a "Salida" are carried into the entry that the
          next "Salida" closes; text before the first field line is ignored.
    """
    field_prefixes = ("Instrucción:", "Entrada:", "Salida:")

    dataset = []
    entry = {}
    current_key = None
    current_value = ""

    def _close_field():
        # Store the field being read; a finished "Salida" also finishes the entry.
        nonlocal entry, current_key, current_value
        if current_key is None:
            return
        entry[current_key] = current_value.strip()
        if current_key == "Salida":
            dataset.append(entry)
            entry = {}
        current_key = None
        current_value = ""

    for line in text.split("\n"):
        line = line.strip()
        if line.startswith(field_prefixes):
            _close_field()
            # partition on the first colon works with or without a space (or
            # any value at all) after it
            current_key, _, rest = line.partition(":")
            current_value = rest.strip()
        elif current_key is not None:
            # Continuation line of the field currently being read
            current_value += " " + line

    # The last field has no following field line to close it
    _close_field()
    return dataset


In [112]:
def read_text_file(file_path):
    """
    Load a UTF-8 text file into memory.

    Parameters:
    file_path (str): Path of the file to read.

    Returns:
    str: The complete file contents.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()
    return contents

In [116]:
# NOTE: for this file the parser did not produce usable values, so its output was
# only used to create the dict skeleton and the values were then added manually.

input_file_path = 'Outputs/Clean txt/textosKICHWA_F.txt'
output_file_path = 'Outputs/JSON/Base_JSONs/textosKICHWA_F_BASE.json'

text = read_text_file(input_file_path)

# Parsing the text
dataset = parse_text_to_json(text)

# ensure_ascii=False keeps accented characters readable in the JSON file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# Display the path that was written
output_file_path

'Outputs/JSON/Base_JSONs/textosKICHWA_F_BASE.json'

# Creating the JSON dictionaries

The cleaned data comes from:

1. Ministerio de Educación del Ecuador - Libro del maestro y diccionario
2. DINEIB 
3. Recursos para enseñanza de kichwa USFQ

In [16]:
# This is to improve the initial random state of the dataset
import json
import random

input_file = 'Outputs/JSON/KichwaTexts_Basis.json'

def randomize_json(input_path, output_path):
    """
    Shuffle the top-level array of a JSON file and save it to another file.

    Parameters:
    input_path (str): JSON file to read; its top-level value must be a list.
    output_path (str): JSON file to write the shuffled list to (UTF-8,
                       non-ASCII characters kept as-is, 4-space indent).
    """
    # Use the function's own arguments rather than the notebook globals
    # `input_file` / `output_file`, so the call does what its arguments say.
    with open(input_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Shuffle the dataset in place (unseeded: the order differs on every run)
    random.shuffle(dataset)

    # Save the shuffled dataset to the requested JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

# Shuffle ten times in a chain: each pass reads the file written by the previous
# pass and writes a new numbered file. `input_file` is rebound on every pass, so
# after this cell it points at the last shuffled file, not the original.
for i in range(10):
    output_file = f'Outputs/JSON/KichwaTexts_Basis_SHUFFLED_{i}.json'
    randomize_json(input_file, output_file)
    input_file = output_file


### This formats the 'cas_kich_F' file

This one has a 'spanish, word_type. kichwa' structure so let's go for it:

In [4]:
import json
import random

def parse_to_json_spanish_to_kichwa(file_path):
    """
    Build instruction-style records from a 'spanish, word_type. kichwa' file.

    Each non-empty line, e.g. "abajo, adv. uray, wayku.", becomes a dict with:
      - "Instrucción": a randomly chosen question/instruction about the Spanish word,
      - "Entrada": "<spanish>, <word_type>" (or just the Spanish word when no
        word type is found),
      - "Salida": the Kichwa translation.

    The word type is recognised as a short abbreviation ending in a period
    that directly follows a comma. Lines that do not match that shape fall
    back to a plain split on the first comma. Lines without any comma are
    reported on stdout and skipped.

    Parameters:
    file_path (str): Path of the UTF-8 text file to parse.

    Returns:
    list: One dict per successfully parsed line.
    """
    import re  # local import: the cell defining this function only imports json and random

    # "<spanish>, <abbr>. <kichwa>". The lazy first group lets the Spanish part
    # itself contain commas ("abonar, el terreno, s. wanuna.").
    typed_entry = re.compile(
        r'^(?P<spanish>.+?),\s*(?P<type>[^\W\d_]{1,7}\.)\s+(?P<kichwa>\S.*)$'
    )

    dataset = []

    # Define instruction variations
    instruction_variations = [
        "¿Cómo se traduce la palabra española '{word}' al Kichwa?",
        "Dame la traducción en Kichwa de '{word}'.",
        "¿Qué significa '{word}' en Kichwa?",
        "Traduce '{word}' al Kichwa.",
        "¿Cómo traducirías el español '{word}' a Kichwa?",
        "¿Cuál es la palabra en Kichwa que corresponde a '{word}' en español?",
        "¿Cómo se dice '{word}' en Kichwa?",
        "Ofrece el equivalente en Kichwa de la palabra española '{word}'.",
        "¿Qué palabra en Kichwa se usa para '{word}'?",
        "¿Cómo se traduciría '{word}' al Kichwa?",
        "Proporciona la traducción al Kichwa de '{word}'.",
        "Indica el término en Kichwa que se relaciona con '{word}'.",
        "¿Cuál es la versión en Kichwa de '{word}'?",
        "Menciona la palabra en Kichwa que se asemeja a '{word}'.",
        "Dime cómo se expresa '{word}' en el idioma Kichwa.",
        "¿Cuál es el sinónimo en Kichwa de '{word}'?",
        "¿Cómo se podría decir '{word}' en Kichwa?",
        "¿Cómo se enunciaría '{word}' en el idioma Kichwa?",
        "¿Cuál sería la equivalencia de '{word}' en Kichwa?",
        "Describe la traducción de '{word}' al Kichwa.",
        "Revela el significado en Kichwa de '{word}'.",
        "Brinda la conversión de '{word}' al Kichwa.",
        "Explica cómo se diría '{word}' en Kichwa.",
        "¿De qué manera se podría traducir '{word}' al Kichwa?",
        "¿Existe una palabra en Kichwa que sea similar a '{word}'?",
        "Encuentra la palabra en Kichwa correspondiente a '{word}'.",
        "¿Qué término usarías en Kichwa para '{word}'?",
        "¿Cómo se representa '{word}' en el idioma Kichwa?",
        "¿Cómo se interpretaría '{word}' en Kichwa?",
        "¿Qué palabra se asemeja a '{word}' en el idioma Kichwa?"
    ]

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            try:
                word_type = ""
                typed = typed_entry.match(line)

                if typed:
                    # Expected shape: the word type sits between the Spanish
                    # word and the Kichwa translation
                    spanish_word = typed.group('spanish')
                    word_type = typed.group('type')
                    kichwa_translation = typed.group('kichwa')
                else:
                    # Fallback: split by the first comma; raises ValueError
                    # when the line has no comma at all
                    spanish_word_and_type, kichwa_translation = line.split(',', 1)

                    # Check for period for word type
                    if '.' in spanish_word_and_type:
                        spanish_word, word_type = spanish_word_and_type.split('.', 1)
                    else:
                        spanish_word = spanish_word_and_type  # If no period, take the whole part as the word

                # Remove unwanted spaces
                spanish_word = spanish_word.strip()
                word_type = word_type.strip()
                kichwa_translation = kichwa_translation.strip()

                # Randomly select an instruction
                instruction = random.choice(instruction_variations).format(word=spanish_word)

                dataset.append({
                    "Instrucción": instruction,
                    "Entrada": f"{spanish_word}, {word_type}" if word_type else spanish_word,
                    "Salida": kichwa_translation
                })

            except ValueError as e:
                # Malformed line: report it and keep going
                print(f"Error on line {line_number}: {line}")
                print(f"Exception: {e}")

    return dataset

# Convert the cleaned Castellano-Kichwa word list into instruction records
file_path = 'Outputs/Clean txt/Final/cas_kich_F.txt'
dataset = parse_to_json_spanish_to_kichwa(file_path)

# Save the dataset as a JSON file (ensure_ascii=False keeps accented characters readable)
output_file_path = 'Outputs/JSON/Base_JSONs_2/cas_kich_BASE.json'
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

### This formats the 'DIC_MIN' txt

This txt has a specific format which includes word pronunciations inside brackets '[]'

In [3]:
import json
import random

def parse_to_json_random_instruction(file_path):
    """
    Build instruction-style records from a 'word [pronunciations] definition' file.

    Each well-formed line becomes a dict with:
      - "Instrucción": a randomly chosen question/instruction about the word,
      - "Entrada": the word,
      - "Salida": "Palabra: <word>, Pronunciaciones: <...>, Definición: <...>".

    Blank lines are skipped. Lines without a '[' ... ']' pair are reported on
    stdout and skipped instead of aborting the whole run.

    Parameters:
    file_path (str): Path of the UTF-8 text file to parse.

    Returns:
    list: One dict per successfully parsed line.
    """
    # Instruction variations in Spanish; built once, filled in per word
    instruction_variations = [
        "¿Cuál es la traducción de Kichwa a Español, las pronunciaciones y ejemplos de uso de la palabra '{word}'?",
        "Proporciona la traducción, las diferentes pronunciaciones y ejemplos de uso para '{word}' en español.",
        "¿Cómo se pronuncia la palabra '{word}' y qué significa?",
        "¿Qué significa la palabra '{word}' y cómo se pronuncia?",
        "Traduce la palabra '{word}' al español, proporciona las diferentes pronunciaciones y ejemplos de uso.",
        "¿Cómo se dice '{word}' en español? Proporciona las diferentes pronunciaciones y ejemplos de uso.",
        "Indica la traducción de '{word}' al español, junto con sus diferentes pronunciaciones y ejemplos de uso.",
        "¿Qué es la palabra '{word}' en español? Incluye las pronunciaciones y ejemplos de uso.",
        "Explica el significado de '{word}' en español y cómo se pronuncia.",
        "Detalla la traducción al español de la palabra '{word}', así como sus pronunciaciones y ejemplos de uso.",
        "¿De qué forma se utiliza la palabra '{word}' en una oración? Proporciona la traducción al español, las pronunciaciones y ejemplos de uso.",
        "Describe el significado de la palabra '{word}' y ofrece sus distintas pronunciaciones en español.",
        "¿Cómo se traduce y pronuncia '{word}' en español? Proporciona ejemplos de uso.",
        "¿Cuál es el equivalente en español de la palabra '{word}'? Incluye las pronunciaciones y ejemplos de cómo se utiliza.",
        "Ofrece la traducción al español de '{word}', sus distintas formas de pronunciarlo y cómo se usa en una frase.",
        "¿Cómo se diría '{word}' en español? Detalla las pronunciaciones y ejemplos de uso.",
        "Brinda información sobre la palabra '{word}', incluyendo su traducción al español, pronunciaciones y ejemplos de uso.",
        "¿Qué implica la palabra '{word}' en el idioma español? Añade las pronunciaciones y ejemplos de uso.",
    ]

    dataset = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            # Extract the word, pronunciation, and definition
            word, opening, rest = line.partition('[')
            pronun, closing, definition = rest.partition(']')
            if not opening or not closing:
                # Malformed line: report it and keep going
                print(f"Error on line {line_number}: {line}")
                print("Exception: expected the format 'word [pronunciations] definition'")
                continue

            word = word.strip()
            pronun = pronun.strip()
            definition = definition.strip()

            # Randomly select an instruction
            instruction = random.choice(instruction_variations).format(word=word)

            dataset.append({
                "Instrucción": instruction,
                "Entrada": word,
                "Salida": f"Palabra: {word}, Pronunciaciones: {pronun}, Definición: {definition}"
            })

    return dataset

# Parse the cleaned dictionary file into instruction records
file_path = 'Outputs/Clean txt/Final/DIC_MIN_F.txt'
dataset = parse_to_json_random_instruction(file_path)

# Save the dataset as a JSON file (ensure_ascii=False keeps accented characters readable)
output_file_path = 'Outputs/JSON/Base_JSONs_2/DIC_MIN_BASE.json'
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

# Display the path that was written
output_file_path


'Outputs/JSON/Base_JSONs_2/DIC_MIN_BASE.json'

### This formats the 'DINEIB_F_1' and 'Sal_DES_F' files

Both files contain one Kichwa sentence per line followed by its Spanish translation, separated by an equals sign ('=').



In [1]:
import json
import random

def parse_to_json_equals_format(file_path, instruction_variations):
    """
    Parse a text file of 'sentence = translation' lines into instruction entries.

    Every non-empty line is split on its first '=' into a Kichwa sentence (left
    side) and its Spanish translation (right side). Trailing periods are removed
    from the sentence, and a randomly chosen instruction template is filled with
    it. Lines that cannot be processed are reported on stdout and skipped.

    Parameters:
    file_path (str): Path to the UTF-8 text file to parse.
    instruction_variations (list): Instruction templates containing a
        '{sentence}' placeholder; one is picked at random for each line.

    Returns:
    list: One dict per parsed line with the keys "Instrucción", "Entrada"
        (the sentence) and "Salida" (the translation).
    """
    dataset = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines()
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            try:
                # Split on the first '=' only, so the translation may itself contain '='
                kichwa_sentence, spanish_translation = line.split('=', 1)

                # Drop trailing periods, then strip once more: a sentence written
                # as "... ." would otherwise keep a dangling space.
                kichwa_sentence = kichwa_sentence.strip().rstrip('.').rstrip()
                spanish_translation = spanish_translation.strip()

                # Randomly select an instruction and fill in the sentence
                instruction = random.choice(instruction_variations).format(sentence=kichwa_sentence)

                dataset.append({
                    "Instrucción": instruction,
                    "Entrada": kichwa_sentence,
                    "Salida": spanish_translation
                })

            except Exception as e:
                # Best effort: report the offending line and keep going
                print(f"Error on line {line_number}: {line}")
                print(f"Exception: {e}")

    return dataset

In [2]:
# Source text (expected: one "Kichwa sentence = Spanish translation" pair per line)
# and the JSON file the parsed entries are written to
input_file = 'Outputs/Clean txt/Final/DINEIB_F_1.txt'
output_file = 'Outputs/JSON/Base_JSONs_2/DINEIB_F_1_BASE_noperiod.json'

# Prompt templates; the parser fills '{sentence}' with the Kichwa sentence
instruction_variations = [
    "¿Cómo se traduce la siguiente oración en Kichwa al español? '{sentence}'",
    "Dame la traducción en español de la oración en Kichwa: '{sentence}'",
    "¿Qué significa la oración '{sentence}' en español?",
    "¿Cómo se diría '{sentence}' en español?",
    "¿Qué significa el kichwa '{sentence}' en español?",
    "¿Cuál es la interpretación en español de la oración '{sentence}' en Kichwa?",
    "Traduce la oración '{sentence}' al español.",
    "¿Cuál es la versión en español de la oración '{sentence}'?",
    "Ofrece la traducción en español para la oración '{sentence}'.",
    "¿Cómo se podría expresar '{sentence}' en español?",
    "Explica en español el significado de la oración '{sentence}'.",
    "¿Qué oración en español corresponde a '{sentence}' en Kichwa?",
    "Dime cómo se expresa '{sentence}' en español.",
    "¿Cómo se podría traducir '{sentence}' al español?",
    "Proporciona la traducción al español de la oración '{sentence}'.",
    "Indica el significado en español de la oración '{sentence}'.",
    "¿Qué frase en español se asemeja a '{sentence}'?",
    "¿Cómo se verbalizaría '{sentence}' en español?",
    "Descifra el significado en español de la oración '{sentence}'.",
    "¿Cuál sería la equivalencia en español de la oración '{sentence}'?",
    "Revela la traducción en español de '{sentence}'.",
    "¿Cómo se expresaría '{sentence}' en términos españoles?",
    "Transcribe '{sentence}' al español.",
    "Convierte la oración '{sentence}' al español.",
]

# Build one instruction entry per parsed line
dataset = parse_to_json_equals_format(
    file_path=input_file,
    instruction_variations=instruction_variations,
)

# Save the entries; ensure_ascii=False keeps accented characters readable
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)


In [74]:
# Source text (expected: one "Kichwa sentence = Spanish translation" pair per line)
input_file = 'Outputs/Clean txt/Final/Sal_Des_F.txt'
# Written to Base_JSONs_2: that is the folder the "more detail" dictionary merge
# in this notebook reads Sal_Des_F_BASE.json from (this cell previously wrote to
# Base_JSONs, so the merge picked up a stale or missing file).
output_file = 'Outputs/JSON/Base_JSONs_2/Sal_Des_F_BASE.json'

# Prompt templates; the parser fills '{sentence}' with the Kichwa sentence
instruction_variations = [
    "¿Cómo se traduce la siguiente oración en Kichwa al español? '{sentence}'",
    "Dame la traducción en español de la oración en Kichwa: '{sentence}'",
    "¿Qué significa la oración '{sentence}' en español?",
    "¿Cómo se diría '{sentence}' en español?",
    "¿Qué significa el kichwa '{sentence}' en español?",
    "¿Cuál es la interpretación en español de la oración '{sentence}' en Kichwa?",
    "Traduce la oración '{sentence}' al español.",
    "¿Cuál es la versión en español de la oración '{sentence}'?",
    "Ofrece la traducción en español para la oración '{sentence}'.",
    "¿Cómo se podría expresar '{sentence}' en español?",
    "Explica en español el significado de la oración '{sentence}'.",
    "¿Qué oración en español corresponde a '{sentence}' en Kichwa?",
    "Dime cómo se expresa '{sentence}' en español.",
    "¿Cómo se podría traducir '{sentence}' al español?",
    "Proporciona la traducción al español de la oración '{sentence}'.",
    "Indica el significado en español de la oración '{sentence}'.",
    "¿Qué frase en español se asemeja a '{sentence}'?",
    "¿Cómo se verbalizaría '{sentence}' en español?",
    "Descifra el significado en español de la oración '{sentence}'.",
    "¿Cuál sería la equivalencia en español de la oración '{sentence}'?",
    "Revela la traducción en español de '{sentence}'.",
    "¿Cómo se expresaría '{sentence}' en términos españoles?",
    "Transcribe '{sentence}' al español.",
    "Convierte la oración '{sentence}' al español."
]


# Build one instruction entry per parsed line
dataset = parse_to_json_equals_format(input_file, instruction_variations)

# Save the entries; ensure_ascii=False keeps accented characters readable
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)


### This formats the 'DINEIB_F_2' and 'ESP_KC_F' files

Both files use a hyphen ('-') to separate a word in one language from its translation in the other. The DINEIB file goes from Kichwa to Spanish; the ESP_KC file goes from Spanish to Kichwa.

In [3]:
import json
import random

def parse_to_json_hyphen_format(file_path, instruction_variations):
    """
    Parse a text file of 'word - translation' lines into instruction entries.

    Every non-empty line is split on its first '-' into a word (left side) and
    its translation (right side). Trailing periods are removed from the word,
    and a randomly chosen instruction template is filled with it. Lines that
    cannot be processed are reported on stdout and skipped.

    Parameters:
    file_path (str): Path to the UTF-8 text file to parse.
    instruction_variations (list): Instruction templates containing a
        '{word}' placeholder; one is picked at random for each line.

    Returns:
    list: One dict per parsed line with the keys "Instrucción", "Entrada"
        (the word) and "Salida" (the translation).
    """
    dataset = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines()
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            try:
                # Split on the first hyphen only.
                # NOTE(review): a source word that itself contains '-' would be
                # cut at that hyphen — confirm the input files never do this.
                word, translation = line.split('-', 1)

                # Drop trailing periods, then strip once more: a word written
                # as "... ." would otherwise keep a dangling space.
                word = word.strip().rstrip('.').rstrip()
                translation = translation.strip()

                # Randomly select an instruction and fill in the word
                instruction = random.choice(instruction_variations).format(word=word)

                dataset.append({
                    "Instrucción": instruction,
                    "Entrada": word,
                    "Salida": translation
                })

            except Exception as e:
                # Best effort: report the offending line and keep going
                print(f"Error on line {line_number}: {line}")
                print(f"Exception: {e}")

    return dataset

In [4]:
# Kichwa to Spanish (DINEIB)

input_file = 'Outputs/Clean txt/Final/DINEIB_F_2.txt'
# Written to Base_JSONs_2: that is the folder the "more detail" dictionary merge
# in this notebook reads DINEIB_F_2_BASE_noperiod.json from (this cell previously
# wrote to Base_JSONs, so the merge picked up a stale or missing file).
output_file = 'Outputs/JSON/Base_JSONs_2/DINEIB_F_2_BASE_noperiod.json'

# Prompt templates; the parser fills '{word}' with the Kichwa word
instruction_variations = [
    "¿Cómo se traduce la palabra Kichwa '{word}' al español?",
    "Dame la traducción en español de '{word}'.",
    "¿Qué significa '{word}' en español?",
    "Traduce '{word}' al español.",
    "¿Cuál es la palabra en español que corresponde a '{word}' en Kichwa?",
    "¿Cómo se dice '{word}' en español?",
    "Ofrece el equivalente en español de la palabra Kichwa '{word}'.",
    "¿Qué palabra en español se usa para '{word}'?",
    "¿Cómo se traduciría '{word}' al español?",
    "Proporciona la traducción al español de '{word}'.",
    "Indica el término en español que se relaciona con '{word}'.",
    "¿Cuál es la versión en español de '{word}'?",
    "Menciona la palabra en español que se asemeja a '{word}'.",
    "Dime cómo se expresa '{word}' en el idioma español.",
    "¿Cuál es el sinónimo en español de '{word}'?",
    "¿Cómo se podría decir '{word}' en español?",
    "¿Cuál es el equivalente exacto en español para '{word}'?",
    "¿Qué término en español representa a '{word}'?",
    "¿Cómo se podría traducir '{word}' al español?"
]

# Build one instruction entry per parsed line
dataset = parse_to_json_hyphen_format(input_file, instruction_variations)

# Save the entries; ensure_ascii=False keeps accented characters readable
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

In [5]:
# Spanish to Kichwa (ESP_KC_F)

# Source text (expected: one "Spanish word - Kichwa translation" pair per line)
# and the JSON file the parsed entries are written to
input_file = 'Outputs/Clean txt/Final/ESP_KC_F.txt'
output_file = 'Outputs/JSON/Base_JSONs_2/ESP_KC_F_BASE_noperiod.json'

# Prompt templates; the parser fills '{word}' with the Spanish word
instruction_variations = [
    "¿Cómo se traduce la palabra española '{word}' al Kichwa?",
    "Dame la traducción en Kichwa de '{word}'.",
    "¿Qué significa '{word}' en Kichwa?",
    "Traduce '{word}' al Kichwa.",
    "¿Cuál es la palabra en Kichwa que corresponde a '{word}' en español?",
    "¿Cómo se dice '{word}' en Kichwa?",
    "Ofrece el equivalente en Kichwa de la palabra española '{word}'.",
    "¿Qué palabra en Kichwa se usa para '{word}'?",
    "¿Cómo se traduciría '{word}' al Kichwa?",
    "Proporciona la traducción al Kichwa de '{word}'.",
    "Indica el término en Kichwa que se relaciona con '{word}'.",
    "¿Cuál es la versión en Kichwa de '{word}'?",
    "Menciona la palabra en Kichwa que se asemeja a '{word}'.",
    "Dime cómo se expresa '{word}' en el idioma Kichwa.",
    "¿Cuál es el sinónimo en Kichwa de '{word}'?",
    "¿Cómo se podría decir '{word}' en Kichwa?",
    "¿Cuál es el equivalente exacto en Kichwa para '{word}'?",
    "¿Qué término en Kichwa representa a '{word}'?",
    "¿Cómo se podría traducir '{word}' al Kichwa?",
]

# Build one instruction entry per parsed line
dataset = parse_to_json_hyphen_format(
    file_path=input_file,
    instruction_variations=instruction_variations,
)

# Save the entries; ensure_ascii=False keeps accented characters readable
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

### This formats the kallari_rimanakuy_F file

This one emulates a conversation so I'm having a hard time thinking of some way to format it.

In the end, I chose to ignore the dialogue and instead focus on its usefulness as a source of a few examples that differentiate the semantics of Spanish and Kichwa.

In [93]:
import json

def parse_to_json_dialogue_format(file_path):
    """
    Parse the dialogue-style text file into two instruction entries per Kichwa sentence.

    Lines that are empty or start with "A:", "B:" or "Kallari rimanakuy" are
    skipped. A line starting with "Kichwa" opens a block whose next five lines
    are read by position:

        <Kichwa sentence>
        <skipped line>   (the "Estructura del kichwa en español" header)
        <Spanish wording that keeps the Kichwa structure>
        <skipped line>   (the "Español" header)
        <semantically correct Spanish translation>

    Any other line is ignored. If the file ends in the middle of a block, the
    incomplete block is reported on stdout and dropped instead of raising
    IndexError.

    Parameters:
    file_path (str): Path to the UTF-8 text file to parse.

    Returns:
    list: Two dicts per complete block (structure-preserving and correct
        Spanish), each with the keys "Instrucción", "Entrada" and "Salida".
    """
    dataset = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip once up front; every line is only ever used in stripped form
        lines = [raw_line.strip() for raw_line in f]

    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1  # i is now the 1-based number of `line` and the index of the next line

        if not line or line.startswith(("A:", "B:", "Kallari rimanakuy")):
            continue  # Skip dialogue markers and empty lines

        if line.startswith("Kichwa"):
            # The block needs lines[i], lines[i + 2] and lines[i + 4]; a file
            # that ends mid-block would otherwise raise IndexError.
            if i + 4 >= len(lines):
                print(f"Error on line {i}: incomplete 'Kichwa' block at end of file, skipping it.")
                break

            kichwa_sentence = lines[i]
            kichwa_structure = lines[i + 2]  # after the "Estructura del kichwa en español" header
            correct_spanish = lines[i + 4]   # after the "Español" header
            i += 5

            # Entry for maintaining Kichwa structure
            dataset.append({
                "Instrucción": f"¿Cómo se diría la siguiente frase en kichwa, '{kichwa_sentence}', en español conservando la estructura gramatical del kichwa?",
                "Entrada": kichwa_sentence,
                "Salida": kichwa_structure
            })

            # Entry for semantically correct Spanish
            dataset.append({
                "Instrucción": f"¿Cómo se diría la siguiente frase en kichwa, '{kichwa_sentence}', en español?",
                "Entrada": kichwa_sentence,
                "Salida": correct_spanish
            })

    return dataset

In [94]:
# Dialogue source text and the JSON file its parsed entries are saved to
input_file_path = 'Outputs/Clean txt/Final/kallari_rimanakuy_F.txt'
output_file_path = 'Outputs/JSON/Base_JSONs_2/kallari_rimanakuy_F_BASE.json'

# Build the instruction entries from the dialogue file
dataset = parse_to_json_dialogue_format(file_path=input_file_path)

# Save the entries; ensure_ascii=False keeps accented characters readable
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)


In [1]:
# Convert the parquet dataset from huggingface to a json

import pandas as pd

# Source Parquet file and destination JSON file
parquet_file = "Data/train-00000-of-00001-15d700350dbbb567.parquet"
json_file = "Outputs/JSON/spanish_HF.json"

# Read the whole Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)

# Write one JSON record per line (orient='records' with lines=True)
df.to_json(json_file, orient='records', lines=True)


# Finally, let's join datasets together

**Dictionaries:**

These two will be used for less detailed training:
- cas_kich
- DIC_MIN

These will be used for more finely detailed training:
- DINEIB_F_2
- ESP_KC_F
- Sal_Des_F

**Texts:**
- DINEIB_F_1
- Kallari_Rimanakuy
- textosKichwa

In [7]:
import json

def merge_json_files(file_paths, output_file):
    """
    Concatenate the top-level lists of several JSON files into one JSON file.

    Files are read in the given order. A file whose top-level value is not a
    list is reported on stdout and left out of the result.

    Parameters:
    file_paths (list): Paths of the JSON files to merge.
    output_file (str): Path of the JSON file that receives the merged list.
    """
    combined = []

    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as source:
            payload = json.load(source)

        # Only top-level lists can be concatenated; anything else is skipped
        if not isinstance(payload, list):
            print(f"Warning: Data in {file_path} is not a list. Skipping this file.")
            continue

        combined += payload

    # ensure_ascii=False keeps accented characters readable in the output
    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(combined, target, ensure_ascii=False, indent=4)

In [8]:
# Dictionaries (less detailed entries)

# Base JSON files to merge, in order
json_files = [
    "Outputs/JSON/Base_JSONs_2/cas_kich_BASE.json",
    "Outputs/JSON/Base_JSONs_2/DIC_MIN_BASE.json",
]

# Destination of the merged list
output_file_path = "Outputs/JSON/DictionaryEntries_LessDetail.json"

merge_json_files(file_paths=json_files, output_file=output_file_path)


In [11]:
# Dictionaries (more detailed entries)

# Base JSON files to merge, in order
json_files = [
    "Outputs/JSON/Base_JSONs_2/DINEIB_F_2_BASE_noperiod.json",
    "Outputs/JSON/Base_JSONs_2/ESP_KC_F_BASE_noperiod.json",
    "Outputs/JSON/Base_JSONs_2/Sal_Des_F_BASE.json",
]

# Destination of the merged list
output_file_path = "Outputs/JSON/DictionaryEntries_MoreDetail.json"

merge_json_files(file_paths=json_files, output_file=output_file_path)


In [12]:
# Sentences

# Base JSON files to merge, in order
json_files = [
    "Outputs/JSON/Base_JSONs_2/DINEIB_F_1_BASE_noperiod.json",
    "Outputs/JSON/Base_JSONs_2/kallari_rimanakuy_F_BASE.json",
    "Outputs/JSON/Base_JSONs_2/textosKICHWA_F_BASE.json",
]

# Destination of the merged list
output_file_path = "Outputs/JSON/KichwaTexts_Basis.json"

merge_json_files(file_paths=json_files, output_file=output_file_path)

In [13]:
# Join the dictionary and text datasets into a single file

# Previously merged JSON files to combine, in order
json_files = [
    "Outputs/JSON/DictionaryEntries_LessDetail.json",
    "Outputs/JSON/DictionaryEntries_MoreDetail.json",
    "Outputs/JSON/KichwaTexts_Basis.json",
]

# Destination of the merged list
output_file_path = "Outputs/JSON/AllData_Basis.json"

merge_json_files(file_paths=json_files, output_file=output_file_path)

In [14]:
import json

def transform_entry(entry):
    """
    Re-key one record from 'instruction' / 'input' / 'output' to the Spanish
    keys "Instrucción" / "Entrada" / "Salida".

    Parameters:
    entry (dict): Record with 'instruction' and 'output' keys and an optional
        'input' key.

    Returns:
    dict: The re-keyed record; "Entrada" is an empty string when 'input' is absent.
    """
    return {
        "Instrucción": entry['instruction'],
        # 'input' is optional, so fall back to an empty string
        "Entrada": entry.get('input', ''),
        "Salida": entry['output'],
    }

def transform_json_file(input_file_path, output_file_path):
    """
    Convert a JSON Lines file into a single JSON array of re-keyed records.

    Each non-blank line of the input is parsed as one JSON object, passed
    through transform_entry, and the resulting list is written to the output.

    Parameters:
    input_file_path (str): Path to the input file with one JSON object per line.
    output_file_path (str): Path of the JSON file that receives the array.
    """
    # First pass: parse every non-blank line
    records = []
    with open(input_file_path, 'r', encoding='utf-8') as source:
        for raw_line in source:
            if not raw_line.strip():
                continue
            records.append(json.loads(raw_line))

    # Second pass: re-key each parsed record
    converted = list(map(transform_entry, records))

    # ensure_ascii=False keeps accented characters readable in the output
    with open(output_file_path, 'w', encoding='utf-8') as target:
        json.dump(converted, target, ensure_ascii=False, indent=4)

# JSON Lines input and the JSON array file it is converted into
input_file_path = "Outputs/JSON/spanish_HF.json"
output_file_path = "Outputs/JSON/spanish_HF_F.json"

# Re-key every record and write the converted file
transform_json_file(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)
