### Merge all text files into one merged file


In [1]:
import glob
import os

# Collect every .txt file in the dataset folder (adjust the path if needed).
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to make the merged file identical from run to run.
file_list = sorted(glob.glob("D:/AIT_lecture/NLP/code/Assignment/NLP-2025/NLP-A3/Dataset/*.txt"))

output_file = "D:/AIT_lecture/NLP/code/Assignment/NLP-2025/myword/merged.txt"

# Guard against merging the output into itself if the output path is ever
# moved inside the dataset folder (normcase handles Windows case-insensitivity).
output_path = os.path.normcase(os.path.abspath(output_file))
file_list = [
    path for path in file_list
    if os.path.normcase(os.path.abspath(path)) != output_path
]

# Concatenate all inputs into one UTF-8 file.
with open(output_file, "w", encoding="utf-8") as outfile:
    for file in file_list:
        with open(file, "r", encoding="utf-8") as infile:
            outfile.write(infile.read() + "\n")  # Adds a newline between files

print(f"Merged {len(file_list)} files into {output_file}")

Merged 101 files into D:/AIT_lecture/NLP/code/Assignment/NLP-2025/myword/merged.txt


### Filter out unnecessary markers (leading numbers, ratings, `#` tags)

In [2]:
import re

# Input and output file paths.
# The input must be the file the merge step writes (.../myword/merged.txt);
# pointing at a different folder makes a fresh "Run All" fail or silently
# clean a stale copy.
input_file = r"D:\AIT_lecture\NLP\code\Assignment\NLP-2025\myword\merged.txt"
output_file = "cleaned_output.txt"

# Cleanup patterns, applied to every line in this exact order. Order matters:
# later patterns remove what earlier ones leave behind (e.g. "# 3/5" loses
# "/5" first and the remaining "# 3" afterwards).
CLEANUP_PATTERNS = [
    re.compile(r'^\d+\s*'),                          # numbers at the beginning of a line
    re.compile(r'#\d+/\d+'),                         # ratings like "#3/5", "#4/5", "#5/5"
    re.compile(r'/5'),                               # leftover "/5"
    re.compile(r'#\s*[345]'),                        # "# 3", "# 4", "# 5"
    re.compile(r'#\s*REVIEW', flags=re.IGNORECASE),  # "# REVIEW" in any letter case
    re.compile(r'#'),                                # any remaining "#" symbols
]

# Stream the file line by line and keep only lines that are non-empty after cleaning.
cleaned_lines = []
with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        for pattern in CLEANUP_PATTERNS:
            line = pattern.sub('', line).strip()
        if line:
            cleaned_lines.append(line)

# Write the cleaned text to a new file (one line per entry, no trailing newline)
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.write("\n".join(cleaned_lines))

print(" Cleaning complete! Check 'cleaned_output.txt'.")


 Cleaning complete! Check 'cleaned_output.txt'.


### Split the cleaned text into separate English and Myanmar text files

In [None]:
import re

# Input and output file paths
input_file = "cleaned_output.txt"  # Input cleaned text file
myanmar_output_file = "myn_text.txt"  # Output file for Myanmar text
english_output_file = "eng_text.txt"  # Output file for English text

# A line counts as Myanmar if it contains any code point of the Myanmar Unicode
# block (U+1000-U+109F). The vowel-sign and digit sub-ranges listed separately
# before are already inside this block.
MYANMAR_CHAR = re.compile(r'[\u1000-\u109F]')

# Read the file
with open(input_file, "r", encoding="utf-8") as infile:
    lines = infile.readlines()

# Parallel lists: myanmar_lines[i] is the source sentence of english_lines[i]
myanmar_lines = []
english_lines = []

# Myanmar sentence still waiting for the English line that follows it
pending_myanmar = None

# Last Myanmar sentence actually emitted, used to skip consecutive duplicates
last_myanmar_line = ""

# The file is expected to alternate: a Myanmar line, then its English line.
# Sentences are emitted only as complete pairs so the two output files always
# have the same number of lines and stay aligned:
#   * a duplicate Myanmar sentence is dropped together with its English line;
#   * a Myanmar line with no English line after it (another Myanmar line
#     follows, or the file ends) is dropped;
#   * an English line with no pending Myanmar line is ignored.
for line in lines:
    line = line.strip()
    if not line:  # Only consider non-empty lines
        continue

    if MYANMAR_CHAR.search(line):
        pending_myanmar = line  # The next non-Myanmar line should be its English side
    elif pending_myanmar is not None:
        if pending_myanmar != last_myanmar_line:
            myanmar_lines.append(pending_myanmar)
            english_lines.append(line)
            last_myanmar_line = pending_myanmar
        pending_myanmar = None  # Pair consumed (or skipped as a duplicate)

# Invariant of the pairing loop above; downstream code zips the files by index.
assert len(myanmar_lines) == len(english_lines)

# Write the Myanmar text to the file
with open(myanmar_output_file, "w", encoding="utf-8") as myanmar_out:
    myanmar_out.write("\n".join(myanmar_lines))

# Write the English text to the file
with open(english_output_file, "w", encoding="utf-8") as english_out:
    english_out.write("\n".join(english_lines))


### Create a DataFrame of English–Myanmar sentence pairs

In [8]:
import csv
import pandas as pd

# File paths
english_file = "eng_text.txt"  # Replace with your actual English file
myanmar_file = "myn_text.txt"  # Replace with your actual Myanmar file
output_csv = "output.csv"

# Read both sides of the parallel corpus; line i of one file pairs with line i of the other
with open(english_file, "r", encoding="utf-8") as en_file, open(myanmar_file, "r", encoding="utf-8") as my_file:
    english_lines = en_file.readlines()
    myanmar_lines = my_file.readlines()

# Only the first min_length lines of each file can be paired. A length mismatch
# means the files are out of sync, so report it instead of truncating silently.
min_length = min(len(english_lines), len(myanmar_lines))
if len(english_lines) != len(myanmar_lines):
    print(
        f"Warning: line counts differ (en={len(english_lines)}, my={len(myanmar_lines)}); "
        f"keeping the first {min_length} pairs - check that the sentences are still aligned."
    )

# Prepare data for DataFrame (zip stops at the shorter file, i.e. at min_length)
data = [[en.strip(), my.strip()] for en, my in zip(english_lines, myanmar_lines)]

# Create a DataFrame
df = pd.DataFrame(data, columns=['en', 'my'])

# Save to CSV ("utf-8-sig" writes a BOM at the start of the file)
df.to_csv(output_csv, index=False, encoding="utf-8-sig")

print("CSV file created successfully!")




CSV file created successfully!


In [9]:
# Preview the first five en/my sentence pairs
df.head(n=5)

Unnamed: 0,en,my
0,"And he said, Mama, I'm home.",ထို့နောက် သူက အမေ ကျွန်တော်ပြန်ရောက်ပြီ ဟုပြော...
1,"Well, I wasn't even thinking about that, but I...",ငါက ဒါတွေကိုတောင် စဥ်းစားနေခဲ့တာမဟုတ်ပေမယ့် ငါ...
2,"And I thought that was a privilege, and it's s...",ငါက ဒါဟာ အခွင့်ထူးတယ်လို့ထင်ခဲ့ပြီး အဲဒါက အခု...
3,"They told me that, uh, that I would be called ...",သူတို့ ငါ့ကိုပြောတာကတော့ အဲ ငါ့ကို ယောက်ကျားတစ...
4,There's so much you could talk about on that I...,အဲဒါနဲ့ပါတ်သတ်ပြီး ပြောစရာတွေကများလွန်းလို့ ကျ...


In [10]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Define split sizes
TEST_SIZE = 0.1  # fraction of all rows held out of training (validation + test combined)
VAL_SPLIT = 0.5  # fraction of the held-out rows that goes to the test set; the rest is validation

# Split data into train and temp (test + validation)
train_data, temp_data = train_test_split(df, test_size=TEST_SIZE, random_state=42, shuffle=True)

# Split temp_data into validation and test sets
val_data, test_data = train_test_split(temp_data, test_size=VAL_SPLIT, random_state=42, shuffle=True)

# Give each split a fresh 0..n-1 index to avoid issues with the
# '__index_level_0__' column. Rebinding (instead of inplace=True) avoids
# mutating frames that were derived from df by train_test_split.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Convert Pandas DataFrames to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Print dataset sizes
print(f"Dataset sizes - Train: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")


  from .autonotebook import tqdm as notebook_tqdm


Dataset sizes - Train: 8973, Validation: 499, Test: 499


In [16]:
import os

from huggingface_hub import HfApi, create_repo, login

# Authenticate with Hugging Face.
# SECURITY: never hardcode an access token in a notebook - anyone who can read
# the file can use it. The token is read from the HF_TOKEN environment variable;
# if it is not set, login() receives None and asks for the token interactively.
# The token that was previously committed here must be revoked in the
# Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

# Save the dataset to disk
dataset_dict.save_to_disk('dataset/myn-eng-dataset')

Saving the dataset (1/1 shards): 100%|██████████| 8973/8973 [00:00<00:00, 1124152.15 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 499/499 [00:00<00:00, 124255.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 499/499 [00:00<00:00, 166345.39 examples/s]


In [17]:
repo_id = 'khinhlaing/myn-eng-dataset'

# exist_ok=True makes this cell safe to re-run: without it create_repo raises
# an error once the repository already exists on the Hub.
create_repo(repo_id, repo_type='dataset', private=False, exist_ok=True)

# Push the dataset to Hugging Face (the returned commit info is shown as the cell output)
dataset_dict.push_to_hub(repo_id)

Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 749.99ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.51s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1016.31ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.43s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 983.65ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/khinhlaing/myn-eng-dataset/commit/7abffedb3a3d9989b5d421543f1794be6d675695', commit_message='Upload dataset', commit_description='', oid='7abffedb3a3d9989b5d421543f1794be6d675695', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/khinhlaing/myn-eng-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='khinhlaing/myn-eng-dataset'), pr_revision=None, pr_num=None)

In [18]:
import datasets

# Download the published dataset from the Hugging Face Hub
corpus = datasets.load_dataset(
    'khinhlaing/myn-eng-dataset',
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 8973/8973 [00:00<00:00, 472884.89 examples/s]
Generating validation split: 100%|██████████| 499/499 [00:00<00:00, 187995.84 examples/s]
Generating test split: 100%|██████████| 499/499 [00:00<00:00, 166490.95 examples/s]


In [19]:
# Display the loaded DatasetDict (its splits, features and row counts) as the cell output
corpus

DatasetDict({
    train: Dataset({
        features: ['en', 'my'],
        num_rows: 8973
    })
    validation: Dataset({
        features: ['en', 'my'],
        num_rows: 499
    })
    test: Dataset({
        features: ['en', 'my'],
        num_rows: 499
    })
})