In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import re
import os
# Walk the Kaggle input mount recursively and print the full path of every
# file found, so the dataset layout is visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names; join with the directory for a full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tamil-wikipedia-articles/valid/valid/AD_wiki_47.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AA_wiki_38.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AA_wiki_08.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AC_wiki_99.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AD_wiki_76.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AE_wiki_38.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AA_wiki_59.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AB_wiki_14.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AB_wiki_00.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AB_wiki_54.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AA_wiki_84.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AB_wiki_73.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AC_wiki_83.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AC_wiki_69.txt
/kaggle/input/tamil-wikipedia-articles/valid/valid/AD_wiki_01.txt
/kaggle/in

In [11]:
!pip install open-tamil




In [12]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import tamil
import pickle

# Seed PyTorch's random number generator so runs are repeatable.
# torch.manual_seed is the cell's last expression, so the object it returns
# is what the notebook displays as the cell output.
seed = 1234
torch.manual_seed(seed)

<torch._C.Generator at 0x7d16b946ddd0>

In [13]:
class BytePairEncoder:
    """Word-level byte-pair encoder for Tamil text.

    Text is split on whitespace into words. Every word except the first one
    of a sentence is prefixed with the word-boundary marker 'Ġ'. A word is
    then broken into start symbols:

    * '<EOS>', numbers (digits with ',', '.', '-'), words mixing Tamil and
      Latin letters, and punctuation-only words are kept as ONE atomic
      symbol (preceded by a separate 'Ġ' symbol when the word carries the
      marker);
    * every other word is split into letters with ``tamil.utf8.get_letters``
      (the ``tamil`` module from open-tamil must be imported by the file).

    ``train`` learns merge rules over these symbols and ``encode`` applies
    exactly the same pre-tokenisation before replaying the merges, so both
    sides agree on what a symbol is.

    Attributes:
        vocab_size: maximum number of entries ``train`` puts into ``vocab``.
        vocab: symbol -> integer id.
        inverse_vocab: integer id -> symbol.
        merges: learned merge rules, as (left, right) tuples in learning order.
    """

    def __init__(self, vocab_size=1000):
        """Create an untrained encoder limited to ``vocab_size`` symbols."""
        self.vocab_size = vocab_size
        self.vocab = {}
        self.inverse_vocab = {}
        self.merges = []

    def extract_sentences(self, text):
        """Split ``text`` on '.' and return the stripped pieces plus ' <EOS>'.

        Pieces that produce two or fewer items when split on a single space
        are discarded.
        """
        return [line.strip() + ' <EOS>' for line in text.split(".") if len(line.split(" ")) > 2]

    def _split_letters(self, word):
        """Split ``word`` into letters using open-tamil's ``get_letters``."""
        return tamil.utf8.get_letters(word)

    def _word_to_symbols(self, word):
        """Return the list of start symbols for one (optionally 'Ġ'-prefixed) word.

        Shared by ``train`` and ``encode`` so that both use one definition of
        the pre-tokenisation.
        """
        has_marker = word.startswith('Ġ')
        core_word = word[1:] if has_marker else word

        is_atomic = (
            core_word == '<EOS>'
            # numbers such as 1,000 / 12-34
            or re.match(r'^[\d,.-]+$', core_word)
            # Tamil letters glued to Latin letters (either order)
            or re.search(r'[க-ஹ]+[a-zA-Z]+|[a-zA-Z]+[க-ஹ]+', core_word)
            # punctuation / symbols only
            or re.match(r'^[\W_]+$', core_word)
        )
        if is_atomic:
            # Only words that really carry the marker get a 'Ġ' symbol;
            # sentence-initial words must not gain a spurious word boundary.
            return ['Ġ', core_word] if has_marker else [core_word]
        return list(self._split_letters(word))

    @staticmethod
    def _merge_symbols(symbols, pair):
        """Return ``symbols`` with every adjacent occurrence of ``pair`` fused.

        Works on whole symbols (left to right, non-overlapping), so a pair can
        never match the tail of one symbol and the head of another the way a
        plain substring replace on the joined string would.
        """
        left, right = pair
        merged = []
        i = 0
        last = len(symbols) - 1
        while i <= last:
            if i < last and symbols[i] == left and symbols[i + 1] == right:
                merged.append(left + right)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return merged

    def train(self, content):
        """Learn the vocabulary and merge rules from the raw text ``content``.

        Any previously learned state is discarded first. The number of base
        symbols is printed before merging starts. Merging stops when
        ``vocab`` reaches ``vocab_size`` entries or no adjacent pair is left.
        """
        # Start from a clean slate so calling train() twice cannot mix ids.
        self.vocab = {}
        self.inverse_vocab = {}
        self.merges = []

        # word_counts = {low: 1, Ġlower: 2, Ġhigh: 1}
        word_counts = {}
        for sentence in self.extract_sentences(content):
            for position, word in enumerate(sentence.split()):
                if position:  # every word but the first carries the marker
                    word = "Ġ" + word
                word_counts[word] = word_counts.get(word, 0) + 1

        # word_vocab = {"l o w": 1, "Ġ l o w e r": 2, "Ġ h i g h": 1}
        word_vocab = {}
        for word, count in word_counts.items():
            key = " ".join(self._word_to_symbols(word))
            # Accumulate: different surface words may share a symbol sequence.
            word_vocab[key] = word_vocab.get(key, 0) + count

        # initial_vocab = {l, o, w, e, r, h, i, g, Ġ}
        initial_vocab = set()
        for key in word_vocab:
            initial_vocab.update(key.split())

        # Sorted so ids are assigned deterministically.
        for i, symbol in enumerate(sorted(initial_vocab)):
            self.vocab[symbol] = i
            self.inverse_vocab[i] = symbol

        next_id = len(self.vocab)
        print(next_id)
        while len(self.vocab) < self.vocab_size:
            pairs = self._get_pairs(word_vocab)
            if not pairs:
                break
            best_pair = max(pairs, key=pairs.get)
            self.merges.append(best_pair)

            new_token = best_pair[0] + best_pair[1]
            # A merged string can coincide with an existing atomic symbol;
            # keep its id instead of orphaning the old one.
            if new_token not in self.vocab:
                self.vocab[new_token] = next_id
                self.inverse_vocab[next_id] = new_token
                next_id += 1

            bigram = " ".join(best_pair)
            merged_vocab = {}
            for key, count in word_vocab.items():
                # Cheap substring pre-filter; the symbol-level merge below is
                # what decides whether the pair is really present.
                if bigram in key:
                    key = " ".join(self._merge_symbols(key.split(), best_pair))
                merged_vocab[key] = merged_vocab.get(key, 0) + count
            word_vocab = merged_vocab

    def _get_pairs(self, word_vocab):
        """Count adjacent symbol pairs over ``word_vocab``, weighted by word count.

        ``word_vocab`` maps space-separated symbol strings to their counts.
        Returns a dict mapping (left, right) tuples to total frequency.
        """
        pairs = {}
        for word, count in word_vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pair = (symbols[i], symbols[i + 1])
                pairs[pair] = pairs.get(pair, 0) + count
        return pairs

    def _apply_merges(self, symbols, ranks):
        """Replay learned merges on ``symbols``; ``ranks`` maps pair -> learning order.

        Repeatedly fuses the earliest-learned pair that is present, which
        gives the same result as applying every merge rule in learning order.
        """
        while len(symbols) > 1:
            candidates = [p for p in zip(symbols, symbols[1:]) if p in ranks]
            if not candidates:
                break
            best = min(candidates, key=ranks.__getitem__)
            symbols = self._merge_symbols(symbols, best)
        return symbols

    def encode(self, text):
        """Convert ``text`` into a list of vocabulary ids.

        Words are pre-tokenised exactly as in ``train`` (the first word
        without marker, later words prefixed with 'Ġ'). A resulting symbol
        that is not in ``vocab`` is retried character by character;
        characters that are still unknown are dropped, because the
        vocabulary has no unknown-token id.
        """
        ranks = {}
        for rank, pair in enumerate(self.merges):
            ranks.setdefault(tuple(pair), rank)

        tokens = []
        for position, word in enumerate(text.split()):
            if position:
                word = 'Ġ' + word
            symbols = self._apply_merges(self._word_to_symbols(word), ranks)
            for symbol in symbols:
                if symbol in self.vocab:
                    tokens.append(self.vocab[symbol])
                else:
                    tokens.extend(self.vocab[ch] for ch in symbol if ch in self.vocab)
        return tokens

    def decode(self, ids):
        """Convert ids back to text.

        Ids missing from ``inverse_vocab`` become '<UNK>'. The word-boundary
        marker 'Ġ' is turned back into a space so that
        ``decode(encode(text))`` restores the spacing of ``text``.
        """
        pieces = [self.inverse_vocab.get(i, '<UNK>') for i in ids]
        return ''.join(pieces).replace('Ġ', ' ')




In [14]:
def save_model_pickle(bpe, filename="bpe_model.pkl"):
    """Serialise ``bpe`` with pickle into the file ``filename``.

    The file is opened in binary write mode, so an existing file of the same
    name is overwritten. Returns None.
    """
    with open(filename, mode="wb") as sink:
        pickle.Pickler(sink).dump(bpe)

def load_model_pickle(filename="bpe_model.pkl"):
    """Read the file ``filename`` and return the object pickled in it.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, mode="rb") as source:
        restored = pickle.Unpickler(source).load()
    return restored


In [15]:
import os

def load_text_from_folder(folder_path):
    """Read every file under ``folder_path`` and return their contents joined by spaces.

    The tree is walked recursively in sorted order (directories and file
    names), so the returned string is the same on every run and every
    filesystem. Files are read as UTF-8 text. A file that cannot be opened
    or decoded is reported with a "Skipping ..." message and left out.

    Args:
        folder_path: root directory to read from.

    Returns:
        The contents of all readable files, separated by a single space.
        An empty string if nothing could be read.
    """
    all_text = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend into sub-directories in a
        # deterministic order instead of whatever the filesystem returns.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    all_text.append(f.read())
            except (OSError, UnicodeError) as e:
                # Best effort: an unreadable or non-UTF-8 file must not abort the load.
                print(f"Skipping {file_path}: {e}")
    return " ".join(all_text)

# Directory of text files used as the training corpus
# (presumably the validation split of the dataset, judging by the path).
folder_path = "/kaggle/input/tamil-wikipedia-articles/valid/valid"
# Read every file below folder_path into one space-joined string.
full_corpus = load_text_from_folder(folder_path)
# Corpus size in characters.
print(len(full_corpus))
# Optional: uncomment to train on a short prefix while experimenting.
#full_corpus = full_corpus[:10000]
# Learn merges until the vocabulary holds 50000 entries or no pair is left.
bpe = BytePairEncoder(vocab_size=50000)
bpe.train(full_corpus)
# Optional: uncomment to inspect the learned merge rules.
#print(bpe.merges)
# Persist the trained encoder so that it can be reloaded without retraining.
save_model_pickle(bpe, "bpe_model.pkl")  

73619256
211
[('Ġ', '<EOS>'), ('Ġ', 'இ'), ('Ġ', 'அ'), ('ம்', ','), ('Ġ', 'க'), ('ட்', 'ட'), ('Ġ', 'எ'), ('Ġ', 'வ'), ('Ġ', 'ப'), ('த்', 'தி'), ('ந்', 'து'), ('Ġ', 'ம'), ('க்', 'கு'), ('Ġ', 'ஆ'), ('தி', 'ரு'), ('ப்', 'ப'), ('க', 'ள்'), ('நெ', 'ல்'), ('Ġ', 'உ'), ('ந்', 'த'), ('Ġ', 'திரு'), ('வ', 'ட்ட'), ('Ġஎ', 'ன்'), ('க்', 'க'), ('ற்', 'கு'), ('ய', 'ர்'), ('Ġ', 'த'), ('த்', 'து'), ('யி', 'ல்'), ('வே', 'லி'), ('ன்', 'ற'), ('நெல்', 'வேலி'), ('கு', 'ம்'), ('Ġ', 'ந'), ('ங்', 'க'), ('ள்', 'ள'), ('ற்', 'று'), ('Ġ', 'செ'), ('Ġஅ', 'ரு'), ('த்தி', 'ல்'), ('Ġமா', 'வட்ட'), ('ற', 'து'), ('த்', 'த'), ('ல்', 'லு'), ('ரு', 'ம்'), ('Ġதிரு', 'நெல்வேலி'), ('வி', 'ல்'), (':', '-'), ('Ġ', 'சி'), ('வ', 'ர்'), ('Ġஇ', 'ம்'), ('மா', 'வட்ட'), ('ல்லு', 'ரி'), ('Ġஅரு', 'வி'), ('க', 'ளி'), ('Ġகோ', 'வில்'), ('Ġஅ', 'மை'), ('Ġந', 'தி'), ('Ġ', 'சு'), ('Ġஇம்', 'மாவட்ட'), ('Ġபோ', 'ன்ற'), ('Ġக', 'ல்லுரி'), ('ள', 'ம்'), ('ற்', 'ற'), ('Ġ', 'ச'), ('Ġஇ', 'ட'), ('ப', 'ர'), ('யு', 'ம்'), ('Ġவ', 'ர'), ('ப்ப', 'டு'), ('கி', 'றது'