## Unigram 1000 model training

In [None]:

import sentencepiece as spm

# Define paths
input_file = "hi_100.txt"  # Path to the text corpus file
model_prefix = "unigram1k"      # Prefix for the trained model files
vocab_size = 1000                # Vocabulary size
model_type = "unigram"            # Model type (other options: "bpe", "char", "word")

# Train SentencePiece model
spm.SentencePieceTrainer.Train(
    f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type}"
)

# Load trained model

def unigram_1k(corpus):
    sp = spm.SentencePieceProcessor()
    sp.Load("unigram1k.model")
    tokens = sp.encode_as_pieces(corpus)
    return tokens


## Unigram 2000 model training

In [10]:
import sentencepiece as spm

# Define paths
input_file = "hi_100.txt"  # Path to the text corpus file
model_prefix = "unigram2k"      # Prefix for the trained model files
vocab_size = 2000                # Vocabulary size
model_type = "unigram"            # Model type (other options: "bpe", "char", "word")

# Train SentencePiece model
spm.SentencePieceTrainer.Train(
    f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type}"
)

# Load trained model
def unigram_2k(corpus):
    sp = spm.SentencePieceProcessor()
    sp.Load("unigram2k.model")
    tokens = sp.encode_as_pieces(corpus)
    return tokens

## Bpe 1k model training

In [12]:
import sentencepiece as spm

# Path to your corpus
corpus_file = "hi_100.txt"

# Path to save the trained model
model_prefix = "hindi_bpe_model_1k"

# Define the parameters for training the BPE model
vocab_size = 1000  # You can adjust this based on your corpus size and requirements
model_type = "bpe"  # BPE model type

# Train the BPE model
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=model_prefix, vocab_size=vocab_size, model_type=model_type)
def bpe_1k(corpus):
    sp = spm.SentencePieceProcessor()   
    sp.Load("hindi_bpe_model_1k.model")
    tokens = sp.encode_as_pieces(corpus)
    return tokens

## Bpe 2k model training

In [13]:
import sentencepiece as spm

# Path to your corpus
corpus_file = "hi_100.txt"

# Path to save the trained model
model_prefix = "hindi_bpe_model_2k"

# Define the parameters for training the BPE model
vocab_size = 2000  # You can adjust this based on your corpus size and requirements
model_type = "bpe"  # BPE model type

# Train the BPE model
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=model_prefix, vocab_size=vocab_size, model_type=model_type)
def bpe_2k(corpus):
    sp = spm.SentencePieceProcessor()   
    sp.Load("hindi_bpe_model_2k.model")
    tokens = sp.encode_as_pieces(corpus)
    return tokens

## Mbert Training

In [17]:
from transformers import BertTokenizer

# Load the mBERT tokenizer
tokenizer_mbert_1k = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Define the Hindi line
def mBert_1k(corpus):
    tokens = tokenizer_mbert_1k.tokenize(tokenizer_mbert_1k.decode(tokenizer_mbert_1k.encode(corpus, max_length=1000, truncation=True, padding='max_length')))
    return tokens

In [18]:
from transformers import BertTokenizer

# Load the mBERT tokenizer
tokenizer_mbert_2k = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Define the Hindi line
def mBert_2k(corpus):
    tokens = tokenizer_mbert_2k.tokenize(tokenizer_mbert_2k.decode(tokenizer_mbert_2k.encode(corpus, max_length=2000, truncation=True, padding='max_length')))
    return tokens

## Indic Bert tokens genration

In [21]:
from transformers import AutoTokenizer

# Load the IndicBERT tokenizer
tokenizer_indic_bert = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
def indicBert_1k(corpus):
    tokens = tokenizer_indic_bert.tokenize(tokenizer_indic_bert.decode(tokenizer_indic_bert.encode(corpus, max_length=1000, truncation=True)))
    return tokens

def indicBert_2k(corpus):
    tokens = tokenizer_indic_bert.tokenize(tokenizer_indic_bert.decode(tokenizer_indic_bert.encode(corpus, max_length=2000, truncation=True)))
    return tokens

## White Space Tokenizer

In [25]:
def WhiteSpaceTokenizer(corpus):
    tokens = corpus.split()
    return tokens

In [5]:
def clean_white_space(tokens):
    ans = []
    for token in tokens:
        # token.lstrip()
        ans.append(token)
        # print(token)
    return ans

matra_of_vowels = {'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'े', 'ै', 'ो', 'ौ', 'ं', 'ः', 'ँ'}

# Set containing Hindi vowels
hindi_vowels = {'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः', 'अँ'}

# Set containing Hindi consonants
hindi_consonants = {'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 
                    'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 
                    'श', 'ष', 'स', 'ह', 'क्ष', 'त्र', 'ज्ञ'}
halant = "्"

def clean_token(tokens1):
    tokens = []
    for word in tokens1:
        s=""
        for c in word:
            if (c not in matra_of_vowels) and (c not in hindi_consonants) and (c not in hindi_vowels) and c != halant: continue
            else: s = s + c
        if(len(s)): tokens.append(s)
    return tokens

In [8]:
def find_prf(tokens, ground_truth):
    true_positive = 0
    false_positive = 0
    false_negative = 0

    for i in range(len(tokens)):
        for j in range(len(tokens[i])):
            if tokens[i][j] in ground_truth[i]:
                true_positive += 1
            else:
                false_positive += 1
    
    for i in range(len(ground_truth)):
        for token in ground_truth[i]:
            if token not in tokens[i]:
                false_negative += 1

    precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0
    recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0

    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1_score

# Unigram 1k

Unigram 1k


In [9]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = unigram_1k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
# print(grnd_trth)
# print(len(grnd_trth))
# print(len(tokenss))
prf = find_prf(tokenss, grnd_trth)
print(prf)

[['फ', 'िल', 'हा', 'ल', 'हरियाणा', 'में', 'लाख', 'ह', 'ेक्ट', 'े', 'य', 'र', 'में', 'ग', '्', 'वार', 'की', 'बु', 'आई', 'हुई', 'है', 'और', 'लाख', 'ह', 'ेक्ट', 'े', 'य', 'र', 'में', 'इस', 'की', 'बु', 'आई', 'का', 'ल', 'क्ष', '्य', 'रख', 'ा', 'गया', 'है', 'जबकि', 'पिछले', 'साल', 'की', 'स', 'मान', 'अ', 'व', 'धि', 'में', 'लाख', 'ह', 'ेक्ट', 'े', 'य', 'र', 'में', 'ग', '्', 'वार', 'ब', 'ो', 'या', 'गया', 'था'], ['ग', 'ा', 'ड', 'ियों', 'वा', 'ला', 'काम', 'तो', 'हो', 'गया', 'कोई', 'और', 'बात', 'कह', 'नी', 'हो', 'किसी', 'को', 'प्र', 'धा', 'न', 'जी', 'ने', 'एक', 'बार', 'फिर', 'सब', 'को', 'सं', 'ब', 'ो', 'ध', 'ित', 'करते', 'ह', 'ु', 'ये', 'कहा'], ['मैं', 'सि', 'ख', 'ा', 'को', 'बा', 'थ', 'र', 'ू', 'म', 'में', 'ले', 'गया', 'वहां', 'पर', 'मैं', 'ने', 'उसे', 'द', 'ि', 'वार', 'प', 'क', 'ड', 'वा', 'के', 'ख', 'डा', 'कर', 'दिया', 'फिर', 'सा', 'ब', 'ु', 'न', 'अपने', 'हाथ', 'में', 'ले', 'के', 'ख', 'ू', 'ब', 'झ', 'ा', 'ग', 'बना', 'या'], ['इस', 'क्षेत्र', 'में', 'पोस्ट', 'ग्र', 'ै', 'ज', 'ु', 'ए', 'शन', 'करने',

# unigram 2k

Unigram 2k

In [11]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = unigram_2k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.07979274611398963, 0.2851851851851852, 0.12469635627530366)


Bpe_1k

In [14]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = bpe_1k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.05382674516400336, 0.2379182156133829, 0.0877914951989026)


Bpe_2k

In [15]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = bpe_2k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.06965174129353234, 0.26022304832713755, 0.10989010989010989)


# mbert 1k

In [19]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = mBert_1k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.05067567567567568, 0.2247191011235955, 0.08270158511371468)


# M bert 2k

In [20]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = mBert_2k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.05067567567567568, 0.2247191011235955, 0.08270158511371468)


# indic bert 1k

In [22]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = indicBert_1k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.013171225937183385, 0.04961832061068702, 0.020816653322658127)


# indic bert 2k

In [23]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = indicBert_2k(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.013171225937183385, 0.04961832061068702, 0.020816653322658127)


# WhiteSpaceTokenizer


In [26]:
tokenss = []
with open('corpus.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    
    for line in file:
        
        tokens = WhiteSpaceTokenizer(line)
        tokens = clean_token(tokens)
        tokenss.append(tokens)
# print(tokenss)
grnd_trth = []
with open('ground_truth.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    # for line in file:
    #     print(line)
    #         # Print the list of words
    vector = []
    for line in file:
        # Split the line by commas to create a list of strings (word groups)
        word_groups = line.strip().split(',')
        word_groups = clean_white_space(word_groups)

        # print(word_groups)
        # vector.append(word_groups)
        # Process each word group
        grnd_trth.append(word_groups)
prf = find_prf(tokenss, grnd_trth)
print(prf)

(0.13971742543171115, 0.3308550185873606, 0.19646799116997793)
