In [1]:
%cd ..

/workspace/sound_asr


In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
[0m

In [11]:
import sentencepiece as spm
import os

# Path to the unigram file and output directory
unigram_file = "librispeech-vocab.txt"  # Replace with your unigram file path
output_dir = "./sentencepiece_model"
preprocessed_file = "./preprocessed_unigrams.txt"  # Temp file
vocab_size = 30000  # Desired vocabulary size

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Preprocess the unigram file to create synthetic "sentences"
print("[INFO] Preprocessing unigram file...")
with open(unigram_file, "r", encoding="utf-8") as infile, open(preprocessed_file, "w", encoding="utf-8") as outfile:
    batch_size = 100
    batch = []
    for line in infile:
        word = line.strip()
        if word:
            batch.append(word)
        if len(batch) >= batch_size:
            outfile.write(" ".join(batch) + "\n")
            batch = []
    if batch:
        outfile.write(" ".join(batch) + "\n")

print(f"[INFO] Preprocessed file saved to {preprocessed_file}")

# Train the SentencePiece model
model_prefix = os.path.join(output_dir, "librispeech_unigram_model")
spm.SentencePieceTrainer.train(
    input=preprocessed_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type="unigram",
    character_coverage=1.0,
    bos_id=-1,
    eos_id=-1,
    unk_id=0,
    pad_id=1,
    user_defined_symbols=["▁"],
    shuffle_input_sentence=True,
    normalization_rule_name="identity"  # Disable normalization
)

print(f"[INFO] SentencePiece model saved to {model_prefix}.model and {model_prefix}.vocab")

# Load and test the model
sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")

# Test encoding and decoding
text = "THIS IS AN EXAMPLE."  # Adjust case and spacing as needed
token_ids = sp.encode(text, out_type=int)
tokens = sp.encode(text, out_type=str)
decoded_text = sp.decode(token_ids)

print(f"Input Text: {text}")
print(f"Token IDs: {token_ids}")
print(f"Tokens: {tokens}")
print(f"Decoded Text: {decoded_text}")

# Inspect the vocab
with open(f"{model_prefix}.vocab", "r", encoding="utf-8") as f:
    vocab = f.readlines()
print("First 50 tokens in vocab:", vocab[:50])


[INFO] Preprocessing unigram file...
[INFO] Preprocessed file saved to ./preprocessed_unigrams.txt
[INFO] SentencePiece model saved to ./sentencepiece_model/librispeech_unigram_model.model and ./sentencepiece_model/librispeech_unigram_model.vocab
Input Text: THIS IS AN EXAMPLE.
Token IDs: [2, 1814, 2, 72, 2, 27, 2, 9119, 0]
Tokens: ['▁', 'THIS', '▁', 'IS', '▁', 'AN', '▁', 'EXAMPLE', '.']
Decoded Text: THIS IS AN EXAMPLE ⁇ 
First 50 tokens in vocab: ['<unk>\t0\n', '<pad>\t0\n', '▁\t0\n', 'E\t-1.02664\n', 'S\t-2.36567\n', "'\t-3.37786\n", 'D\t-4.16412\n', 'ING\t-4.55007\n', 'A\t-4.61137\n', 'Y\t-4.67708\n', 'T\t-4.83491\n', 'N\t-4.92079\n', 'L\t-5.10186\n', 'O\t-5.15328\n', 'ED\t-5.15388\n', 'R\t-5.15845\n', 'IN\t-5.20676\n', 'LY\t-5.22307\n', 'I\t-5.32059\n', 'UN\t-5.52611\n', 'ER\t-5.68763\n', 'M\t-5.69725\n', 'K\t-5.96746\n', 'TH\t-6.08362\n', 'AL\t-6.10076\n', 'MAN\t-6.15482\n', 'US\t-6.19319\n', 'AN\t-6.20053\n', 'NESS\t-6.2134\n', 'C\t-6.21874\n', 'B\t-6.26467\n', 'H\t-6.26497\n', 

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./preprocessed_unigrams.txt
  input_format: 
  model_prefix: ./sentencepiece_model/librispeech_unigram_model
  model_type: UNIGRAM
  vocab_size: 30000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: ▁
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: -1
  eos_id: -1
  pad_id: 1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
 

In [96]:
import sentencepiece as spm
import os

# Path to the unigram file and output directory
# unigram_file = "librispeech-vocab.txt"  # Replace with your unigram file path
unigram_file = "aggregated_result.txt"  # Replace with your unigram file path
output_dir = "./sentencepiece_model"
preprocessed_file = "./preprocessed_unigrams.txt"  # Temp file for synthetic "sentences"
# vocab_size = 10000  # Adjust to a smaller vocab size
vocab_size = 1000  # Adjust to a smaller vocab size
batch_size = 10  # Create more synthetic sentences by lowering batch size

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Preprocess the unigram file
print("[INFO] Preprocessing unigram file...")
with open(unigram_file, "r", encoding="utf-8") as infile, open(preprocessed_file, "w", encoding="utf-8") as outfile:
    batch = []
    for line in infile:
        line = line.lower()
        word = line.strip()
        if word:
            batch.append(word)
        if len(batch) >= batch_size:
            outfile.write(" ".join(batch) + "\n")
            batch = []
    if batch:
        outfile.write(" ".join(batch) + "\n")

print(f"[INFO] Preprocessed file saved to {preprocessed_file}")

# Train the SentencePiece model
model_prefix = os.path.join(output_dir, "librispeech_unigram_model1000_new")
spm.SentencePieceTrainer.train(
    input=preprocessed_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type="unigram",
    character_coverage=1.0,
    bos_id=-1,
    eos_id=-1,
    unk_id=0,
    pad_id=1,
    user_defined_symbols=[".", "!", "?", ",", "'"],
    shuffle_input_sentence=True,
    normalization_rule_name="identity"  # Disable normalization
)

print(f"[INFO] SentencePiece model saved to {model_prefix}.model and {model_prefix}.vocab")


[INFO] Preprocessing unigram file...
[INFO] Preprocessed file saved to ./preprocessed_unigrams.txt
[INFO] SentencePiece model saved to ./sentencepiece_model/librispeech_unigram_model1000_new.model and ./sentencepiece_model/librispeech_unigram_model1000_new.vocab


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./preprocessed_unigrams.txt
  input_format: 
  model_prefix: ./sentencepiece_model/librispeech_unigram_model1000_new
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: .
  user_defined_symbols: !
  user_defined_symbols: ?
  user_defined_symbols: ,
  user_defined_symbols: '
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_voc

In [2]:
%cd ..

/workspace/sound_asr


In [3]:
### NEW

import sentencepiece as spm
import os
import re

# Path to the unigram file and output directory
unigram_file = "aggregated_result.txt"  # Replace with your unigram file path
output_dir = "./sentencepiece_model"
preprocessed_file = "./preprocessed_unigrams.txt"  # Temp file for synthetic "sentences"
vocab_size = 1000  # Adjust to a smaller vocab size
batch_size = 10  # Create more synthetic sentences by lowering batch size

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Preprocess the unigram file
print("[INFO] Preprocessing unigram file...")

def remove_punctuation(text):
    """
    Remove all punctuation from text using regex.
    """
    return re.sub(r"[^\w\s]", "", text)  # Retain only word characters and spaces

with open(unigram_file, "r", encoding="utf-8") as infile, open(preprocessed_file, "w", encoding="utf-8") as outfile:
    batch = []
    for line in infile:
        line = line.lower()
        word = remove_punctuation(line.strip())  # Remove punctuation
        if word:
            batch.append(word)
        if len(batch) >= batch_size:
            outfile.write(" ".join(batch) + "\n")
            batch = []
    if batch:
        outfile.write(" ".join(batch) + "\n")

print(f"[INFO] Preprocessed file saved to {preprocessed_file}")

# Train the SentencePiece model
model_prefix = os.path.join(output_dir, "librispeech_unigram_model1000_new2")
spm.SentencePieceTrainer.train(
    input=preprocessed_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type="unigram",
    character_coverage=1.0,
    bos_id=-1,
    eos_id=-1,
    unk_id=0,
    pad_id=1,
    shuffle_input_sentence=True,
    normalization_rule_name="identity"  # Disable normalization
)

print(f"[INFO] SentencePiece model saved to {model_prefix}.model and {model_prefix}.vocab")


[INFO] Preprocessing unigram file...
[INFO] Preprocessed file saved to ./preprocessed_unigrams.txt
[INFO] SentencePiece model saved to ./sentencepiece_model/librispeech_unigram_model1000_new2.model and ./sentencepiece_model/librispeech_unigram_model1000_new2.vocab


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./preprocessed_unigrams.txt
  input_format: 
  model_prefix: ./sentencepiece_model/librispeech_unigram_model1000_new2
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: -1
  eos_id: -1
  pad_id: 1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>


In [99]:
import sentencepiece as spm

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
# sp.load("sentencepiece_model/librispeech_unigram_model.model")
sp.load("sentencepiece_model/librispeech_unigram_model1000_new.model")

# Test encoding and decoding
test_sentences = [
    "A B C D E F G",
    "A'MIGHTY'S A'BODY A'COURT",
    "THIS IS AN EXAMPLE",
    "THIS IS A TEST SENTENCE"
]

for text in test_sentences:
    text = text.lower()
    token_ids = sp.encode(text, out_type=int)
    tokens = sp.encode(text, out_type=str)
    decoded_text = sp.decode(token_ids).strip()  # Remove leading/trailing spaces

    print(f"\nInput Text: {text}")
    print(f"Token IDs: {token_ids}")
    print(f"Tokens: {tokens}")
    print(f"Decoded Text: {decoded_text}")



Input Text: a b c d e f g
Token IDs: [11, 9, 40, 62, 230, 131, 9, 47, 106]
Tokens: ['▁a', '▁', 'b', '▁c', '▁d', '▁e', '▁', 'f', '▁g']
Decoded Text: a b c d e f g

Input Text: a'mighty's a'body a'court
Token IDs: [11, 6, 30, 179, 67, 218, 6, 7, 11, 6, 452, 18, 23, 11, 6, 26, 370, 15]
Tokens: ['▁a', "'", 'm', 'ig', 'h', 'ty', "'", 's', '▁a', "'", 'bo', 'd', 'y', '▁a', "'", 'c', 'our', 't']
Decoded Text: a'mighty's a'body a'court

Input Text: this is an example
Token IDs: [77, 48, 103, 142, 36, 177, 41]
Tokens: ['▁this', '▁is', '▁an', '▁ex', 'a', 'mp', 'le']
Decoded Text: this is an example

Input Text: this is a test sentence
Token IDs: [77, 48, 11, 9, 15, 197, 587, 213]
Tokens: ['▁this', '▁is', '▁a', '▁', 't', 'est', '▁sent', 'ence']
Decoded Text: this is a test sentence


In [100]:
import sentencepiece as spm

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
# sp.load("sentencepiece_model/librispeech_unigram_model.model")
sp.load("sentencepiece_model/librispeech_unigram_model1000_new.model")

# Function to get the token ID for a given token
def get_token_id(token: str) -> int:
    """
    Returns the token ID for the given token.
    
    Args:
    - token (str): The token to look up.

    Returns:
    - int: Token ID if the token exists, -1 otherwise.
    """
    token_id = sp.piece_to_id(token)
    if token_id == -1:
        print(f"[INFO] Token '{token}' not found in vocabulary.")
    else:
        print(f"[INFO] Token '{token}' has ID: {token_id}")
    return token_id

# Test cases
tokens_to_lookup = ["▁", "A", "THIS", "UNKNOWN_TOKEN"]

for token in tokens_to_lookup:
    token = token.lower()
    token_id = get_token_id(token)
    print(f"Token: '{token}', ID: {token_id}")


[INFO] Token '▁' has ID: 9
Token: '▁', ID: 9
[INFO] Token 'a' has ID: 36
Token: 'a', ID: 36
[INFO] Token 'this' has ID: 0
Token: 'this', ID: 0
[INFO] Token 'unknown_token' has ID: 0
Token: 'unknown_token', ID: 0


In [103]:
token = "it"
get_token_id(token)

[INFO] Token 'it' has ID: 83


83

In [114]:
sp.decode_ids(100)

'te'

In [68]:
import os
import torch
import sentencepiece as spm

class CTCTextEncoder_Subword:
    """
    A minimal CTC-compatible text encoder/decoder that uses a raw SentencePiece model.
    - No Wav2Vec2CTCTokenizer or JSON vocab needed.
    - Spaces are handled by the '▁' token in the trained SP model.
    """

    def __init__(
        self,
        sp_model_path: str = "librispeech_unigram_model.model",
        # Optional arguments if you want to do LM decoding, etc.
        lm_path: str = None,
        use_lm: bool = False,
    ):
        """
        sp_model_path: Path to the raw SentencePiece model file (.model) 
                       that you trained (or loaded).
        lm_path:       Path to a KenLM .arpa or .bin if you want an LM-based decode. (Not used here)
        use_lm:        Whether to eventually use an LM for beam-search decode. (Not used here)
        """
        self.sp_model_path = sp_model_path
        self.lm_path = lm_path
        self.use_lm = use_lm

        # Load SentencePiece
        if not os.path.exists(sp_model_path):
            raise FileNotFoundError(f"SentencePiece model not found at: {sp_model_path}")
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(sp_model_path)

        # Basic debug info
        vocab_size = self.sp.get_piece_size()
        print(f"[INIT] Loaded SentencePiece model '{sp_model_path}' with vocab_size={vocab_size}")
        print(f"       use_lm={use_lm}, lm_path={lm_path if lm_path else 'None'}")

    def encode(self, text: str) -> torch.Tensor:
        """
        Encode text -> subword token IDs using raw SentencePiece.
        Returns a 2D torch.Tensor shape [1, seq_len].
        """
        token_ids = self.sp.encode(text, out_type=int)

        # For CTC training, we typically want shape [B, T], so wrap in batch dim.
        # Debug
        print(f"\n[ENCODE] text => '{text}'")
        print(f"   => token_ids: {token_ids}")
        return torch.tensor([token_ids], dtype=torch.long)

    def decode(self, token_ids) -> str:
        """
        Decode subword token IDs -> text using raw SentencePiece.
        
        token_ids can be:
          - a Python list of int IDs
          - a 1D torch.Tensor of shape [seq_len]
          - a 2D torch.Tensor of shape [1, seq_len]
        Returns the fully reconstructed text (with spaces).
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.squeeze().tolist()  # shape => list of int
        elif isinstance(token_ids, list) and isinstance(token_ids[0], list):
            # If shape is [B, T], pick the first batch
            token_ids = token_ids[0]

        # SentencePiece decode
        text = self.sp.decode(token_ids)

        # Debug
        print(f"[DECODE] token_ids => {token_ids}")
        print(f"   => Decoded Text: {text}")
        return text

    def decode_ctc_simple(self, token_ids) -> str:
        """
        (Optional) If you have repeated tokens (typical in raw CTC argmax),
        you can do a simple collapse of consecutive duplicates, ignoring a blank ID if you define one.

        If your SentencePiece has a <pad> or <unk> ID for blank, you can skip them here.
        """
        # This is only relevant if your model outputs repeated subword IDs 
        # or uses a specific blank token. For demonstration:
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.squeeze().tolist()

        collapsed = []
        prev_id = None
        for tid in token_ids:
            # Suppose <pad> or <unk> was ID=0, skip if you want to treat as blank
            # if tid == 0:
            #     continue

            if tid != prev_id:
                collapsed.append(tid)
            prev_id = tid

        # Now decode
        return self.sp.decode(collapsed)

    def __len__(self):
        """Return the size of the SentencePiece vocabulary."""
        return self.sp.get_piece_size()


In [58]:
import sentencepiece as spm
import json

# Load the SentencePiece model
sp_model_path = "sentencepiece_model/librispeech_unigram_model.model"
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Export the vocabulary
vocab = {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}

# Save as JSON
vocab_json_path = "sentencepiece_model/librispeech_vocab.json"
with open(vocab_json_path, "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=4)

print(f"Vocabulary saved to {vocab_json_path}")


Vocabulary saved to sentencepiece_model/librispeech_vocab.json


In [59]:
# Load the SentencePiece model
import json

vocab_json_path = "sentencepiece_model/librispeech_vocab.json"
with open(vocab_json_path, "r", encoding="utf-8") as f:
    vocab = json.load(f)

# Check if the space token ('▁') is present
print("[INFO] Is space token ('▁') in vocabulary?", "▁" in vocab)


[INFO] Is space token ('▁') in vocabulary? True


In [72]:
import sentencepiece as spm

sp_model_path = "sentencepiece_model/librispeech_unigram_model.model"
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Test sentences
test_sentences = [
    "THIS IS A TEST SENTENCE.",
    "ANOTHER EXAMPLE WITH A'MIGHTY.",
    "HELLO WORLD!",
]


for text in test_sentences:
    text = text.lower()
    token_ids = sp.encode(text, out_type=int)
    tokens = sp.encode(text, out_type=str)
    decoded_text = sp.decode(token_ids)

    print(f"\n[SentencePiece]")
    print(f"Input Text: {text}")
    print(f"Token IDs: {token_ids}")
    print(f"Tokens: {tokens}")
    print(f"Decoded Text: {decoded_text}")



[SentencePiece]
Input Text: this is a test sentence.
Token IDs: [1003, 7, 8, 45, 8, 10, 2215, 8479, 9, 2]
Tokens: ['▁thi', 's', '▁', 'is', '▁', 'a', '▁test', '▁sentenc', 'e', '.']
Decoded Text: this is a test sentence.

[SentencePiece]
Input Text: another example with a'mighty.
Token IDs: [8, 24, 2390, 8893, 2458, 8, 10, 6, 7291, 2]
Tokens: ['▁', 'an', 'other', '▁example', '▁with', '▁', 'a', "'", 'mighty', '.']
Decoded Text: another example with a'mighty.

[SentencePiece]
Input Text: hello world!
Token IDs: [1707, 17, 8, 2425, 3]
Tokens: ['▁hell', 'o', '▁', 'world', '!']
Decoded Text: hello world!


In [54]:
import re
import os
import torch
import kenlm
import numpy as np

from typing import List, Tuple, Optional, Union
from collections import defaultdict
from transformers import Wav2Vec2Processor
import sentencepiece as spm
from pyctcdecode import build_ctcdecoder

os.environ["TOKENIZERS_PARALLELISM"] = "false"


class CTCTextEncoder:
    def __init__(
        self,
        sp_model_path: Optional[str] = None,
        arpa_path: Optional[str] = None,
        binary_path: Optional[str] = "4-gram_lc_correct.bin",
        unigram_path: Optional[str] = "librispeech-vocab.txt",
        lm_weight: float = 0.5,
        beam_size: int = 100,
        use_lm: bool = False,
        use_bpe: bool = True,
        blank_token: str = "<pad>",
        unk_token: str = "<unk>",
        **kwargs
    ):
        print("\n[INIT] CTCTextEncoder init:")
        self.beam_size = beam_size
        self.lm_weight = lm_weight
        self.arpa_path = arpa_path
        self.binary_path = binary_path
        self.blank_token = blank_token
        self.unk_token = unk_token
        self.use_lm = use_lm
        self.use_bpe = use_bpe
        self.sp_model_path = sp_model_path

        print("  -> sp_model_path:", sp_model_path)
        print("  -> lm_weight:", lm_weight)
        print("  -> beam_size:", beam_size)
        print("  -> binary_path:", binary_path)
        print("  -> use_lm:", use_lm)
        print("  -> use_bpe:", use_bpe)
        print("  -> blank_token:", blank_token)
        print("  -> unk_token:", unk_token)

        self.printed_samples = 0
        self.max_printed_samples = 5

        # Load unigrams if provided (for pyctcdecode LM)
        self.unigrams = None
        if unigram_path and os.path.exists(unigram_path):
            print(f"[INIT] Loading unigrams from: {unigram_path}")
            with open(unigram_path, 'r', encoding='utf-8') as f:
                self.unigrams = [line.strip().lower() for line in f if line.strip()]
            print(f"[INIT] Loaded {len(self.unigrams)} unigrams.")

        # Initialize vocabulary (tokenizer or char-based)
        self._initialize_vocabulary()

        # Map indices <-> tokens
        self.ind2char = dict(enumerate(self.vocab))
        self.char2ind = {v: k for k, v in self.ind2char.items()}
        self.blank_index = self.char2ind.get(self.blank_token, None)

        print("\n[INIT] Final Vocabulary Information:")
        print(f"  -> vocab size: {len(self.vocab)}")
        print(f"  -> blank token: '{self.blank_token}' => index {self.blank_index}")
        sample_keys = list(self.ind2char.keys())[:20]
        sample_ind2char = {k: self.ind2char[k] for k in sample_keys}
        print(f"  -> sample ind2char: {sample_ind2char}")
        sample_tokens = list(self.char2ind.keys())[:20]
        sample_char2ind = {t: self.char2ind[t] for t in sample_tokens}
        print(f"  -> sample char2ind: {sample_char2ind}")

        # If using LM, init
        if self.use_lm:
            self._initialize_language_model()
        else:
            print("[INIT] Language model usage disabled.")
            self.lm = None
            self.decoder = None

        print("[INIT] CTCTextEncoder init done.\n")

    def _initialize_vocabulary(self):
        """
        Loads BPE-based vocab from SentencePiece model if use_bpe=True;
        else uses simple char-based vocab.
        """
        if self.use_bpe:
            print("[VOCAB] use_bpe=True => loading SentencePiece model.")
            if not self.sp_model_path or not os.path.exists(self.sp_model_path):
                raise ValueError(f"[VOCAB] Invalid SentencePiece model path: {self.sp_model_path}")

            self.sp_model = spm.SentencePieceProcessor()
            self.sp_model.load(self.sp_model_path)
            self.vocab = [self.sp_model.id_to_piece(i) for i in range(self.sp_model.get_piece_size())]
            print(f"[VOCAB] Loaded SentencePiece model with vocab size {len(self.vocab)}")
        else:
            print("[VOCAB] use_bpe=False => using simple character-based vocab.")
            chars = list("abcdefghijklmnopqrstuvwxyz ")
            self.vocab = [self.blank_token] + chars
            self.sp_model = None

    def _initialize_language_model(self):
        """Initialize KenLM + pyctcdecode for beam search."""
        self.lm = None
        self.decoder = None

        model_path = self.binary_path if self.binary_path else self.arpa_path
        print(f"[LM] Attempting to load language model from path: {model_path}")
        if not model_path or not os.path.exists(model_path):
            print("[LM] No valid LM path found. Skipping LM init.")
            return

        try:
            self.lm = kenlm.Model(model_path)
            print(f"[LM] KenLM model loaded successfully: {model_path}")

            self.decoder = build_ctcdecoder(
                labels=self.vocab,
                kenlm_model_path=model_path,
                alpha=self.lm_weight,
                beta=0.1,
                unk_score_offset=-10.0,
            )

            print("[LM] Successfully initialized pyctcdecode with LM support.")

        except Exception as e:
            print(f"[LM] WARNING: Could not initialize LM: {str(e)}")
            self.decoder = None

    def encode(self, text: str) -> torch.Tensor:
        """
        Convert text -> token IDs.
        """
        if self.use_bpe and self.sp_model is not None:
            text = "▁" + text.replace(" ", " ▁").strip()
            token_ids = self.sp_model.encode(text, out_type=int)
            return torch.tensor(token_ids).unsqueeze(0)
        else:
            normalized_text = self.normalize_text(text)
            token_indices = [
                self.char2ind.get(char, self.char2ind.get(self.unk_token)) for char in normalized_text
            ]
            return torch.tensor(token_indices).unsqueeze(0)

    def decode_simple(self, indices: List[int]) -> str:
        """
        Greedy decode: remove blanks, collapse repeats.
        """
        decoded_chars = []
        prev_idx = None

        for idx in indices:
            if idx == self.blank_index:
                prev_idx = idx
                continue
            if idx == prev_idx:
                continue
            if 0 <= idx < len(self.ind2char):
                decoded_chars.append(self.ind2char[idx])
            prev_idx = idx

        text = "".join(decoded_chars).strip()
        if self.use_bpe:
            text = text.replace("▁", " ")
        return text

    def decode(self, indices: List[int]) -> str:
        """
        Decode token indices to text.
        """
        if self.decoder:
            decoded_text = self.decoder.decode(indices)
            return decoded_text.replace("▁", " ").strip()
        else:
            return self.decode_simple(indices).strip()

    @staticmethod
    def normalize_text(text: str) -> str:
        """Normalize input text."""
        text = text.lower()
        text = re.sub(r"[^a-z ]", "", text)
        return text


In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Load the JSON vocabulary instead of the SentencePiece binary model
vocab_json_path = "sentencepiece_model/librispeech_vocab.json"



encoder = CTCTextEncoder_Subword(
    sp_model_path=vocab_json_path,  # Use the JSON file here
    lm_path=None,  # Path to KenLM model if available, or set to None
    vocab_special_tokens={
        "pad_token": "<pad>",
        "word_delimiter_token": " ",  # SentencePiece word boundary token
        "unk_token": "<unk>",
    },
    use_lm=False,
)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfe in position 99: invalid start byte

In [69]:
# Example usage
encoder = CTCTextEncoder_Subword(
    sp_model_path="sentencepiece_model/librispeech_unigram_model.model",
    lm_path=None,
    use_lm=False,
)

# Test some sentences
test_sentences = [
    "THIS IS A TEST SENTENCE.",
    "ANOTHER EXAMPLE WITH A'MIGHTY.",
    "HELLO WORLD!",
]

for text in test_sentences:
    # 1. Encode
    token_ids = encoder.encode(text)

    # 2. Decode
    decoded = encoder.decode(token_ids)

    print(f"\nOriginal:  {text}")
    print(f"Reconstructed: {decoded}")


[INIT] Loaded SentencePiece model 'sentencepiece_model/librispeech_unigram_model.model' with vocab_size=10000
       use_lm=False, lm_path=None

[ENCODE] text => 'THIS IS A TEST SENTENCE.'
   => token_ids: [1003, 7, 8, 45, 8, 10, 2215, 8479, 9, 2]
[DECODE] token_ids => [1003, 7, 8, 45, 8, 10, 2215, 8479, 9, 2]
   => Decoded Text: THIS IS A TEST SENTENCE.

Original:  THIS IS A TEST SENTENCE.
Reconstructed: THIS IS A TEST SENTENCE.

[ENCODE] text => 'ANOTHER EXAMPLE WITH A'MIGHTY.'
   => token_ids: [8, 24, 2390, 8893, 2458, 8, 10, 6, 7291, 2]
[DECODE] token_ids => [8, 24, 2390, 8893, 2458, 8, 10, 6, 7291, 2]
   => Decoded Text: ANOTHER EXAMPLE WITH A'MIGHTY.

Original:  ANOTHER EXAMPLE WITH A'MIGHTY.
Reconstructed: ANOTHER EXAMPLE WITH A'MIGHTY.

[ENCODE] text => 'HELLO WORLD!'
   => token_ids: [1707, 17, 8, 2425, 3]
[DECODE] token_ids => [1707, 17, 8, 2425, 3]
   => Decoded Text: HELLO WORLD!

Original:  HELLO WORLD!
Reconstructed: HELLO WORLD!


In [66]:
test_sentences = [
    "THIS IS A TEST SENTENCE.",
    "ANOTHER EXAMPLE WITH A'MIGHTY.",
    "HELLO WORLD!",
]

for text in test_sentences:
    # Encode the text
    token_ids = encoder.encode(text)
    
    # Convert token IDs to tokens
    tokens = encoder.tokenizer.convert_ids_to_tokens(token_ids.squeeze().tolist())
    
    # Decode back to text
    decoded_text = encoder.decode(token_ids.squeeze().tolist())
    
    # Reconstruct spaces using '▁'
    reconstructed_text = decoded_text.replace("▁", " ").strip()
    
    # Display results
    print(f"\n[ENCODE] text => '{text}'")
    print(f"   => token_ids: {token_ids.tolist()}")
    print(f"[DECODE] token_ids => {token_ids.squeeze().tolist()}")
    print(f"Tokens: {tokens}")
    print(f"Decoded Text (raw): {decoded_text}")
    print(f"Reconstructed Text: {reconstructed_text}")


[ENCODE] text => 'THIS IS A TEST SENTENCE.'
   => token_ids: [[60, 45, 45, 10, 86, 7, 14, 0, 7, 76, 289, 2]]
[DECODE] token_ids => [60, 45, 45, 10, 86, 7, 14, 0, 7, 76, 289, 2]
   => Decoded Text (raw): THISATESTSENTENCE.
   => Reconstructed Text: THISATESTSENTENCE.

[ENCODE] text => 'THIS IS A TEST SENTENCE.'
   => token_ids: [[60, 45, 45, 10, 86, 7, 14, 0, 7, 76, 289, 2]]
[DECODE] token_ids => [60, 45, 45, 10, 86, 7, 14, 0, 7, 76, 289, 2]
Tokens: ['TH', 'IS', 'IS', 'A', 'TE', 'S', 'T', '<unk>', 'S', 'ENT', 'ENCE', '.']
Decoded Text (raw): THISATESTSENTENCE.
Reconstructed Text: THISATESTSENTENCE.
[ENCODE] text => 'ANOTHER EXAMPLE WITH A'MIGHTY.'
   => token_ids: [[605, 211, 835, 77, 1537, 1636, 10, 6, 7291, 2]]
[DECODE] token_ids => [605, 211, 835, 77, 1537, 1636, 10, 6, 7291, 2]
   => Decoded Text (raw): ANOTHEREXAMPLEWITHA'MIGHTY.
   => Reconstructed Text: ANOTHEREXAMPLEWITHA'MIGHTY.

[ENCODE] text => 'ANOTHER EXAMPLE WITH A'MIGHTY.'
   => token_ids: [[605, 211, 835, 77, 1537, 1636,

In [52]:
def preprocess_text(text: str) -> str:
    return "▁" + text.replace(" ", " ▁").strip()

# Preprocess test sentences
preprocessed_sentences = [preprocess_text(text) for text in test_sentences]

for text in preprocessed_sentences:
    token_ids = encoder.encode(text)
    tokens = encoder.tokenizer.convert_ids_to_tokens(token_ids.squeeze().tolist())
    decoded_text = encoder.decode(token_ids.squeeze().tolist())
    reconstructed_text = decoded_text.replace("▁", " ").strip()

    print(f"\n[DEBUG] Preprocessed Encoding '{text}'")
    print(f"Token IDs: {token_ids.tolist()}")
    print(f"Tokens: {tokens}")
    print(f"Decoded Text (raw): {decoded_text}")
    print(f"Reconstructed Text: {reconstructed_text}")


[ENCODE] text => '▁THIS ▁IS ▁A ▁TEST ▁SENTENCE.'
   => token_ids: [1003, 7, 0, 8, 45, 8, 10, 2215, 8479, 9, 2]
[DECODE] token_ids => [1003, 7, 0, 8, 45, 8, 10, 2215, 8479, 9, 2]

[DEBUG] Preprocessed Encoding '▁THIS ▁IS ▁A ▁TEST ▁SENTENCE.'
Token IDs: [[1003, 7, 0, 8, 45, 8, 10, 2215, 8479, 9, 2]]
Tokens: ['▁THI', 'S', '<unk>', '▁', 'IS', '▁', 'A', '▁TEST', '▁SENTENC', 'E', '.']
Decoded Text (raw): THIS IS A TEST SENTENCE.
Reconstructed Text: THIS IS A TEST SENTENCE.
[ENCODE] text => '▁ANOTHER ▁EXAMPLE ▁WITH ▁A'MIGHTY.'
   => token_ids: [8, 605, 211, 8893, 2458, 8, 10, 6, 7291, 2]
[DECODE] token_ids => [8, 605, 211, 8893, 2458, 8, 10, 6, 7291, 2]

[DEBUG] Preprocessed Encoding '▁ANOTHER ▁EXAMPLE ▁WITH ▁A'MIGHTY.'
Token IDs: [[8, 605, 211, 8893, 2458, 8, 10, 6, 7291, 2]]
Tokens: ['▁', 'ANO', 'THER', '▁EXAMPLE', '▁WITH', '▁', 'A', "'", 'MIGHTY', '.']
Decoded Text (raw): ANOTHER EXAMPLE WITH A'MIGHTY.
Reconstructed Text: ANOTHER EXAMPLE WITH A'MIGHTY.
[ENCODE] text => '▁HELLO ▁WORLD!'
   

### Collect vocab

In [94]:
import os
import re
from tqdm import tqdm

def process_text_files(folders, output_file_name="aggregated_result.txt"):
    # Initialize an empty string to store the aggregated content
    aggregated_content = ""

    # Count total files for progress bar
    total_files = sum(len(files) for folder_name in folders for _, _, files in os.walk(folder_name) if files)

    # Initialize progress bar
    with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
        # Walk through each folder in the list
        for folder_name in folders:
            for root, _, files in os.walk(folder_name):
                for file in files:
                    if file.endswith(".txt"):  # Process only .txt files
                        file_path = os.path.join(root, file)
                        
                        # Read and process each file
                        with open(file_path, "r", encoding="utf-8") as f:
                            for line in f:
                                # Remove leading numbers and dashes using regex
                                processed_line = re.sub(r"^[\d-]+\s*", "", line)
                                aggregated_content += processed_line

                        pbar.update(1)

    # Save the aggregated content to a new file in the current directory
    with open(output_file_name, "w", encoding="utf-8") as output_file:
        output_file.write(aggregated_content)

    print(f"Aggregated content saved to {output_file_name}")


In [92]:
%pwd

'/workspace/sound_asr'

In [95]:
# Example usage
# Replace 'your_folder_names' with the list of folders containing the subfolders and .txt files
folders = ['data/datasets/librispeech/train-clean-100', 'data/datasets/librispeech/train-clean-360', 'data/datasets/librispeech/train-clean-500']
process_text_files(folders)


Processing files:   2%| | 2682/135235 [00:00<00:2

Aggregated content saved to aggregated_result.txt





Output functions

In [None]:
##### PART 1

import os
import re
from tqdm import tqdm

def process_text_files(folders, output_file_name="aggregated_result.txt"):
    # Initialize an empty string to store the aggregated content
    aggregated_content = ""

    # Count total files for progress bar
    total_files = sum(len(files) for folder_name in folders for _, _, files in os.walk(folder_name) if files)

    # Initialize progress bar
    with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
        # Walk through each folder in the list
        for folder_name in folders:
            for root, _, files in os.walk(folder_name):
                for file in files:
                    if file.endswith(".txt"):  # Process only .txt files
                        file_path = os.path.join(root, file)
                        
                        # Read and process each file
                        with open(file_path, "r", encoding="utf-8") as f:
                            for line in f:
                                # Remove leading numbers and dashes using regex
                                processed_line = re.sub(r"^[\d-]+\s*", "", line)
                                aggregated_content += processed_line

                        pbar.update(1)

    # Save the aggregated content to a new file in the current directory
    with open(output_file_name, "w", encoding="utf-8") as output_file:
        output_file.write(aggregated_content)

    print(f"Aggregated content saved to {output_file_name}")



# Example usage
# Replace 'your_folder_names' with the list of folders containing the subfolders and .txt files
folders = ['data/datasets/librispeech/train-clean-100', 'data/datasets/librispeech/train-clean-360', 'data/datasets/librispeech/train-clean-500']
process_text_files(folders)



##### PART 2

import sentencepiece as spm
import os
import re

# Path to the unigram file and output directory
unigram_file = "aggregated_result.txt"  # Replace with your unigram file path
output_dir = "./sentencepiece_model"
preprocessed_file = "./preprocessed_unigrams.txt"  # Temp file for synthetic "sentences"
vocab_size = 1000  # Adjust to a smaller vocab size
batch_size = 10  # Create more synthetic sentences by lowering batch size

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Preprocess the unigram file
print("[INFO] Preprocessing unigram file...")

def remove_punctuation(text):
    """
    Remove all punctuation from text using regex.
    """
    return re.sub(r"[^\w\s]", "", text)  # Retain only word characters and spaces

with open(unigram_file, "r", encoding="utf-8") as infile, open(preprocessed_file, "w", encoding="utf-8") as outfile:
    batch = []
    for line in infile:
        line = line.lower()
        word = remove_punctuation(line.strip())  # Remove punctuation
        if word:
            batch.append(word)
        if len(batch) >= batch_size:
            outfile.write(" ".join(batch) + "\n")
            batch = []
    if batch:
        outfile.write(" ".join(batch) + "\n")

print(f"[INFO] Preprocessed file saved to {preprocessed_file}")

# Train the SentencePiece model
model_prefix = os.path.join(output_dir, "librispeech_unigram_model1000_new2")
spm.SentencePieceTrainer.train(
    input=preprocessed_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type="unigram",
    character_coverage=1.0,
    bos_id=-1,
    eos_id=-1,
    unk_id=0,
    pad_id=1,
    shuffle_input_sentence=True,
    normalization_rule_name="identity"  # Disable normalization
)

print(f"[INFO] SentencePiece model saved to {model_prefix}.model and {model_prefix}.vocab")
