In [None]:
def get_vocabulary_from_arpa(arpa_path):
    """Collect the vocabulary of an ARPA language model from its unigram section.

    Words are read from the ``\\1-grams:`` section, whose entries have the form
    ``<log10 prob> <word> [<backoff weight>]``. Reading stops at the next
    section header (any later line starting with a backslash, such as
    ``\\2-grams:`` or ``\\end\\``).

    Args:
        arpa_path: Path to a UTF-8 encoded ARPA file.

    Returns:
        set[str]: The second whitespace-separated field of every unigram
        entry. Empty if the file contains no ``\\1-grams:`` section.
    """
    vocabulary = set()

    with open(arpa_path, 'r', encoding='utf-8') as arpa_file:
        in_unigram_section = False

        for line in arpa_file:
            line = line.strip()

            if not in_unigram_section:
                # Everything before the unigram header (including the
                # "ngram N=count" lines of the \data\ block) holds counts,
                # not words, so it is skipped.
                in_unigram_section = line == '\\1-grams:'
            elif line.startswith('\\'):
                break  # Next section header: the unigram list is over
            else:
                parts = line.split()
                # Blank separator lines split into fewer than two fields.
                if len(parts) > 1:
                    vocabulary.add(parts[1])

    return vocabulary

def add_tokens_to_arpa(arpa_path, tokens_to_add, tokens_to_exclude):
    """Insert extra unigram entries into an ARPA file, rewriting it in place.

    Each added token gets the line ``-99.999 <token> -99.999`` directly below
    the ``\\1-grams:`` header, and the ``ngram 1=<count>`` line of the file
    header is increased by the number of inserted entries so the declared
    unigram count keeps matching the section contents.

    Args:
        arpa_path: Path to a UTF-8 encoded ARPA file; it is overwritten.
        tokens_to_add: Iterable of token strings to insert. Repeated tokens
            are inserted only once.
        tokens_to_exclude: Container of tokens that must not be inserted.

    Raises:
        ValueError: If the file has no ``\\1-grams:`` header line.
    """
    with open(arpa_path, 'r', encoding='utf-8') as arpa_file:
        arpa_content = arpa_file.readlines()

    # Find the index where the unigram section starts; comparing stripped
    # lines tolerates trailing whitespace or a missing final newline.
    start_index = None
    for index, line in enumerate(arpa_content):
        if line.strip() == '\\1-grams:':
            start_index = index + 1
            break
    if start_index is None:
        raise ValueError(f'no \\1-grams: section found in {arpa_path}')

    # dict.fromkeys drops repeated tokens while keeping first-seen order.
    new_tokens = [
        token for token in dict.fromkeys(tokens_to_add)
        if token not in tokens_to_exclude
    ]

    # Splice all new entries in at once, right below the header.
    arpa_content[start_index:start_index] = [
        f'-99.999 {token} -99.999\n' for token in new_tokens
    ]

    if new_tokens:
        # Keep the declared unigram count in sync with the inserted entries.
        count_prefix = 'ngram 1='
        for index in range(start_index - 1):
            stripped = arpa_content[index].strip()
            if stripped.startswith(count_prefix):
                count_text = stripped[len(count_prefix):].strip()
                if count_text.isdecimal():
                    updated = int(count_text) + len(new_tokens)
                    arpa_content[index] = f'{count_prefix}{updated}\n'
                break

    # Write the modified content back to the ARPA file
    with open(arpa_path, 'w', encoding='utf-8') as arpa_file:
        arpa_file.writelines(arpa_content)

In [None]:
def convert_numbers_to_words(text):
    """Spell out the numeric tokens of ``text`` as words.

    The text is split on whitespace and each token is handled on its own:

    * a token made only of decimal digits is replaced by its cardinal
      spelling (``num2words(int(token))``);
    * a token that starts with digits followed by ``st``/``nd``/``rd``/``th``
      at a word boundary is replaced by the ordinal spelling of the number,
      with any text after the suffix (e.g. trailing punctuation) kept;
    * every other token is left unchanged.

    Args:
        text: Input string.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    words = []
    for word in text.split():
        if word.isdecimal():
            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can parse, so tokens such as "²" are left untouched.
            word = num2words(int(word))
        else:
            # \b prevents "5things" from being read as the ordinal "5th".
            ordinal_match = re.match(r'(\d+)(?:st|nd|rd|th)\b', word)
            if ordinal_match:
                # The ordinal spelling is already a complete word, so the
                # numeric suffix is dropped rather than appended.
                spelled = num2words(int(ordinal_match.group(1)), ordinal=True)
                word = spelled + word[ordinal_match.end():]
        words.append(word)
    return ' '.join(words)

def remove_punctuation_and_special_characters(text):
    """Drop every character that is not a word character, whitespace, ``,`` or ``-``.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with all other characters removed; commas and hyphens
        are kept.
    """
    disallowed_chars = re.compile(r'[^\w\s,-]')
    cleaned_text = disallowed_chars.sub('', text)
    return cleaned_text

In [2]:
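# One-time environment setup: uncomment to install this notebook's dependencies.
# (Prefer `%pip` over `!pip` so packages are installed into the running kernel's environment.)
# !pip install jiwer
# !pip install evaluate
# !pip install pyctcdecode
# !pip install torch
# !pip install g2p
# !pip install num2words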

Collecting num2words
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Collecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hDownloading num2words-0.5.13-py3-none-any.whl (143 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25ldone
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13705 sha256=6356b80115ff58b5688756d95fccaab0ffdeef2243b8fb7d127200c10dec1516
  Stored in directory: /home/osx/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.13


In [None]:
from datasets import DatasetDict, Dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
# from pyctcdecode import build_ctcdecoder
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from tqdm import tqdm
from evaluate import load

In [4]:
# !pip install g2p num2words

[0mCollecting g2p
  Obtaining dependency information for g2p from https://files.pythonhosted.org/packages/c8/2c/4c0c30c92e4647bce762b3cd1397c4ab8119a514a9a58f2211be358ee0e6/g2p-1.1.20230822-py3-none-any.whl.metadata
  Downloading g2p-1.1.20230822-py3-none-any.whl.metadata (16 kB)
Collecting num2words
  Obtaining dependency information for num2words from https://files.pythonhosted.org/packages/8f/f0/ca1228af2bcbce2fdf2b23d58643c84253b88a3c1cd9dba391ca683c4b21/num2words-0.5.13-py3-none-any.whl.metadata
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Collecting coloredlogs<=14.0 (from g2p)
  Downloading coloredlogs-14.0-py2.py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<=2.3.0,>=2.1.0 (from g2p)
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m3.8 MB/s[0m