In [1]:
# !pip install num2words
from datasets import load_dataset
#from g2p_en import G2p
from g2p import make_g2p
from num2words import num2words  # Import num2words from nltk
import re

# Load the Tatoeba dataset
tatoeba_dataset = load_dataset("tatoeba", 'en-mr')

# Initialize g2p converter
transducer = make_g2p('eng', 'eng-arpabet')

In [2]:
# Function to convert numbers to words
def convert_numbers_to_words(text):
    words = []
    for word in text.split():
        if word.isdigit():
            # Convert numbers to words
            word = num2words(word)
        elif re.match(r'\d+(st|nd|rd|th)', word):
            # Handle ordinal numbers
            number_part = re.match(r'\d+', word).group()
            ordinal_suffix = re.search(r'(st|nd|rd|th)', word).group()
            word = num2words(number_part, ordinal=True) + ordinal_suffix
        words.append(word)
    return ' '.join(words)

def remove_punctuation_and_special_characters(text):
    # Remove punctuation and special characters
    return re.sub(r'[^\w\s,-]', '', text)


In [3]:
translations = tatoeba_dataset["train"]["translation"]
for translation in translations[:1]:
  sentence = translation["en"]
  print(sentence)
  sentence = remove_punctuation_and_special_characters(sentence)
  print(sentence)
  sentence = convert_numbers_to_words(sentence)
  print(sentence)
  phonemes_list = [transducer(word).output_string for word in re.findall(r'\S+', sentence)]
  print(phonemes_list)
  all_phonemes = [item for sublist in phonemes_list for item in sublist.split()]
  print(all_phonemes)
  # Replace hyphens with letters that come after the hyphen
  all_phonemes_no_hyphen = [phoneme.split("-")[-1] if "-" in phoneme else phoneme for phoneme in all_phonemes]
  print(all_phonemes_no_hyphen)

Today is June 18th and it is Muiriel's birthday!
Today is June 18th and it is Muiriels birthday
Today is June eighteenthth and it is Muiriels birthday
['T AH D EY ', 'IH Z ', 'JH UW N ', '', 'AH N D ', 'IH T ', 'IH Z ', '', 'B ER TH D EY ']
['T', 'AH', 'D', 'EY', 'IH', 'Z', 'JH', 'UW', 'N', 'AH', 'N', 'D', 'IH', 'T', 'IH', 'Z', 'B', 'ER', 'TH', 'D', 'EY']
['T', 'AH', 'D', 'EY', 'IH', 'Z', 'JH', 'UW', 'N', 'AH', 'N', 'D', 'IH', 'T', 'IH', 'Z', 'B', 'ER', 'TH', 'D', 'EY']


In [4]:
unique_phonemes = set()

with open("/work/van-speech-nlp/jindaznb/jslpnb/torgo_inference_on_cluster/dataset_phonemes.txt", "w") as file:
    for translation in translations:
      sentence = translation["en"]
      sentence = remove_punctuation_and_special_characters(sentence)
      sentence = convert_numbers_to_words(sentence)
      phonemes_list = [transducer(word).output_string for word in re.findall(r'\S+', sentence)]
      all_phonemes = [item for sublist in phonemes_list for item in sublist.split()]

      # Update the set of unique phonemes excluding those containing hyphens
      unique_phonemes.update(phoneme for phoneme in all_phonemes if "-" not in phoneme)

      # Replace hyphens with letters that come after the hyphen
      all_phonemes_no_hyphen = [phoneme.split("-")[-1] if "-" in phoneme else phoneme for phoneme in all_phonemes]

      # Write all individual phonemes (without hyphens) to the file
      file.write(" ".join(all_phonemes_no_hyphen))
      file.write("\n")

# Print the vocabulary of phonemes
print("Phoneme Vocabulary:")
print(unique_phonemes)

Phoneme Vocabulary:
{'ZH', 'AE', 'NG', ',,', 'IH', ',', 'TH', 'ER', 'R', 'M', 'HH', 'D', 'AY', 'DH', 'Z', 'W', 'AO', 'JH', 'Y', 'UH', 'G', 'AA', 'EH', 'F', 'P', 'S', 'N', 'OW', 'T', 'CH', ',,,', 'AW', 'SH', 'EY', 'IY', 'UW', 'AH', 'B', 'L', 'V', 'K', 'OY'}


In [5]:
# Read the content from the file
with open("dataset_phonemes.txt", 'r') as file:
    content = file.read()

# Remove instances of consecutive triple commas (,,,)
content = content.replace(',,,', '')
print(content)
# Remove instances of consecutive double commas (,,)
content = content.replace(',,', '')

# Remove individual commas
content = content.replace(',', '')
# Write the modified content back to the file
with open('dataset_phonemes.txt', 'w') as file:
    file.write(content)

T AH D EY IH Z JH UW N AH N D IH T IH Z B ER TH D EY
IH Z T W EH N T IY N AW
DH AH P AE S W ER D IH Z
AY W IH L B IY B AE K S UW N
AY W IH L B IY B AE K S UW N
IH M AE T AH L AO S F AO R W ER D Z
IH M AE T AH L AO S F AO R W ER D Z
DH IH S IH Z N EH V ER G OW IH NG T UW EH N D
DH IH S IH Z N EH V ER G OW IH NG T UW EH N D
DH IH S IH Z N EH V ER G OW IH NG T UW EH N D
AY W AA Z IH N DH AH M AW N T AH N Z
AY W AA Z IH N DH AH M AW N T AH N Z
Y UW AA R IH N M AY W EY
Y UW AA R IH N M AY W EY
AY M EY K W AH N HH AH N D R AH D AH D EY
AY M EY K W AH N HH AH N D R AH D AH D EY
AY M EY K W AH N HH AH N D R AH D AH D EY
DH AE T W OW N T HH AE P AH N
DH AE T W OW N T HH AE P AH N
DH AE T W OW N T HH AE P AH N
AY M IH S Y UW
AY M IH S Y UW
IH L K AO L DH EH M T AH M AA R OW W EH N AY K AH M B AE K
Y UW SH UH D S L IY P
Y UW SH UH D S L IY P
IH M G OW IH NG T UW G OW
AY K AE N T L AY V DH AE T K AY N D AH V L AY F
M OW S T P IY P AH L TH IH NG K IH M K R EY Z IY
M OW S T P IY P AH L TH IH NG K IH

In [7]:
import os
from huggingface_hub import Repository


input_dataset = "dataset_phonemes.txt"
output_directory = "output_directory/"
output_model_base = "output_model.klm"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Ngram size options
order_trigram = 3
order_unigram = 1

trigram_repo = Repository(local_dir="output_model.klm_trigram", clone_from="laurynflu/output_model.klm_trigram")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.


OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).

In [None]:
!module load git/2.19.0

In [None]:
!kenlm/build/bin/lmplz -o 3    --text dataset_phonemes.txt     --arpa output_directory/output_model.klm_trigram.arpa     --discount_fallback --skip_symbols|     kenlm/build/bin/build_binary     -T /dev/stdin /output_directory/output_model.klm_trigram.arpa

In [None]:
def get_vocabulary_from_arpa(arpa_path):
    vocabulary = set()

    with open(arpa_path, 'r', encoding='utf-8') as arpa_file:
        in_data_section = False

        for line in arpa_file:
            line = line.strip()

            if line == '\\data\\':
                in_data_section = True
            elif line.startswith('\\') and in_data_section:
                break  # End of data section
            elif not line.startswith('\\') and in_data_section:
                # Inside the n-gram section, extract vocabulary
                parts = line.split()
                if len(parts) > 1:
                    word = parts[1]
                    vocabulary.add(word)

    return vocabulary

# Replace with the actual path to your KenLM ARPA file
arpa_path = "output_directory/output_model.klm_trigram.arpa"
vocabulary = get_vocabulary_from_arpa(arpa_path)

# Print or use the vocabulary as needed
print("Vocabulary:", vocabulary)


In [None]:
def add_tokens_to_arpa(arpa_path, tokens_to_add, tokens_to_exclude):
    with open(arpa_path, 'r', encoding='utf-8') as arpa_file:
        arpa_content = arpa_file.readlines()

    # Find the index where the unigram section starts
    start_index = arpa_content.index('\\1-grams:\n') + 1

    # Insert entries for the new tokens excluding the ones to exclude
    for token in tokens_to_add:
        if token not in tokens_to_exclude:
            arpa_content.insert(start_index, f'-99.999 {token} -99.999\n')

    # Write the modified content back to the ARPA file
    with open(arpa_path, 'w', encoding='utf-8') as arpa_file:
        arpa_file.writelines(arpa_content)

# Replace with the actual path to your KenLM ARPA file
arpa_path = "output_directory/output_model.klm_trigram.arpa"

# Tokens to add
new_tokens = ["<pad>", "<sil>", "<spn>"]

# Tokens to exclude
tokens_to_exclude = ["<s>", "</s>"]

# Add tokens to the ARPA file excluding the specified tokens
add_tokens_to_arpa(arpa_path, new_tokens, tokens_to_exclude)



In [None]:
# Replace with the actual path to your KenLM ARPA file
arpa_path = "output_directory/output_model.klm_trigram.arpa"
vocabulary = get_vocabulary_from_arpa(arpa_path)

# Print or use the vocabulary as needed
print("Vocabulary:", vocabulary)

In [None]:
# Debugging: Print the current working directory
print("Current working directory:", os.getcwd())
output_directory = "output_directory/"

# Change the working directory
os.chdir(output_directory)

# Verify the change
new_dir = os.getcwd()
print("New working directory:", new_dir)


# Debugging: Print the contents of the local directory
print("Contents of the local directory:", os.listdir("."))

# Push to Hugging Face
trigram_repo.push_to_hub(commit_message="Create trigram model with kenLM")

In [None]:
!kenlm/build/bin/build_binary output_directory/output_model.klm_trigram.arpa output_directory/output_model.klm_trigram.bin

In [1]:
from pycorrector import EnSpellCorrector

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
    # 1. 演示英文句子纠错
    sent = "what happending? how to speling it, can you gorrect it?"
    m = EnSpellCorrector()
    details = m.correct(sent)
    print(details)
    print()

[32m2024-02-15 23:21:21.535[0m | [34m[1mDEBUG   [0m | [36mpycorrector.en_spell_corrector[0m:[36m__init__[0m:[36m53[0m - [34m[1mload en spell data: /work/van-speech-nlp/jindaznb/asrenv/lib/python3.10/site-packages/pycorrector/data/en.json.gz, size: 30120[0m


{'source': 'what happending? how to speling it, can you gorrect it?', 'target': 'what happening? how to spelling it, can you correct it?', 'errors': [('happending', 'happening', 5), ('speling', 'spelling', 24), ('gorrect', 'correct', 44)]}



In [3]:
    sent = "what is your name? shylock?"
    r = m.correct(sent)
    print(r)
    print('-' * 42)
    my_dict = {'your': 120, 'name': 2, 'is': 1, 'shylock': 1, 'what': 1}  # word, freq
    spell = EnSpellCorrector(word_freq_dict=my_dict)
    r = spell.correct(sent)
    print(r)
    print()

{'source': 'what is your name? shylock?', 'target': 'what is your name? shock?', 'errors': [('shylock', 'shock', 19)]}
------------------------------------------
{'source': 'what is your name? shylock?', 'target': 'what is your name? shylock?', 'errors': []}



In [4]:
from pycorrector import EnSpellCorrector
m = EnSpellCorrector("output_directory/output_model.klm_trigram.bin")
m
sent = "what is your name? shylock?"
r = m.correct(sent)

AttributeError: 'str' object has no attribute 'values'

In [10]:
from pycorrector import Corrector
model = Corrector(language_model_path='output_directory/output_model.klm_trigram.klm')
print(model.correct('V EH R II S EH K S II'))

{'source': 'V EH R II S EH K S II', 'target': 'V EH R II S EH K S II', 'errors': []}


In [11]:
model = Corrector(language_model_path='people2014corpus_chars.klm')
print(model.correct('少先队员因该为老人让坐'))

Downloading data from https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm

KeyboardInterrupt: 

In [13]:
!echo "your sentence here" | kenlm/build/bin/query text.binary

/work/van-speech-nlp/jindaznb/jslpnb/torgo_inference_on_cluster/kenlm/util/file.cc:76 in int util::OpenReadOrThrow(const char*) threw ErrnoException because `-1 == (ret = open(name, 00))'.
No such file or directory while opening text.binary


In [14]:
!pip install jamspell

Collecting jamspell
  Downloading jamspell-0.0.12.tar.gz (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.3/174.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jamspell
  Building wheel for jamspell (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[65 lines of output][0m
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Usage of dash-separated 'description-file' will not be supported in future
  [31m   [0m         versions. Please use the underscore name 'description_file' instead.
  [31m   [0m 
  [31m   [0m         This deprecation is overdue, please update yo

In [2]:
!pip install jamspell

Collecting jamspell
  Downloading jamspell-0.0.12.tar.gz (174 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.3/174.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jamspell
  Building wheel for jamspell (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[52 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_ext
  [31m   [0m building '_jamspell' extension
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/tmp/pip-install-1xvz2hdu/jamspell_dcec735dc32243fabbd89533dbca74ad/setup.py"

In [None]:
import jamspell

corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('en.bin')

corrector.FixFragment('I am the begt spell cherken!')
# u'I am the best spell checker!'

# corrector.GetCandidates(['i', 'am', 'the', 'begt', 'spell', 'cherken'], 3)
# # (u'best', u'beat', u'belt', u'bet', u'bent', ... )

# corrector.GetCandidates(['i', 'am', 'the', 'begt', 'spell', 'cherken'], 5)
# # (u'checker', u'chicken', u'checked', u'wherein', u'coherent', ...)

: 

In [2]:
print()


