In [2]:
from transformers import AutoModel, AutoTokenizer
from text2phonemesequence import Text2PhonemeSequence
import torch

# Load XPhoneBERT model and its tokenizer
xphonebert = AutoModel.from_pretrained("vinai/xphonebert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/xphonebert-base")

# Load Text2PhonemeSequence
# text2phone_model = Text2PhonemeSequence(language='eng-us', is_cuda=True)
text2phone_model = Text2PhonemeSequence(language='jpn', is_cuda=True)

# Input sequence that is already WORD-SEGMENTED (and text-normalized if applicable)
sentence = "That is , it is a testing text ."  
# sentence = "これ は 、 テスト テキスト です ."

input_phonemes = text2phone_model.infer_sentence(sentence)

input_ids = tokenizer(input_phonemes, return_tensors="pt")

with torch.no_grad():
    features = xphonebert(**input_ids)

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/xphonebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
features = features.last_hidden_state
print("Input phonemes:", input_phonemes)
print("Input IDs:", input_ids)
print("Features shape:", features.shape)
# The features are the last hidden state of the model

Input phonemes: t a t ▁ i s ▁ , ▁ i t ɕ i ▁ i s ▁ a ▁ t ɛ s t i ŋ ▁ t e k s t ▁ .
Input IDs: {'input_ids': tensor([[ 0,  7,  6,  7,  4,  8,  9,  4, 29,  4,  8,  7, 45,  8,  4,  8,  9,  4,
          6,  4,  7, 17,  9,  7,  8, 35,  4,  7, 14, 13,  9,  7,  4, 33,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Features shape: torch.Size([1, 35, 768])


## Simple tokenizer

In [4]:
import os
import re
import string
import pickle
from collections import Counter

class SimpleTokenizer:
    def __init__(self, vocab_size=1000):
        """Simple word-level tokenizer with fixed vocabulary size."""
        self.vocab_size = vocab_size
        self.word_to_id = {"<PAD>": 0, "<UNK>": 1}
        self.id_to_word = {0: "<PAD>", 1: "<UNK>"}
        self.next_id = 2  # Start from 2 (0=PAD, 1=UNK)
        
    def fit(self, texts):
        """Build vocabulary from texts based on word frequency."""
        # Count word occurrences
        word_counts = Counter()
        for text in texts:
            words = text.lower().split()
            word_counts.update(words)
        
        # Select top words for vocabulary
        top_words = [word for word, _ in word_counts.most_common(self.vocab_size - 2)]  # -2 for <PAD> and <UNK>
        
        # Build mappings
        for word in top_words:
            if self.next_id < self.vocab_size:
                self.word_to_id[word] = self.next_id
                self.id_to_word[self.next_id] = word
                self.next_id += 1
        
        print(f"Vocabulary built with {len(self.word_to_id)} words")
        return self
    
    def tokenize(self, text):
        """Convert text to token IDs."""
        words = text.lower().split()
        return [self.word_to_id.get(word, self.word_to_id["<UNK>"]) for word in words]
    
    def decode(self, ids):
        """Convert token IDs back to words."""
        return [self.id_to_word.get(id, "<UNK>") for id in ids]
    
    def convert_ids_to_tokens(self, ids):
        """Alias for decode to match HuggingFace API."""
        return self.decode(ids)
    
    def save(self, path):
        """Save tokenizer to file."""
        with open(path, 'wb') as f:
            pickle.dump({
                'vocab_size': self.vocab_size,
                'word_to_id': self.word_to_id,
                'id_to_word': self.id_to_word,
                'next_id': self.next_id
            }, f)
        print(f"Tokenizer saved to {path}")
    
    @classmethod
    def load(cls, path):
        """Load tokenizer from file."""
        with open(path, 'rb') as f:
            data = pickle.load(f)
        
        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.word_to_id = data['word_to_id']
        tokenizer.id_to_word = data['id_to_word']
        tokenizer.next_id = data['next_id']
        
        print(f"Tokenizer loaded from {path} with {len(tokenizer.word_to_id)} words")
        return tokenizer

def process_text_files(folder_path):
    """Process all .txt files in a folder and return cleaned texts."""
    punct_pattern = re.compile(f'[{re.escape(string.punctuation)}]')
    texts = []
    
    # Get list of all text files
    txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
    print(f"Found {len(txt_files)} text files in {folder_path}")
    
    # Process each file
    for filename in txt_files:
        filepath = os.path.join(folder_path, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
                
            # Remove punctuation and convert to lowercase
            text = punct_pattern.sub(' ', text).lower()
            # Normalize whitespace
            text = ' '.join(text.split())
            
            texts.append(text)
            
        except Exception as e:
            print(f"Error processing {filename}: {e}")
    
    print(f"Processed {len(texts)} text files")
    return texts

def main():
    # Configuration
    folder_path = "../../data/voicebank_demand/trainset_28spk_txt"  # Change this to your folder path
    vocab_size = 1000
    output_path = "simple_tokenizer.pkl"
    
    # Process text files
    texts = process_text_files(folder_path)
    
    # Train tokenizer
    tokenizer = SimpleTokenizer(vocab_size=vocab_size)
    tokenizer.fit(texts)
    
    # Save tokenizer
    tokenizer.save(output_path)
    
    # Example usage
    if texts:
        sample_text = texts[0][:100]  # First 100 chars of first text
        print(f"\nSample text: '{sample_text}'")
        
        tokens = tokenizer.tokenize(sample_text)
        print(f"Tokenized: {tokens}")
        
        decoded = tokenizer.decode(tokens)
        print(f"Decoded: {' '.join(decoded)}")
        
        # Vocabulary stats
        print(f"\nVocabulary size: {len(tokenizer.word_to_id)}")
        print(f"Top 10 words: {list(tokenizer.word_to_id.keys())[:12]}")  # First 12 includes <PAD> and <UNK>

if __name__ == "__main__":
    main()

Found 11572 text files in ../../data/voicebank_demand/trainset_28spk_txt
Processed 11572 text files
Vocabulary built with 1000 words
Tokenizer saved to simple_tokenizer.pkl

Sample text: 'but you can go beyond that condition'
Tokenized: [38, 34, 44, 117, 173, 14, 278]
Decoded: but you can go beyond that condition

Vocabulary size: 1000
Top 10 words: ['<PAD>', '<UNK>', 'the', 'a', 'it', 'is', 'to', 'was', 'of', 'in', 'i', 'we']
