# Project Idea: Text-to-Phoneme Conversion

**Objective**: A critical step in many TTS systems is converting the text into phonemes (distinct units of sound). This project involve building a model that takes standard English text and predicts its phonemic transcription in X-SAMPA format.

**Steps**:

1. **Data Preparation**:
    - Split CSV file into training, validation, and test sets.

2. **Preprocessing**:
    - Character tokenization the English text
    - Tokenize the X-SAMPA transcriptions. Here, each unique X-SAMPA symbol can be a token.
    - Convert these tokens to integer representations.

3. **Modeling**:
    - Sequence-to-sequence models: LSTM
    - English text input tokenization

4. **Training & Evaluation**:
    - Train your model on the training set, evaluate on the validation set, and fine-tune accordingly.
    - Once satisfied, evaluate its performance on the test set.

5. **Inference**:
    - Build a function where you input standard English text, and it outputs the predicted X-SAMPA transcription.

**Technologies**:
- Python
    - ``csv``
    - ``keras``
    - ``sklearn``
- X-Sampa
- librosa (for audio processing)

#### 1.1 Split CSV file into training, validation, and test sets.

In [11]:
import pandas as pd

# Load the CSV data
data = pd.read_csv('transcription.csv')

from sklearn.model_selection import train_test_split

# Group by WordID
grouped = list(data.groupby('WordID'))

# Split the grouped data
train, temp = train_test_split(grouped, test_size=0.3, random_state=42)  
valid, test = train_test_split(temp, test_size=0.5, random_state=42)  

# Extracting the dataframes from the groups
train_df = pd.concat([item[1] for item in train])
valid_df = pd.concat([item[1] for item in valid])
test_df = pd.concat([item[1] for item in test])

# X-SAMPA tokenization
def tokenize_x_sampa(transcription):
    # List of known multi-character X-SAMPA symbols 
    # This list might need more symbols based on the entire dataset
    multi_char_symbols = [":l", "^m", "O:"]
    
    tokens = []
    for symbol in transcription.split():
        if symbol in multi_char_symbols:
            tokens.append(symbol)
        else:
            tokens.extend(list(symbol))
    return tokens

# Apply the function to the Transcription column for each dataframe
train_df['tokenized_x_sampa'] = train_df['Transcription'].apply(tokenize_x_sampa)
valid_df['tokenized_x_sampa'] = valid_df['Transcription'].apply(tokenize_x_sampa)
test_df['tokenized_x_sampa'] = test_df['Transcription'].apply(tokenize_x_sampa)

# Display the first few rows for verification
print(train_df[['Transcription', 'tokenized_x_sampa']].head())

    Transcription                     tokenized_x_sampa
34  fAr h@"raIz@n  [f, A, r, h, @, ", r, a, I, z, @, n]
35  fA: h@"raIz@n  [f, A, :, h, @, ", r, a, I, z, @, n]
92  Endl@s oUS@nz  [E, n, d, l, @, s, o, U, S, @, n, z]
93  EndlIs oUS@nz  [E, n, d, l, I, s, o, U, S, @, n, z]
98   glIs.nIN du:     [g, l, I, s, ., n, I, N, d, u, :]


#### 2.1 Character tokenization the English text

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Function to convert the list of tokens back to space-separated string
def tokens_to_string(tokens):
    return ' '.join(tokens)

# Convert the list of tokens in 'tokenized_x_sampa' back to space-separated string
train_df['x_sampa_str'] = train_df['tokenized_x_sampa'].apply(tokens_to_string)

# Initialize a CountVectorizer
vectorizer = CountVectorizer(analyzer='word', token_pattern=r"(?:\S+)")
# We're using the token_pattern parameter to make sure we capture the entire X-SAMPA symbols

# Apply the vectorizer on the 'x_sampa_str' column
X = vectorizer.fit_transform(train_df['x_sampa_str'])

# Getting the token names
tokens = vectorizer.get_feature_names()

print("Tokens (X-SAMPA symbols):", tokens)



Tokens (X-SAMPA symbols): ['"', '.', '3', ':', '@', '^', 'a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ð', 'ŋ']


#### 2.2 Tokenize the X-SAMPA transcriptions.

In [14]:
# List of known multi-character X-SAMPA symbols (this is just a sample; there might be more)
multi_char_symbols = [":l", "^m", "O:"]
# If there are more multi-character symbols in your data, add them to the list

def tokenize_x_sampa(transcription):
    tokens = []
    i = 0
    while i < len(transcription):
        # Check if the current char and next char form a multi-char symbol
        if i < len(transcription) - 1 and transcription[i:i+2] in multi_char_symbols:
            tokens.append(transcription[i:i+2])
            i += 2
        # Check for single-char symbols
        elif transcription[i] != ' ':
            tokens.append(transcription[i])
            i += 1
        # If it's just a space, skip to the next character
        else:
            i += 1
    return tokens

# Apply the function to the Transcription column of train_df
train_df['tokenized_x_sampa'] = train_df['Transcription'].apply(tokenize_x_sampa)

# Display the results for train_df
print(train_df[['Transcription', 'tokenized_x_sampa']])



      Transcription                      tokenized_x_sampa
34    fAr h@"raIz@n   [f, A, r, h, @, ", r, a, I, z, @, n]
35    fA: h@"raIz@n   [f, A, :, h, @, ", r, a, I, z, @, n]
92    Endl@s oUS@nz   [E, n, d, l, @, s, o, U, S, @, n, z]
93    EndlIs oUS@nz   [E, n, d, l, I, s, o, U, S, @, n, z]
98     glIs.nIN du:      [g, l, I, s, ., n, I, N, d, u, :]
..              ...                                    ...
153  wO:m "s^nlaIt"  [w, O:, m, ", s, ^, n, l, a, I, t, "]
70       raIzIN s^n            [r, a, I, z, I, N, s, ^, n]
71       raIzIN s^n            [r, a, I, z, I, N, s, ^, n]
172   sAft "pIloUz"   [s, A, f, t, ", p, I, l, o, U, z, "]
173   sOft "pIl@Uz"   [s, O, f, t, ", p, I, l, @, U, z, "]

[160 rows x 2 columns]


#### 2.3 Convert these tokens to integer representations.

In [15]:
from keras.preprocessing.text import Tokenizer

# Convert the list of tokens back to space-separated strings for both English and X-SAMPA
train_df['english_str'] = train_df['Word'].apply(lambda x: ' '.join(list(x)))  # For English character tokenization
train_df['x_sampa_str'] = train_df['tokenized_x_sampa'].apply(' '.join)

valid_df['english_str'] = valid_df['Word'].apply(lambda x: ' '.join(list(x)))
valid_df['x_sampa_str'] = valid_df['tokenized_x_sampa'].apply(' '.join)

test_df['english_str'] = test_df['Word'].apply(lambda x: ' '.join(list(x)))
test_df['x_sampa_str'] = test_df['tokenized_x_sampa'].apply(' '.join)

# Initialize and fit the tokenizer for English text
english_tokenizer = Tokenizer(char_level=False, lower=True, split=' ')
english_tokenizer.fit_on_texts(train_df['english_str'])

# Convert English tokens to integer sequences
train_df['english_seq'] = english_tokenizer.texts_to_sequences(train_df['english_str'])
valid_df['english_seq'] = english_tokenizer.texts_to_sequences(valid_df['english_str'])
test_df['english_seq'] = english_tokenizer.texts_to_sequences(test_df['english_str'])

# Initialize and fit the tokenizer for X-SAMPA transcriptions
x_sampa_tokenizer = Tokenizer(char_level=False, lower=False, split=' ')
x_sampa_tokenizer.fit_on_texts(train_df['x_sampa_str'])

# Convert X-SAMPA tokens to integer sequences
train_df['x_sampa_seq'] = x_sampa_tokenizer.texts_to_sequences(train_df['x_sampa_str'])
valid_df['x_sampa_seq'] = x_sampa_tokenizer.texts_to_sequences(valid_df['x_sampa_str'])
test_df['x_sampa_seq'] = x_sampa_tokenizer.texts_to_sequences(test_df['x_sampa_str'])

# Let's check the conversion
print(train_df[['english_str', 'english_seq', 'x_sampa_str', 'x_sampa_seq']].head())


                    english_str                                 english_seq  \
34        F a r   h o r i z o n         [19, 8, 5, 11, 10, 5, 2, 23, 10, 4]   
35        F a r   h o r i z o n         [19, 8, 5, 11, 10, 5, 2, 23, 10, 4]   
92  E n d l e s s   o c e a n s  [3, 4, 12, 6, 3, 1, 1, 10, 16, 3, 8, 4, 1]   
93  E n d l e s s   o c e a n s  [3, 4, 12, 6, 3, 1, 1, 10, 16, 3, 8, 4, 1]   
98  G l i s t e n i n g   d e w   [7, 6, 2, 1, 9, 3, 4, 2, 4, 7, 12, 3, 13]   

                x_sampa_str                            x_sampa_seq  
34  f A r h @ " r a I z @ n         [21, 14, 5, 25, 5, 9, 1, 7, 3]  
35  f A : h @ " r a I z @ n            [21, 14, 25, 5, 9, 1, 7, 3]  
92  E n d l @ s o U S @ n z     [26, 3, 8, 4, 2, 24, 12, 18, 3, 7]  
93  E n d l I s o U S @ n z  [26, 3, 8, 4, 1, 2, 24, 12, 18, 3, 7]  
98    g l I s . n I N d u :         [19, 4, 1, 2, 3, 1, 10, 8, 29]  


#### 3.1 Split CSV file into training, validation, and test sets.

In [17]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

# Define parameters
english_vocab_size = len(english_tokenizer.word_index) + 1
x_sampa_vocab_size = len(x_sampa_tokenizer.word_index) + 1
embedding_dim = 256
lstm_units = 512

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(english_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(x_sampa_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(x_sampa_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    6656        input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    9216        input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
#### 3.2 Split CSV file into training, validation, and test sets.

In [18]:
from keras.preprocessing.sequence import pad_sequences

# Ensure all sequences have the same length
max_english_seq_length = max([len(seq) for seq in train_df['Word']])
max_x_sampa_seq_length = max([len(seq) for seq in train_df['x_sampa_str']])

# Convert English characters to integers
english_tokenizer = Tokenizer(char_level=True, oov_token='UNK')
english_tokenizer.fit_on_texts(train_df['Word'])
english_int_seq = english_tokenizer.texts_to_sequences(train_df['Word'])
english_int_seq_padded = pad_sequences(english_int_seq, maxlen=max_english_seq_length, padding='post')

# Convert X-SAMPA tokens to integers
x_sampa_tokenizer = Tokenizer(oov_token='UNK')
x_sampa_tokenizer.fit_on_texts(train_df['x_sampa_str'])
x_sampa_int_seq = x_sampa_tokenizer.texts_to_sequences(train_df['x_sampa_str'])
x_sampa_int_seq_padded = pad_sequences(x_sampa_int_seq, maxlen=max_x_sampa_seq_length, padding='post')

# Preparing decoder input and output. Decoder input is the original sequence, and the output is the sequence shifted by one timestep.
decoder_input_data = x_sampa_int_seq_padded[:, :-1]
decoder_target_data = x_sampa_int_seq_padded[:, 1:]

# The encoder input remains the same
encoder_input_data = english_int_seq_padded


In [None]:
#### 4.1 Split CSV file into training, validation, and test sets.

In [21]:
# Convert and pad validation sequences for English
valid_english_int_seq = english_tokenizer.texts_to_sequences(valid_df['Word'])
valid_english_int_seq_padded = pad_sequences(valid_english_int_seq, maxlen=max_english_seq_length, padding='post')

# Convert and pad validation sequences for X-SAMPA
valid_x_sampa_int_seq = x_sampa_tokenizer.texts_to_sequences(valid_df['x_sampa_str'])
valid_x_sampa_int_seq_padded = pad_sequences(valid_x_sampa_int_seq, maxlen=max_x_sampa_seq_length, padding='post')

# Prepare decoder input and output data for the validation set
valid_decoder_input_data = valid_x_sampa_int_seq_padded[:, :-1]
valid_decoder_target_data = valid_x_sampa_int_seq_padded[:, 1:]

# The encoder input for validation remains the same
valid_encoder_input_data = valid_english_int_seq_padded

batch_size = 64
epochs = 30

# Convert the decoder target data for training set to one-hot encoding for training
decoder_target_onehot = np.zeros((len(train_df), max_x_sampa_seq_length-1, len(x_sampa_tokenizer.word_index)+1), dtype='float32')  # Notice the change in dimensions
for i, seq in enumerate(decoder_target_data):
    for j, token in enumerate(seq):
        if token > 0:
            decoder_target_onehot[i, j, token] = 1.

# Convert the decoder target data for validation set to one-hot encoding for validation
valid_decoder_target_onehot = np.zeros((len(valid_df), max_x_sampa_seq_length-1, len(x_sampa_tokenizer.word_index)+1), dtype='float32')  # Notice the change in dimensions
for i, seq in enumerate(valid_decoder_target_data):
    for j, token in enumerate(seq):
        if token > 0:
            valid_decoder_target_onehot[i, j, token] = 1.

history = model.fit(
    [encoder_input_data, decoder_input_data], 
    decoder_target_onehot,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([valid_encoder_input_data, valid_decoder_input_data], valid_decoder_target_onehot)
)


Epoch 1/30


ValueError: in user code:

    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\engine\training.py:853 train_function  *
        return step_function(self, iterator)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\engine\training.py:842 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\engine\training.py:835 run_step  **
        outputs = model.train_step(data)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\engine\training.py:789 train_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\engine\compile_utils.py:201 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\losses.py:141 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\losses.py:245 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\losses.py:1666 categorical_crossentropy
        y_true, y_pred, from_logits=from_logits, axis=axis)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\keras\backend.py:4839 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\josem\anaconda3\envs\mlcc\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1161 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 34, 27) and (None, 34, 36) are incompatible


In [None]:
#### 4.2 Split CSV file into training, validation, and test sets.

In [None]:
#### 5.1 Split CSV file into training, validation, and test sets.