# Experiments

In [1]:
import os
import IPython.display as ipd

for filename in os.listdir('./test_audio'):
    ipd.Audio('./test_audio/' + filename)

In [None]:
import os
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf

for filename in os.listdir('./test'):
    data, sampling_rate = librosa.load('./test/' + filename)

    plt.figure(figsize=(12, 4))
    librosa.display.waveplot(data, sr=sampling_rate)

In [None]:
import speech_recognition as sr

In [None]:
r = sr.Recognizer()

In [None]:
zero = sr.AudioFile('0_jackson_2.wav')

In [None]:
with zero as source:
    audio = r.record(source)

In [None]:
print(r.recognize_google(audio))

# This is the main training part:

## training with audio files:

In [None]:
#imports:

from export_model import *
from preprocess import *
import numpy as np
import os
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import itertools
import matplotlib.pyplot as plt

In [None]:
%load_ext autoreload
%autoreload 2

# Second dimension of the feature is dim2
feature_dim_2 = 15  #11

# Save data to array file first
save_data_to_array(max_len=feature_dim_2)

# # Loading train set and test set
X_train, X_test, y_train, y_test = get_train_test()

# # Feature dimension
#defaults at the end
feature_dim_1 = 20   #20
channel = 1          #1
epochs = 50          #50
batch_size = 50      #100
verbose = 1          #1
# change num_classes depending on the amount of labels
num_classes = 12

# Reshaping to perform 2D convolution
X_train = X_train.reshape(X_train.shape[0], feature_dim_1, feature_dim_2, channel)
X_test = X_test.reshape(X_test.shape[0], feature_dim_1, feature_dim_2, channel)

y_train_hot = to_categorical(y_train)
y_test_hot = to_categorical(y_test)

## model & prediction

In [None]:
def get_model():
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=(feature_dim_1, feature_dim_2, channel)))
    model.add(Conv2D(48, kernel_size=(2, 2), activation='relu'))
    model.add(Conv2D(120, kernel_size=(2, 2), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(GaussianNoise(stddev=2))
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])
    return model

# Predicts one sample
def predict(filepath, model):
    sample = wav2mfcc(filepath, feature_dim_2)
    sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)
    return get_labels()[0][
            np.argmax(model.predict(sample_reshaped))
    ]

## Train model:

In [None]:
model = get_model()
history = model.fit(X_train, y_train_hot, batch_size=batch_size, epochs=epochs, verbose=verbose, validation_data=(X_test, y_test_hot))

## Export current model:

In [None]:
#export_model(model)

y_predicted = model.predict_classes(X_test, batch_size=batch_size)
y_true_val = np.argmax(y_test_hot,axis=1)

class_rep = classification_report(y_true_val,y_predicted,digits=5)

settings = {
    "feature_dim_1": feature_dim_1,
    "feature_dim_2": feature_dim_2,
    "channel": channel,
    "epochs": epochs,
    "batch_size": batch_size,
    "train_accuracy": str(history.history.get('acc')[-1]),
    "test_accuracy": str(history.history.get('val_acc')[-1]),
    "train_loss": str(history.history.get('loss')[-1]),
    "test_loss": str(history.history.get('val_loss')[-1]),
    "classification_report": class_rep,
}

print(export_model(model, settings))

## Import a model:

In [None]:
# import_model(PATH)
imported_model = import_model("./models/xxx")

## Predict on a new file:

In [None]:
print(predict('./test_audio/12.wav', model=model))

## Predict on a folder:

In [None]:
import re

FOLDER_PATH = './test_audio/'

truefalse = {'True': 0, 'False': 0}

for filename in os.listdir(FOLDER_PATH):
    pred = predict(FOLDER_PATH + filename, model=model)
    number = re.match(r"[^_]*", filename)
    correct = number.group(0) == pred
    truefalse[str(correct)] += 1
    print("Correct: " + str(correct) + " --> " + filename + " was predicted as: " + pred)
    
print(truefalse)

## Accuracy / Loss

In [None]:
#Accuracy plot
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

#Loss plot
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Full report with confusion matrix:

In [None]:
#should import labels directly from folder:
labels, _, _= get_labels("./audio")
labArray = []
for label in labels:
    labArray.append(label)

le = LabelEncoder()

le.fit_transform(labArray)

full_multiclass_report(model, X_test, y_test_hot, classes=le.inverse_transform(np.arange(12)))

# Functions for all the metrics and results

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        title='Normalized confusion matrix'
    else:
        title='Confusion matrix'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    
def full_multiclass_report(model,
                           x,
                           y_true,
                           classes,
                           batch_size=32,
                           binary=False):

    # 1. Transform one-hot encoded y_true into their class number
    if not binary:
        y_true = np.argmax(y_true,axis=1)
    
    # 2. Predict classes and stores in y_pred
    y_pred = model.predict_classes(x, batch_size=batch_size)
    
    # 3. Print accuracy score
    print("Accuracy : "+ str(accuracy_score(y_true,y_pred)))
    
    print("")
    
    # 4. Print classification report
    print("Classification Report")
    print(classification_report(y_true,y_pred,digits=5))    
    
    # 5. Plot confusion matrix
    cnf_matrix = confusion_matrix(y_true,y_pred)
    print(cnf_matrix)
    plot_confusion_matrix(cnf_matrix,classes=classes)

In [None]:
def plot_history(history):
    loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' not in s]
    val_loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' in s]
    acc_list = [s for s in history.history.keys() if 'acc' in s and 'val' not in s]
    val_acc_list = [s for s in history.history.keys() if 'acc' in s and 'val' in s]
    
    if len(loss_list) == 0:
        print('Loss is missing in history')
        return 
    
    ## As loss always exists
    epochs = range(1,len(history.history[loss_list[0]]) + 1)
    
    ## Loss
    plt.figure(1)
    for l in loss_list:
        plt.plot(epochs, history.history[l], 'b', label='Training loss (' + str(str(format(history.history[l][-1],'.5f'))+')'))
    for l in val_loss_list:
        plt.plot(epochs, history.history[l], 'g', label='Validation loss (' + str(str(format(history.history[l][-1],'.5f'))+')'))
    
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    
    ## Accuracy
    plt.figure(2)
    for l in acc_list:
        plt.plot(epochs, history.history[l], 'b', label='Training accuracy (' + str(format(history.history[l][-1],'.5f'))+')')
    for l in val_acc_list:    
        plt.plot(epochs, history.history[l], 'g', label='Validation accuracy (' + str(format(history.history[l][-1],'.5f'))+')')

    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Example Swiss German - Standard German data (you would extract this from your JSON)
data = [
    {"swiss_german": "I bi am abend", "standard_german": "Ich bin am Abend"},
    {"swiss_german": "Chunsch au?", "standard_german": "Kommst du auch?"},
    # Add more data...
]

# Extract the Swiss German and Standard German texts
swiss_german_texts = [item['swiss_german'] for item in data]
standard_german_texts = [item['standard_german'] for item in data]

# Tokenizer initialization and fitting
src_tokenizer = Tokenizer(filters='')  # Don't filter anything
tgt_tokenizer = Tokenizer(filters='')

src_tokenizer.fit_on_texts(swiss_german_texts)
tgt_tokenizer.fit_on_texts(standard_german_texts)

# Convert texts to sequences of integers
src_sequences = src_tokenizer.texts_to_sequences(swiss_german_texts)
tgt_sequences = tgt_tokenizer.texts_to_sequences(standard_german_texts)

# Add start and end tokens to the target sequences (for sequence-to-sequence)
START_TOKEN = tgt_tokenizer.word_index['<start>'] if '<start>' in tgt_tokenizer.word_index else 1
END_TOKEN = tgt_tokenizer.word_index['<end>'] if '<end>' in tgt_tokenizer.word_index else 2

tgt_sequences = [[START_TOKEN] + seq + [END_TOKEN] for seq in tgt_sequences]

# Padding sequences to ensure they are all the same length
MAX_SEQ_LEN = 49  # Set this according to the average length of your sentences
src_sequences = pad_sequences(src_sequences, maxlen=MAX_SEQ_LEN, padding='post')
tgt_sequences = pad_sequences(tgt_sequences, maxlen=MAX_SEQ_LEN, padding='post')

# Train-test split
x_train, x_val, y_train, y_val = train_test_split(src_sequences, tgt_sequences, test_size=0.2)

In [12]:
import tensorflow.keras.backend as K

def get_position_encoding(seq_len, d_model):
    """Generates positional encoding for the given sequence length and model dimension."""
    pos = np.arange(seq_len)[:, np.newaxis]
    i = np.arange(d_model)[np.newaxis, :]
    angle_rads = pos / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    
    # Apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # Apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    return K.constant(angle_rads, dtype=tf.float32)

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, seq_len, d_model):
        super(PositionalEncoding, self).__init__()
        self.position_encoding = get_position_encoding(seq_len, d_model)
        
    def call(self, inputs):
        return inputs + self.position_encoding

In [13]:
from tensorflow.keras.layers import Input, Embedding, LayerNormalization, Dense, Dropout, MultiHeadAttention

def transformer_encoder(inputs, num_heads, d_model, ff_dim):
    # Multi-Head Attention
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(inputs, inputs)
    attention = Dropout(0.1)(attention)
    attention = LayerNormalization(epsilon=1e-6)(inputs + attention)
    
    # Feed Forward Network
    ffn = Dense(ff_dim, activation='relu')(attention)
    ffn = Dropout(0.1)(ffn)
    ffn = Dense(d_model)(ffn)
    ffn = LayerNormalization(epsilon=1e-6)(attention + ffn)
    
    return ffn

def transformer_model(src_vocab_size, tgt_vocab_size, seq_len, d_model=256, num_heads=8, num_layers=4, ff_dim=512):
    # Input layers
    src_inputs = Input(shape=(seq_len,))
    tgt_inputs = Input(shape=(seq_len,))

    # Embedding layers
    src_embed = Embedding(src_vocab_size, d_model)(src_inputs)
    tgt_embed = Embedding(tgt_vocab_size, d_model)(tgt_inputs)

    # Add positional encoding
    src_embed = PositionalEncoding(seq_len, d_model)(src_embed)
    tgt_embed = PositionalEncoding(seq_len, d_model)(tgt_embed)

    # Transformer encoder layers
    for _ in range(num_layers):
        src_embed = transformer_encoder(src_embed, num_heads, d_model, ff_dim)

    # Transformer decoder layers
    for _ in range(num_layers):
        tgt_embed = transformer_encoder(tgt_embed, num_heads, d_model, ff_dim)

    # Final dense layer for prediction
    output = Dense(tgt_vocab_size, activation='softmax')(tgt_embed)

    # Build and compile the model
    model = tf.keras.models.Model(inputs=[src_inputs, tgt_inputs], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Define model parameters
src_vocab_size = len(src_tokenizer.word_index) + 1
tgt_vocab_size = len(tgt_tokenizer.word_index) + 1
seq_len = MAX_SEQ_LEN  # Set this according to the maximum sentence length

# Build and compile the model
model = transformer_model(src_vocab_size, tgt_vocab_size, seq_len)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 49)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 49, 256)      2048        ['input_4[0][0]']                
                                                                                                  
 positional_encoding_3 (Positio  (None, 49, 256)     0           ['embedding_3[0][0]']            
 nalEncoding)                                                                                     
                                                                                                  
 multi_head_attention_12 (Multi  (None, 49, 256)     2103552     ['positional_encoding_3[0][

In [14]:
# Shift the target sequence to prepare for teacher forcing (for training)
y_train_input = y_train[:, :-1]
y_train_output = y_train[:, 1:]

y_val_input = y_val[:, :-1]
y_val_output = y_val[:, 1:]

y_train_input = np.pad(y_train_input, ((0, 0), (0, 1)), mode='constant')
y_val_input = np.pad(y_val_input, ((0, 0), (0, 1)), mode='constant')



In [15]:
# Train the model
history = model.fit(
    [x_train, y_train_input],
    np.expand_dims(y_train_output, -1),
    epochs=10,
    batch_size=32,
    validation_data=([x_val, y_val_input], np.expand_dims(y_val_output, -1))
)

Epoch 1/10


ValueError: in user code:

    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\engine\training.py", line 998, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\engine\training.py", line 1092, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\engine\compile_utils.py", line 605, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\utils\metrics_utils.py", line 77, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\metrics\base_metric.py", line 143, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\metrics\base_metric.py", line 700, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\metrics\metrics.py", line 3669, in sparse_categorical_accuracy
        matches = metrics_utils.sparse_categorical_matches(y_true, y_pred)
    File "c:\Users\admin\anaconda\envs\vision\lib\site-packages\keras\utils\metrics_utils.py", line 970, in sparse_categorical_matches
        matches = tf.cast(tf.equal(y_true, y_pred), backend.floatx())

    ValueError: Dimensions must be equal, but are 48 and 49 for '{{node Equal}} = Equal[T=DT_FLOAT, incompatible_shape_error=true](Squeeze, Cast_2)' with input shapes: [?,48], [?,49].


In [8]:
print(y_train_input.shape)  # Should print something like (1, 50)
print(y_val_input.shape)    # Should print something like (1, 50)

(1, 49)
(1, 49)


In [None]:
from nltk.translate.bleu_score import corpus_bleu

def translate_sequence(model, input_seq, src_tokenizer, tgt_tokenizer):
    # Prepare the input sequence and target sequence for the decoder
    input_seq = pad_sequences([input_seq], maxlen=MAX_SEQ_LEN, padding='post')
    start_token = np.array([START_TOKEN])
    
    # Get initial model prediction
    decoded_sentence = []
    for _ in range(MAX_SEQ_LEN):
        prediction = model.predict([input_seq, start_token])
        predicted_token = np.argmax(prediction[0, -1, :])
        decoded_sentence.append(predicted_token)
        
        # If we reach the end token, break
        if predicted_token == END_TOKEN:
            break
        
        start_token = np.array([predicted_token])
    
    return ' '.join(tgt_tokenizer.sequences_to_texts([decoded_sentence]))

# Example Translation
swiss_german_sentence = "I bi am abend"
swiss_german_seq = src_tokenizer.texts_to_sequences([swiss_german_sentence])
translated = translate_sequence(model, swiss_german_seq[0], src_tokenizer, tgt_tokenizer)
print("Translated:", translated)