# Install & Import Packages

In [1]:
pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install h5py
!pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D, BatchNormalization, Dropout, LSTM, TimeDistributed
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import VGG16
from tensorflow.image import resize

import warnings
warnings.filterwarnings('ignore')

2024-04-25 20:51:06.655575: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-25 20:51:06.700122: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Dataset & Split

In [4]:
print("Current working directory:", os.getcwd())

Current working directory: /projectnb/ba865/projects/Group3_A1


In [5]:
# extract MFCCs and labels from Pickle files
base_path = '/projectnb/ba865/projects/Group3_A1/dataset_pkl'
pkl_files = [file for file in os.listdir(base_path) if file.endswith('.pkl')]

label_counts = {}
all_mfccs = []
all_labels = []

max_cols = 0
for pkl_file in pkl_files:
    file_path = os.path.join(base_path, pkl_file)
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
        i = 0
        for item in data:
            if i >= 20000:
                break
            mfccs = item['mfccs']
            labels = item['label']
            
            # this was previously used to count the number of MFCCs from each language and reduce sample size
            if labels not in label_counts:
                label_counts[labels] = 0
            label_counts[labels] += 1

            # tracks max_cols to pad correctly later
            max_cols = max(max_cols, mfccs.shape[1])

            all_mfccs.append(mfccs)
            all_labels.append(labels)
            i += 1  

In [60]:
label_counts

{'russian': 20000,
 'thai': 20000,
 'arabic': 20000,
 'japanese': 20000,
 'english': 20000,
 'chinese': 20000,
 'spanish': 20000,
 'german': 20000,
 'french': 20000}

In [6]:
all_labels[1]

'russian'

In [7]:
# pad MFCCs to ensure same shape across
padded_mfccs = [np.pad(mfcc, ((0, 0), (0, max_cols - mfcc.shape[1])), mode='constant', constant_values=0) for mfcc in all_mfccs]

mfccs = np.stack(padded_mfccs)

# encode the lebels
encoder = LabelEncoder()
labels_encoded = encoder.fit_transform(all_labels)

# convert MFCCs and labels to tensor
mfccs_tensor=tf.convert_to_tensor(padded_mfccs,dtype=tf.float32)
labels_tensor = tf.convert_to_tensor(np.array(labels_encoded), dtype=tf.int32)

print(f"Shape of the MFCCs tensor: {mfccs_tensor.shape}")
print(f"Shape of the labels tensor: {labels_tensor.shape}")

Shape of the MFCCs tensor: (180000, 13, 972)
Shape of the labels tensor: (180000,)


2024-04-25 20:51:23.328447: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
mfccs.dtype

dtype('float32')

In [9]:
labels_encoded.dtype

dtype('int64')

In [28]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(mfccs, labels_encoded, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# reshape for model input
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], X_val.shape[2], 1)

# encoding the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# one-hot encode the labels
y_train_one_hot = to_categorical(y_train_encoded, num_classes=9)
y_val_one_hot = to_categorical(y_val_encoded, num_classes=9)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=9)

In [11]:
X_train.shape

(115200, 13, 972, 1)

# Random Forest

In [12]:
# flatten the data to feed into the model
mfccs_array=np.array(mfccs_tensor)
mfccs_flattened = np.array([mfcc.flatten() for mfcc in mfccs_array])

# train-test split
X_train, X_test, y_train, y_test = train_test_split(mfccs_flattened, all_labels, test_size=0.2, random_state=42)

# initialize and train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train) 

# evaluate the model and print accuracy
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print('Random Forest test accuracy:', accuracy_rf)

Random Forest test accuracy: 0.4428888888888889


# CNN

In [19]:
# design CNN model
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.1),
    
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.1),
    
    GlobalAveragePooling2D(),
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(9, activation='softmax')
])

# compile and fit the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train_one_hot, epochs=15, batch_size=64, validation_data=(X_val, y_val_one_hot))

Epoch 1/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 212ms/step - accuracy: 0.4016 - loss: 1.6902 - val_accuracy: 0.5042 - val_loss: 1.4501
Epoch 2/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 211ms/step - accuracy: 0.5469 - loss: 1.3128 - val_accuracy: 0.5562 - val_loss: 1.2758
Epoch 3/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 215ms/step - accuracy: 0.5681 - loss: 1.2456 - val_accuracy: 0.5815 - val_loss: 1.2370
Epoch 4/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 213ms/step - accuracy: 0.5904 - loss: 1.1902 - val_accuracy: 0.5500 - val_loss: 1.2849
Epoch 5/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 211ms/step - accuracy: 0.5970 - loss: 1.1685 - val_accuracy: 0.6031 - val_loss: 1.1597
Epoch 6/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 211ms/step - accuracy: 0.6106 - loss: 1.1373 - val_accuracy: 0.5878 - val_loss:

<keras.src.callbacks.history.History at 0x1504c0f62d70>

In [24]:
# evaluate model accuracy on test set
test_loss, test_acc = model.evaluate(X_test, y_test_one_hot)
print('Test accuracy:', test_acc)

[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.6545 - loss: 1.0259
Test accuracy: 0.6516388654708862


# LSTM

In [26]:
# design LSTM model
lstm_model = Sequential([
    LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    Dropout(0.1),  # Apply dropout after the first LSTM layer
    BatchNormalization(),  # Optionally add batch normalization (though not typical for LSTMs)
    
    LSTM(64, return_sequences=False),
    Dropout(0.1),  # Apply dropout after the second LSTM layer
    
    Dense(128, activation='relu'),
    Dropout(0.1),  # Additional dropout before the final layer
    
    Dense(9, activation='softmax')
])

# compile and fit the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train_one_hot, epochs=15, batch_size=64, validation_data=(X_val, y_val_one_hot))

Epoch 1/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 21ms/step - accuracy: 0.2784 - loss: 1.9457 - val_accuracy: 0.4018 - val_loss: 1.6881
Epoch 2/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 20ms/step - accuracy: 0.4008 - loss: 1.6916 - val_accuracy: 0.4333 - val_loss: 1.6129
Epoch 3/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 20ms/step - accuracy: 0.4402 - loss: 1.6016 - val_accuracy: 0.4527 - val_loss: 1.5622
Epoch 4/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 20ms/step - accuracy: 0.4605 - loss: 1.5499 - val_accuracy: 0.4607 - val_loss: 1.5378
Epoch 5/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 20ms/step - accuracy: 0.4717 - loss: 1.5065 - val_accuracy: 0.4712 - val_loss: 1.5084
Epoch 6/15
[1m1800/1800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 20ms/step - accuracy: 0.4871 - loss: 1.4779 - val_accuracy: 0.4842 - val_loss: 1.4837
Epoc

<keras.src.callbacks.history.History at 0x1504641bd840>

In [28]:
# reshape X_test
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

# evaluate model accuracy on test set
test_loss, test_acc = lstm_model.evaluate(X_test, y_test_one_hot)
print('Test accuracy:', test_acc)

[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.5252 - loss: 1.3805
Test accuracy: 0.5286666750907898


# CRNN

In [20]:
# design CRNN model
crnn_model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.1),
    
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.1),
    
    TimeDistributed(Flatten()),
    
    LSTM(128, return_sequences=True),
    Dropout(0.1),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(9, activation='softmax')
])

# compile and fit the model
crnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
crnn_model.fit(X_train, y_train_one_hot, epochs=15, batch_size=32, validation_data=(X_val, y_val_one_hot))

Epoch 1/15
[1m3600/3600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 53ms/step - accuracy: 0.2636 - loss: 1.9438 - val_accuracy: 0.3776 - val_loss: 1.7404
Epoch 2/15
[1m3600/3600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 47ms/step - accuracy: 0.3975 - loss: 1.6829 - val_accuracy: 0.4218 - val_loss: 1.6307
Epoch 3/15
[1m3600/3600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 49ms/step - accuracy: 0.4376 - loss: 1.5966 - val_accuracy: 0.4784 - val_loss: 1.4886
Epoch 4/15
[1m3600/3600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 49ms/step - accuracy: 0.4651 - loss: 1.5396 - val_accuracy: 0.4805 - val_loss: 1.4739
Epoch 5/15
[1m3600/3600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 50ms/step - accuracy: 0.4695 - loss: 1.5257 - val_accuracy: 0.4595 - val_loss: 1.5402
Epoch 6/15
[1m3600/3600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 50ms/step - accuracy: 0.4816 - loss: 1.4974 - val_accuracy: 0.5185 - val_loss: 1.407

<keras.src.callbacks.history.History at 0x150e7c66edd0>

In [29]:
# reshape X_test and convert X_test and y_test_one_hot to tensor
X_test = X_test[..., np.newaxis] 
X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
y_test_one_hot = tf.convert_to_tensor(y_test_one_hot, dtype=tf.float32)

# evaluate model accuracy on test set
test_loss, test_acc = crnn_model.evaluate(X_test, y_test_one_hot)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.5350 - loss: 1.3629
Test accuracy: 0.5353333353996277
Test loss: 1.3585960865020752


# Transfer Learning: VGG16

In [16]:
# convert X_train to match the model
X_train_rgb = np.repeat(X_train, 3, axis=-1)

# resize X_train
X_train_resized = np.array([resize(img, (32, 32)).numpy() for img in X_train_rgb])

# encode y_train
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=9)

# train test split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_resized, y_train_one_hot, test_size=0.2, random_state=42)

# load VGG16 model without the final layer
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(32, 32, 3))

# freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# add custom layers
x = Flatten()(base_model.output)
x = Dense(128, activation='relu')(x)
predictions = Dense(9, activation='softmax')(x)

# create the model
transfer_model = Model(inputs=base_model.input, outputs=predictions)
transfer_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# train the model
transfer_model.fit(X_train_split, y_train_split, epochs=15, batch_size=32, validation_data=(X_val_split, y_val_split))

Epoch 1/15
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 26ms/step - accuracy: 0.2263 - loss: 2.1173 - val_accuracy: 0.2647 - val_loss: 1.9887
Epoch 2/15
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 24ms/step - accuracy: 0.2691 - loss: 1.9838 - val_accuracy: 0.2734 - val_loss: 1.9601
Epoch 3/15
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 26ms/step - accuracy: 0.2769 - loss: 1.9601 - val_accuracy: 0.2789 - val_loss: 1.9472
Epoch 4/15
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 24ms/step - accuracy: 0.2854 - loss: 1.9429 - val_accuracy: 0.2830 - val_loss: 1.9485
Epoch 5/15
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 26ms/step - accuracy: 0.2912 - loss: 1.9322 - val_accuracy: 0.2920 - val_loss: 1.9281
Epoch 6/15
[1m2880/2880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 26ms/step - accuracy: 0.2897 - loss: 1.9288 - val_accuracy: 0.2873 - val_loss: 1.9334
Epoc

<keras.src.callbacks.history.History at 0x15115e08e770>

In [19]:
# convert X_test to match the model
X_test_rgb = np.repeat(X_test[..., np.newaxis], 3, axis=-1)  

# resize X_test
X_test_resized = np.array([resize(img, (32, 32)).numpy() for img in X_test_rgb])

# encode y_test
y_test_encoded = label_encoder.transform(y_test)  # using the same encoder as for y_train
y_test_one_hot = to_categorical(y_test_encoded, num_classes=9)

# evaluate model's accuracy on test set
test_loss, test_acc = transfer_model.evaluate(X_test_resized, y_test_one_hot)
print('Test accuracy:', test_acc)

[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 21ms/step - accuracy: 0.3025 - loss: 1.9162
Test accuracy: 0.300861120223999
