In [4]:
# Set main root: run the project's setup helper before any project module is imported.
# NOTE(review): presumably this puts the project root on the import path / working
# directory so the `langaugedetection` imports in later cells resolve — confirm in setup.py.
from setup import setup_project_root
setup_project_root()

!hostname

node064


In [5]:
from langaugedetection.data.com_voice_dir import open_files, person_to_group

# Languages included in this run; 'ja', 'nl' and 'ta' are currently left out.
languages = ['de', 'en', 'es', 'it']

# One table of recordings per language, as returned by open_files.
# NOTE(review): 100_000 is presumably the maximum number of rows to load per
# language — confirm against open_files.
files = {}
for lang in languages:
    files[lang] = open_files(lang, 100_000)

# Group each language's table by speaker with person_to_group.
# NOTE(review): the result is consumed later as {speaker: table of that speaker's
# recordings}; confirm the exact value type in person_to_group.
lang_people = {}
for lang, table in files.items():
    lang_people[lang] = person_to_group(table)

# Report the number of distinct speakers found per language.
for lang in languages:
    groups = lang_people[lang]
    print(f'Unique {lang} speakers: {len(groups)}')

Loading de from: /om2/user/moshepol/prosody/data/raw_audio/de/validated.tsv
Loading en from: /om2/user/moshepol/prosody/data/raw_audio/en/validated.tsv
Loading es from: /om2/user/moshepol/prosody/data/raw_audio/es/validated.tsv
Loading it from: /om2/user/moshepol/prosody/data/raw_audio/it/validated.tsv
Unique de speakers: 17667
Unique en speakers: 77437
Unique es speakers: 16927
Unique it speakers: 6688


In [6]:
from langaugedetection.data.audio_length import valid_paths, random_audio
import random

# Upper limit on how many entries are sampled for each speaker.
choices = 2

lang_people_to_paths = {}

# TODO: speed-up — run this once and cache the result to .csv.
# TODO: maybe cap the maximum number of speakers per language.
# NOTE(review): no random seed is set in the visible cells, so the sample
# presumably differs from run to run.

for lang, speakers in lang_people.items():

    sampled = {}
    for speaker, speaker_table in speakers.items():
        # NOTE(review): the meaning of the 5 and 1 arguments is defined by
        # valid_paths (audio_length module) — confirm there.
        candidates = valid_paths(speaker_table, 5, 1, lang)

        # Take `choices` entries, or every entry when the speaker has fewer.
        sampled[speaker] = random.sample(candidates, min(len(candidates), choices))

    lang_people_to_paths[lang] = sampled

    print(f'Language: {lang} complete')

Language: de complete


KeyboardInterrupt: 

In [None]:
# Report how many speakers each language ended up with after sampling.
for lang in lang_people_to_paths:
    df = lang_people_to_paths[lang]  # {speaker: sampled entries} for this language
    print(f'Language: {lang} with {len(df)} speakers')

Language: de with 11073 speakers
Language: en with 28449 speakers
Language: es with 12099 speakers
Language: it with 6157 speakers


In [None]:
def chooses_speaker(d, fraction = .8):
    '''
    Randomly splits the values of a dictionary into a training and a testing group.

    The split is made on the keys, so everything stored under one key ends up
    entirely in training or entirely in testing.

    Parameters:
        d: dictionary to split.
        fraction: share of the keys (between 0 and 1) assigned to training;
            the resulting count is rounded down. Defaults to 80%.

    Returns:
        (train_values, test_values): two lists holding d[key] for the training
        keys and for the remaining keys respectively.

    Raises:
        ValueError: if fraction is outside the range [0, 1].
    '''
    if not 0 <= fraction <= 1:
        raise ValueError(f'fraction must be between 0 and 1, got {fraction}')

    keys = list(d.keys())

    # Use the caller's fraction; it was previously ignored in favour of a hard-coded .8
    train_keys = random.sample(keys, int(fraction * len(keys)))
    train_lookup = set(train_keys)

    # Ordered lists (rather than sets) decide the output order, so seeding
    # `random` reproduces exactly the same split.
    test_keys = [key for key in keys if key not in train_lookup]

    return [d[key] for key in train_keys], [d[key] for key in test_keys]


# Per-language flat lists of sampled entries, filled in by the split loop below.
lang_train, lang_test = {}, {}

def flatten(array):
    '''
    Flattens one level of nesting.

    Parameters:
        array: iterable whose elements are either lists/tuples or single values.

    Returns:
        A new list in which every list/tuple element has been replaced by its
        items; all other elements (including strings) are kept as they are.
    '''
    flat = []
    for item in array:
        # Test the element itself. The previous check looked at the accumulator,
        # which is always a list, so the single-value branch could never run.
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)

    return flat

# Split every language by speaker, then collapse the per-speaker lists into
# one flat list per language and split.
for lang in lang_people_to_paths:
    train_groups, test_groups = chooses_speaker(lang_people_to_paths[lang])

    lang_train[lang] = flatten(train_groups)
    lang_test[lang] = flatten(test_groups)

In [None]:
# Running minima start from a large upper bound and shrink to the size of
# the smallest language split.
min_train = 1_000_000
min_test = 1_000_000

# lang_train and lang_test are filled side by side, so their orders line up.
for lang, train_item, test_item in zip(lang_test, lang_train.values(), lang_test.values()):
    print(f'Language: {lang}, Training: {len(train_item)}, Testing: {len(test_item)}')

    if len(train_item) < min_train:
        min_train = len(train_item)
    if len(test_item) < min_test:
        min_test = len(test_item)

print(f'Training Size will be: {min_train}')
print(f'Testing Size will be: {min_test}')

Language: de, Training: 3762, Testing: 938
Language: en, Training: 2341, Testing: 641
Language: es, Training: 4488, Testing: 1073
Language: it, Training: 3157, Testing: 860


In [None]:
# Shorten to shortest: every language is cut to the smallest split size,
# rounded down to a multiple of 100.
train_choices = min_train - min_train % 100
test_choices = min_test - min_test % 100

lang_short_train = {}
lang_short_test = {}

# Sampling stays interleaved (train, then test, language by language) so the
# sequence of draws taken from `random` is the same as before.
for lang in lang_train:
    lang_short_train[lang] = random.sample(lang_train[lang], train_choices)
    lang_short_test[lang] = random.sample(lang_test[lang], test_choices)

In [None]:
# Confirm that every language now has the same number of training/testing items.
for lang in lang_short_train:
    train_item = lang_short_train[lang]
    test_item = lang_short_test[lang]
    print(f'Language: {lang}, Training: {len(train_item)}, Testing: {len(test_item)}')

Language: de, Training: 2200, Testing: 600
Language: en, Training: 2200, Testing: 600
Language: es, Training: 2200, Testing: 600
Language: it, Training: 2200, Testing: 600


In [None]:
from langaugedetection.data.spectrogram import parse
from langaugedetection.data.tensor_construct import build_tensor

train_tensor = {}
test_tensor = {}

# For each language: run `parse` over the selected items, then stack the
# result into a single tensor with `build_tensor` (train first, then test).
for lang in lang_short_train:
    parsed_train = parse(lang_short_train[lang])
    train_tensor[lang] = build_tensor(parsed_train)

    parsed_test = parse(lang_short_test[lang])
    test_tensor[lang] = build_tensor(parsed_test)

In [None]:
# Sanity check: show the shape of each language's training tensor.
# `train` is only a loop variable here; a later cell rebinds it to the full dataset.
for lang in train_tensor:
    train = train_tensor[lang]
    print(f'{lang} tensor shape is: {train.shape}')

de tensor shape is: torch.Size([2200, 1, 1025, 216])
en tensor shape is: torch.Size([2200, 1, 1025, 216])
es tensor shape is: torch.Size([2200, 1, 1025, 216])
it tensor shape is: torch.Size([2200, 1, 1025, 216])


In [None]:
from sklearn.preprocessing import LabelEncoder
import torch

# Fit the encoder on the language codes so each language maps to an integer class.
encoder = LabelEncoder()
encoder.fit(list(train_tensor))

# One encoded label per sample: the language code repeated once for every
# row (first dimension) of that language's tensor.
train_label_array = {}
for lang, tensor in train_tensor.items():
    train_label_array[lang] = encoder.transform([lang] * tensor.shape[0])

test_label_array = {}
for lang, tensor in test_tensor.items():
    test_label_array[lang] = encoder.transform([lang] * tensor.shape[0])

print(f'Classes: {encoder.classes_}')

Classes: ['de' 'en' 'es' 'it']


In [None]:
from torch.utils.data import TensorDataset, ConcatDataset
from langaugedetection.data.tensor_construct import split_dataset

def _labelled_datasets(tensors, labels):
    '''
    Pairs each language's tensor with its label array as a TensorDataset.
    Both dictionaries are walked in their own order, position by position.
    '''
    return [
        TensorDataset(tensor, torch.tensor(label, dtype = torch.long))
        for tensor, label in zip(tensors.values(), labels.values())
    ]


train_datasets = _labelled_datasets(train_tensor, train_label_array)
test_datasets = _labelled_datasets(test_tensor, test_label_array)

# Chain the per-language datasets into one dataset per split.
train = ConcatDataset(train_datasets)
test = ConcatDataset(test_datasets)

print(f'Training values: {len(train)}')
print(f'Testing values: {len(test)}')

Training values: 8800
Testing values: 2400


In [None]:
from torch.utils.data import DataLoader

# Mini-batches of 64 training samples, drawn in shuffled order.
loader = DataLoader(dataset = train, batch_size = 64, shuffle = True)

In [None]:
import torch.nn as nn

class LanguageDetector(nn.Module):
    '''
    Small convolutional classifier for single-channel 2D inputs.

    Two 3x3 convolutions (1 -> 16 -> 32 channels), each followed by ReLU and
    2x2 max-pooling, feed three dense layers (256 -> 32 -> num_classes).
    Layer attribute names are unchanged, so previously saved state dicts
    still load.

    Parameters:
        num_classes: number of output scores. When omitted, falls back to
            len(languages), the language list defined earlier in the notebook.
        input_size: (height, width) of the inputs. Defaults to (1025, 216),
            the shape of the tensors built earlier in the notebook.

    Raises:
        ValueError: if input_size is too small to survive both pooling steps.
    '''
    def __init__(self, num_classes = None, input_size = (1025, 216)):
        super().__init__()

        # Keep the original behaviour (size taken from the notebook-level
        # `languages` list) unless the caller states the class count explicitly.
        if num_classes is None:
            num_classes = len(languages)

        # 2 Convolution Layers
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)

        # Neuron Layers
        # The convolutions keep height/width (3x3 kernel, padding 1); each
        # pooling step halves them, rounding down. For the default input:
        # Pool 1: (1025, 216) --> (512, 108)
        # Pool 2: (512, 108) --> (256, 54)
        # The first dense layer sees 32 feature maps of that final size.
        height, width = input_size
        pooled_height = height // 2 // 2
        pooled_width = width // 2 // 2
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(f'input_size {input_size} is too small for two 2x2 poolings')

        self.fc1 = nn.Linear(32 * pooled_height * pooled_width, 256)
        self.fc2 = nn.Linear(256, 32)
        self.fc3 = nn.Linear(32, num_classes)

        # Relu, Pool Function
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride = 2)


    def forward(self, x):
        '''
        Maps a batch of shape (batch, 1, height, width) to raw class scores
        of shape (batch, num_classes); no softmax is applied here.
        '''

        # 2D Convolution Application
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))

        # Flatten everything except the batch dimension
        x = x.flatten(start_dim=1)

        # Dense Layers
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)

        return x

# Build the (untrained) network; the optimizer and training loop in the
# following cells operate on this instance.
model = LanguageDetector()

In [None]:
# Cross-entropy over the raw class scores returned by the model.
criterion = nn.CrossEntropyLoss()

# Adam over every model parameter, learning rate 0.001.
optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-3)

In [None]:
# Train
model.train()
num_epochs = 16 # Should be around 25
total_loss = []  # mean training loss of each epoch, stored as plain floats

for i in range(num_epochs):

    epoch_loss = 0.0

    for x_batch, y_batch in loader:

        # Evaluate
        # The model already returns (batch, classes) scores, which is the layout
        # CrossEntropyLoss takes; the former .squeeze(1) did nothing for several
        # classes and would have dropped the class dimension for a single one.
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)

        # Update Model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float. Summing the loss tensors themselves
        # keeps autograd history for every batch and grows memory over the epoch.
        epoch_loss += loss.item()

    total_loss.append(epoch_loss / len(loader))

    print(f'Epoch : [{i+1} /{num_epochs}], loss {total_loss[-1]}')

Epoch : [1 /2], loss 1.436499834060669
Epoch : [2 /2], loss 1.3583611249923706


In [None]:
import os

# Everything below is written into the relative 'models' directory; create it
# first so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('models', exist_ok=True)

# Save output, Training Loss (one line per epoch)
# NOTE(review): epochs are numbered from 0 in this file but from 1 in the
# training printout — confirm which numbering is wanted.
with open('models/model2.txt', 'w') as f:
    for i, loss in enumerate(total_loss):
        f.write(f'Epoch {i}: {loss:.4f} loss\n')

# Save output, testing data: `test` is the concatenated testing dataset
# (inputs together with labels), so the whole dataset object is serialized.
torch.save(test, 'models/test2.pt')

# Save model: only the learned parameters (state_dict), not the class itself
torch.save(model.state_dict(), 'models/model2.pth')