In [1]:
import os
import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc

import torch
import torch.nn as nn
from torch.utils.data import DataLoader


In [2]:
# Every entry of data/ whose name contains no "." is treated as one word class;
# the position of a word in word_list is later used as its integer class label.
# The listing is sorted because os.listdir() returns entries in arbitrary order
# and the label assignment must not depend on the filesystem.
entries = sorted(os.listdir("data/"))
print(entries)

# Build a filtered copy instead of calling remove() inside a loop over the same
# list: removing while iterating shifts the remaining items left, so the entry
# right after each removed one is never examined and can slip through.
word_list = [w for w in entries if "." not in w]

print(word_list)

['.DS_Store', 'apple', 'banana', 'kiwi', 'lime', 'orange', 'peach', 'pineapple', 'README.md']
['apple', 'banana', 'kiwi', 'lime', 'orange', 'peach', 'pineapple']


In [3]:
# Use the GPU when one is available; every tensor below is created on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One (features, label) pair per recording. The label is the index of the word
# in word_list.
data = []

for i, word in enumerate(word_list):
    # Shape (1,) class-index tensor, shared by all samples of this word.
    target = torch.tensor([i], dtype=torch.long, device=device)
    dir_ = os.path.join("data", word)
    # Sorted so that the sample order is identical on every run / filesystem.
    for file in sorted(os.listdir(dir_)):
        # Test the extension itself: a substring search for ".wav" would also
        # accept names such as "clip.wav.bak".
        if not file.endswith(".wav"):
            continue
        filedir = os.path.join(dir_, file)
        freq, signal = wavfile.read(filedir)
        # MFCC feature matrix for the clip (the recorded output shows
        # 13 coefficients per frame for the first file).
        seq = mfcc(signal, freq)
        # Leading axis of size 1 acts as the batch dimension for the LSTM.
        tensor_x = torch.tensor(seq, dtype=torch.float, device=device).unsqueeze(0)
        data.append((tensor_x, target))

print(len(data))
print(data[0][0].shape)

    

105
torch.Size([1, 33, 13])


In [4]:
class NN(nn.Module):
    """LSTM sequence classifier: one LSTM layer followed by a linear head.

    Args:
        input_size: number of features per time step (default 13).
        hidden_size: size of the LSTM hidden state (default 20).
        num_classes: number of output logits (default 7).

    The defaults reproduce the original fixed architecture, so ``NN()`` builds
    exactly the same model as before.
    """

    def __init__(self, input_size=13, hidden_size=20, num_classes=7):
        super().__init__()
        # Attribute names `rnn` and `f` are kept so state_dict keys are unchanged.
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.f = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: float tensor of shape (batch, time, input_size).
        """
        # The LSTM also returns its final (h, c) state, which is not needed here.
        x, _ = self.rnn(x)
        # Classify from the output at the last time step only.
        x = x[:, -1, :]
        x = self.f(x)
        return x

# Seed before the model is created so that the weight initialisation and the
# per-epoch shuffles below are the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

F = NN().to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(F.parameters(), lr = 0.0005)
epoch = 30

for e in range(epoch) :
    F.train()
    loss_sum = 0
    # `data` is grouped by word, and each sample is its own batch of size 1.
    # Visit the samples in a fresh random order every epoch so the optimizer
    # does not see long runs of a single class.
    for idx in torch.randperm(len(data)).tolist() :
        x, t = data[idx]

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        y = F(x)

        loss = loss_function(y, t)
        loss_sum += loss.item()

        loss.backward()
        optimizer.step()
    # Mean per-sample loss over the epoch.
    loss_sum /= len(data)

    print(f"epoch {e+1} | loss {loss_sum}")

    

epoch 1 | loss 2.0029870124090285
epoch 2 | loss 1.8198511725380306
epoch 3 | loss 1.6447559038798014
epoch 4 | loss 1.4747316281000773
epoch 5 | loss 1.2947787687892005
epoch 6 | loss 1.1292818818773542
epoch 7 | loss 0.9831137895584107
epoch 8 | loss 0.8611260998816718
epoch 9 | loss 0.763541670356478
epoch 10 | loss 0.6470174040113177
epoch 11 | loss 0.578569879985991
epoch 12 | loss 0.4785529958350318
epoch 13 | loss 0.4243104487657547
epoch 14 | loss 0.37913066915103366
epoch 15 | loss 0.33824644549971533
epoch 16 | loss 0.30302230765422183
epoch 17 | loss 0.2723846958506675
epoch 18 | loss 0.24420659400167918
epoch 19 | loss 0.227638094269094
epoch 20 | loss 0.20532201762710298
epoch 21 | loss 0.1795580438914753
epoch 22 | loss 0.15924676564477738
epoch 23 | loss 0.14863436846506028
epoch 24 | loss 0.1353179300824801
epoch 25 | loss 0.12332987685998281
epoch 26 | loss 0.1139418475329876
epoch 27 | loss 0.10474772694565002
epoch 28 | loss 0.09726871447194191
epoch 29 | loss 0.0888

In [5]:
# NOTE: this iterates over the same `data` list the model was trained on, so
# the printed value is training accuracy, not a measure of generalisation.
correct = 0
F.eval()
# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad() :
    for x, t in data :
        y = F(x)
        # Each sample is a batch of one, so argmax over the logits is the predicted class.
        if y.argmax().item() == t.item() :
            correct += 1
acc = correct / len(data)
print(acc)

1.0


In [6]:

# Save a CPU copy of the whole module so the file can be loaded without a GPU.
# NOTE(review): saving the module object (rather than its state_dict) pickles a
# reference to the NN class, so loading word.pt requires NN to be importable.
torch.save(F.to("cpu"), "word.pt")
# nn.Module.to() moves the module in place, so F is now on the CPU. Move it back
# to `device` so earlier cells (whose tensors live on `device`) can be re-run.
F = F.to(device)