In [6]:
import sys
import os
import subprocess
import numpy as np
import pandas as pd
import glob
from collections import OrderedDict
import random
import torch
import torch.nn as nn

#Clone and import CNNWordReco repository
#Clone only when the checkout is missing. The test must use `not`: `~` is bitwise NOT,
#and both ~True (-2) and ~False (-1) are truthy, so `~os.path.isdir(...)` would trigger
#a clone on every execution of this cell.
if not os.path.isdir('CNNWordReco'):
    subprocess.call(['git', 'clone', 'https://github.com/saztorralba/CNNWordReco'])
#Make the checkout importable; the guard keeps sys.path free of duplicates on re-runs
if 'CNNWordReco' not in sys.path:
    sys.path.append('CNNWordReco')
from utils.cnn_func import load_data, train_model, validate_model, test_model
from models.SimpleCNN import SimpleCNN
from test_wordreco import show_matrix


KeyboardInterrupt: 

In [3]:
#Arguments
#Single configuration dictionary; it is unpacked (**args) into the CNNWordReco helpers
#and the model constructor in the cells below.
args = {
    'cv_percentage': 0.1,
    'xsize': 20,
    'ysize': 20,
    'num_blocks': 10,
    'channels': 32,
    'dropout': 0.3,
    'embedding_size': 128,
    'epochs': 20,
    'batch_size': 32,
    'learning_rate': 0.001,
    'seed': 0,
    'device': 'cpu',
    'verbose': 1,
    'augment': False,
    'vocab': OrderedDict({'ZERO': 0, 'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4, 'FIVE': 5, 'SIX': 6, 'SEVEN': 7, 'EIGHT': 8, 'NINE': 9})
}

#Initialise the random seeds
random.seed(args['seed'])
#NumPy keeps its own global generator, so it is seeded alongside random and torch
np.random.seed(args['seed'])
torch.manual_seed(args['seed'])
torch.cuda.manual_seed(args['seed'])
torch.backends.cudnn.deterministic = True

def parse_recording_name(path, vocab_words):
    """Split a recording path named '<digit>_<speaker>_<index>.wav' into its fields.

    Args:
        path: Path (or bare file name) of a recording.
        vocab_words: Sequence of words, indexed by the leading digit of the file name.

    Returns:
        Tuple (speaker, word, rec_number), where rec_number is an int.

    Raises:
        IndexError: If the name has fewer than three '_'-separated fields or the
            digit is outside vocab_words.
        ValueError: If the digit or index field is not an integer.
    """
    #os.path.basename handles whichever separator the platform's glob produced
    fields = os.path.basename(path).split('_')
    #Third field still carries the extension ('32.wav'); keep only the part before the first dot
    return fields[1], vocab_words[int(fields[0])], int(fields[2].split('.')[0])

#Read data and store in dataframe
#glob returns files in arbitrary order; sorting makes the row order (and everything
#derived from it) identical from one run to the next
wavfiles = sorted(glob.glob('/kaggle/input/free-spoken-digit-dataset-fsdd/recordings/*.wav'))
#Words in vocabulary order, built once instead of once per file
vocab_words = list(args['vocab'])
parsed_names = [parse_recording_name(wavfile, vocab_words) for wavfile in wavfiles]
speakers = [speaker for speaker, _, _ in parsed_names]
words = [word for _, word, _ in parsed_names]
rec_number = [number for _, _, number in parsed_names]
data = pd.DataFrame({'wavfile':wavfiles,'speaker':speakers,'word':words,'rec_number':rec_number})


In [5]:
#Train with the split defined in https://github.com/Jakobovski/free-spoken-digit-dataset/:
#recordings [5-49] go to training, recordings [0-4] are kept for testing
print('Training model with recordings [5-49] from all speakers')
#Partition the dataframe by recording number
in_train_range = data['rec_number']>=5
in_test_range = data['rec_number']<5
train_data = data[in_train_range].reset_index(drop=True)
test_data = data[in_test_range].reset_index(drop=True)
#Called with True, load_data yields the training and validation tensors plus their labels
trainset, validset, trainlabels, validlabels = load_data(train_data,True,**args)
#Record the training-set statistics in args so every later **args call receives them
train_values = trainset.float()
args['mean'] = train_values.mean()
args['std'] = train_values.std()
#Build the model on the configured device, together with its optimiser and loss
model = SimpleCNN(**args)
model = model.to(args['device'])
optimizer = torch.optim.Adam(model.parameters(),lr=args['learning_rate'])
criterion = nn.NLLLoss(reduction='mean').to(args['device'])
num_epochs = args['epochs']
for ep in range(1,num_epochs+1):
    #One backpropagation pass over the training set, then one validation pass
    loss = train_model(trainset,trainlabels,model,optimizer,criterion,**args)
    acc = validate_model(validset,validlabels,model,**args)
    print('Epoch {0:d} of {1:d}. Training loss: {2:.2f}, cross-validation accuracy: {3:.2f}%'.format(ep,num_epochs,loss,acc))

Training model with recordings [5-49] from all speakers


NameError: name 'load_data' is not defined

In [None]:
#Load the held-out recordings (load_data called with False returns data and labels only)
#and run the trained model on them to obtain the confusion matrix
testset, testlabels = load_data(test_data, False, **args)
conf_matrix = test_model(testset,testlabels,model,**args)

#Present results
#Masking with the identity keeps only the diagonal entries of the confusion matrix;
#their share of all entries is reported as the accuracy
diagonal_mask = np.eye(len(args['vocab']))
diagonal_total = np.sum(conf_matrix*diagonal_mask)
overall_total = np.sum(conf_matrix)
print('Accuracy: {0:.2f}%'.format(100*diagonal_total/overall_total))
show_matrix(conf_matrix, **args)