## Spoken Language Analysis

An exercise in speech audio analysis: classifying the spoken language of speech recordings.

Data Source: https://www.kaggle.com/toponowicz/spoken-language-identification

Reference: https://github.com/micah5/pyAudioClassification

Install libraries by using ```pip install <library_name>```

Libraries:
- keras
- numpy
- ffmpeg
- librosa
- soundfile
- tqdm
- tensorflow
- pyaudioclassification

### Load Libraries

In [1]:
import os
import glob
import time
import shutil
import socket
import librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
from pyaudioclassification import train, predict

Using TensorFlow backend.


In [2]:
def extract_feature(file_name):
    """Compute five time-averaged feature vectors for one audio file.

    The file is read with ``soundfile`` as float32. If the decoded data has
    more than one dimension, only column 0 (the first channel in soundfile's
    frames-by-channels layout) is analysed.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        Tuple ``(mfccs, chroma, mel, contrast, tonnetz)``. Each element is the
        mean over frames of the corresponding librosa feature; ``mfccs`` has
        40 coefficients (``n_mfcc=40``).
    """
    X, sample_rate = sf.read(file_name, dtype='float32')
    if X.ndim > 1:
        # Keep the first channel as a contiguous 1-D signal.
        X = np.ascontiguousarray(X[:, 0])

    # Magnitude spectrogram shared by the chroma and contrast features.
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # The signal is passed as ``y=``: recent librosa releases accept the audio
    # argument by keyword only, and older releases accept the keyword as well.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz

In [3]:
def parse_audio_files(parent_dir, sub_dirs, file_ext=None, verbose=True):
    """Extract a feature row and an integer label for every matching audio file.

    Args:
        parent_dir: Directory that contains the class sub-directories.
        sub_dirs: Sequence of sub-directory names. The position of a name in
            this sequence is the integer label given to its files.
        file_ext: Glob pattern for the files to read (for example ``'*.wav'``).
            Defaults to ``'*.flac'`` when ``None``.
        verbose: When true, print the glob pattern being read for each
            sub-directory that contains matching files.

    Returns:
        Tuple ``(features, labels)``: ``features`` is a float array with 193
        columns (the hard-coded feature-vector length) and one row per file
        whose features could be extracted; ``labels`` is the matching 1-D
        integer array. Files for which ``get_ext_features`` does not return an
        array are skipped.
    """
    # Lists have no ``push``; wrap a caller-supplied pattern directly.
    file_types = ['*.flac'] if file_ext is None else [file_ext]

    rows, labels = [], []
    for label, sub_dir in enumerate(sub_dirs):
        for pattern in file_types:
            search = os.path.join(parent_dir, sub_dir, pattern)
            file_names = glob.glob(search)
            if not file_names:
                continue
            if verbose: print('Reading', search, '...')
            for fn in tqdm(file_names):
                ext_features = get_ext_features(fn)
                if isinstance(ext_features, np.ndarray):
                    rows.append(ext_features)
                    labels.append(label)

    # Stack once at the end instead of re-allocating the matrix per file. The
    # empty (0, 193) seed keeps the float64 dtype, the 193-column check and
    # the empty-result shape of the incremental version.
    features = np.vstack([np.empty((0, 193))] + rows)
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases.
    return features, np.array(labels, dtype=int)

In [4]:
def feature_extraction(data_path ,verbose=False):
    """Extract features and labels for every class folder under ``data_path``.

    Every entry of ``data_path`` is treated as a class folder; entries are
    sorted by name and an entry's position in that sorted listing becomes its
    integer label.

    Args:
        data_path: Directory whose entries are the class folders.
        verbose: Forwarded to ``parse_audio_files``; when true, the glob
            pattern being read is printed for each folder.

    Returns:
        Tuple ``(features, labels)`` as returned by ``parse_audio_files``.
    """
    class_dirs = sorted(os.listdir(data_path))
    # Forward ``verbose`` so the argument actually controls the output.
    return parse_audio_files(data_path, class_dirs, verbose=verbose)

In [5]:
def get_ext_features(fn):
    """Return one flat feature vector for an audio file, or ``None`` on failure.

    The five vectors produced by ``extract_feature`` are concatenated in the
    order mfccs, chroma, mel, contrast, tonnetz. Any exception raised while
    extracting or stacking is reported on stdout and turned into ``None`` so
    that one unreadable file does not abort a whole batch.

    Args:
        fn: Path of the audio file.

    Returns:
        1-D ``numpy.ndarray`` of concatenated features, or ``None``.
    """
    try:
        mfcc_vec, chroma_vec, mel_vec, contrast_vec, tonnetz_vec = extract_feature(fn)
        stacked = np.hstack((mfcc_vec, chroma_vec, mel_vec, contrast_vec, tonnetz_vec))
    except Exception as err:
        print("[Error] extract feature error. %s" % (err))
        return None
    return stacked

In [6]:
def print_leaderboard(pred, data_path):
    """Print the classes ranked from highest to lowest predicted score.

    Args:
        pred: 2-D array of scores; only row 0 is ranked and printed.
        data_path: Directory whose sorted entry names are used as class names.

    Each printed line shows the rank, the class name, the score as a rounded
    percentage and the score's column index in ``pred``.
    """
    names = sorted(os.listdir(data_path))
    # Column indices of row 0, ordered by descending score.
    ranking = (-pred).argsort()[0]
    for rank, index in enumerate(ranking, start=1):
        # NOTE(review): ``index + 1`` skips the first sorted entry of
        # data_path. If the listing has no extra leading entry this names the
        # wrong class and raises IndexError for the last column - confirm
        # against how train()/predict() number the classes.
        print('%d.' % rank, names[index + 1], str(round(pred[0][index]*100)) + '%', '(index %s)' % index)

### Pre-processing

#### Create one folder per language class for the training data

In [7]:
# Dataset locations. Both end with a separator because later cells build file
# paths from them by plain string concatenation.
train_path = "./data/train/"
test_path = "./data/test/"

# One class folder per language, named after the two-letter filename prefix.
de_path, en_path, es_path = (train_path + lang for lang in ("de", "en", "es"))

In [8]:
# Create the "de" class folder. An existing folder is expected on re-runs;
# any other failure is reported with its cause instead of being folded into
# the same message, because later cells move files into this folder.
try:
    os.mkdir(de_path)
except FileExistsError:
    print ("Creation of the directory %s failed or already existed." % de_path)
except OSError as err:
    print ("Creation of the directory %s failed: %s" % (de_path, err))
else:
    print ("Successfully created the directory %s " % de_path)

Creation of the directory ./data/train/de failed or already existed.


In [9]:
# Create the "en" class folder. An existing folder is expected on re-runs;
# any other failure is reported with its cause instead of being folded into
# the same message, because later cells move files into this folder.
try:
    os.mkdir(en_path)
except FileExistsError:
    print ("Creation of the directory %s failed or already existed." % en_path)
except OSError as err:
    print ("Creation of the directory %s failed: %s" % (en_path, err))
else:
    print ("Successfully created the directory %s " % en_path)

Creation of the directory ./data/train/en failed or already existed.


In [10]:
# Create the "es" class folder. An existing folder is expected on re-runs;
# any other failure is reported with its cause instead of being folded into
# the same message, because later cells move files into this folder.
try:
    os.mkdir(es_path)
except FileExistsError:
    print ("Creation of the directory %s failed or already existed." % es_path)
except OSError as err:
    print ("Creation of the directory %s failed: %s" % (es_path, err))
else:
    print ("Successfully created the directory %s " % es_path)

Creation of the directory ./data/train/es failed or already existed.


#### Move each audio file into its language folder, based on the filename prefix

In [None]:
start_time = time.time()

# Target number of files per language folder.
sample_size = 1800
class_dirs = {"de": de_path, "en": en_path, "es": es_path}

# Start each tally from what is already in the class folder, so re-running
# this cell tops the folders up to sample_size instead of moving another batch.
counts = {}
for lang, class_dir in class_dirs.items():
    # shutil.move() renames the file to the destination path when the
    # destination is not an existing directory, so make sure it exists.
    os.makedirs(class_dir, exist_ok=True)
    counts[lang] = len(os.listdir(class_dir))

for file in os.listdir(train_path):
    # Stop scanning as soon as every language folder is full.
    if all(count >= sample_size for count in counts.values()):
        break
    path = os.path.join(train_path, file)
    if os.path.isdir(path):
        continue
    # The first two characters of the filename are the language code.
    lang = file[0:2]
    if lang in counts and counts[lang] < sample_size:
        shutil.move(path, class_dirs[lang])
        counts[lang] += 1

# Per-language totals: files now present in each class folder.
de_count, en_count, es_count = counts["de"], counts["en"], counts["es"]

print("Process time: " + str(round((time.time()-start_time),2)) + " secs")

#### Remove the remaining files

In [None]:
start_time = time.time()

# Delete every non-directory entry left directly under train_path; the class
# sub-folders and their contents are kept.
for file in os.listdir(train_path):
    path = os.path.join(train_path, file)
    if not os.path.isdir(path):
        # Remove exactly the path that was checked, rather than rebuilding it
        # by string concatenation (which relies on a trailing separator).
        os.remove(path)

print("Process time: " + str(round((time.time()-start_time),2)) + " secs")

#### Feature Extraction

In [11]:
# Extract features for every class folder under train_path and time the run.
# This is the slow step: the recorded run took 4581.86 secs.
start_time = time.time()
features, labels = feature_extraction(train_path)
elapsed = round(time.time() - start_time, 2)
print("Process time: " + str(elapsed) + " secs")

Reading ./data/train/de\*.flac ...


100%|██████████████████████████████████████████████████████████████████████████████| 1800/1800 [26:13<00:00,  1.15it/s]


Reading ./data/train/en\*.flac ...


100%|██████████████████████████████████████████████████████████████████████████████| 1800/1800 [24:58<00:00,  1.20it/s]


Reading ./data/train/es\*.flac ...


100%|██████████████████████████████████████████████████████████████████████████████| 1800/1800 [25:09<00:00,  1.19it/s]


Process time: 4581.86 secs


#### Save Features

In [23]:
# Feature extraction is slow, so the arrays can be cached on disk.
# Uncomment to save them after extraction:
# np.save('language_feat.npy', features)
# np.save('language_label.npy', labels)
# Uncomment to load them in a later session instead of re-extracting:
# features, labels = np.load('language_feat.npy'), np.load('language_label.npy')

### Training

In [24]:
# Train the classifier on the extracted features and time the run.
# NOTE(review): the recorded output of this cell is a ValueError raised inside
# train() ("expected ... shape (2,) but got array with shape (1,)").
# Presumably train() derives the number of classes differently from the
# 0-based labels produced by parse_audio_files - confirm against
# pyaudioclassification before relying on this cell.
start_time = time.time()
model = train(features, labels)
elapsed = round(time.time() - start_time, 2)
print("Process time: " + str(elapsed) + " secs")

ValueError: Error when checking target: expected activation_14 to have shape (2,) but got array with shape (1,)

#### Save Models

In [None]:
# from keras.models import load_model

# model.save('language_model.h5')
# model = load_model('language_model.h5')

### Predict

In [None]:
# Score a single "de"-prefixed recording from the test folder.
sample_file = test_path + "de_f_63f5b79c76cf5a1a4bbd1c40f54b166e.fragment87.flac"
pred = predict(model, sample_file)

In [None]:
# Print the classes ranked by predicted score; class names are taken from the
# sorted entries of train_path.
print_leaderboard(pred, train_path)