In [1]:
import crepe
from scipy.io import wavfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def zeropadding(array, SamplePerWindow):
    if(array.size%SamplePerWindow != 0):
        left = array.size%SamplePerWindow
        add = np.zeros(SamplePerWindow - left)
        new_arr = np.append(array, add)
        return new_arr
    else:
        return array

In [3]:
import os
import python_speech_features

X = []
SamplePerWindow = 480 ## sample rate 48000/s 0.01sec/window
for dirname, _, filenames in os.walk('new_dataset/'):    ## Read data
    for filename in filenames:
        sr, audio = wavfile.read(os.path.join(dirname,filename))
        audio = zeropadding(audio, SamplePerWindow)
        ##Feature Extraction
        mfcc_speech = python_speech_features.mfcc(signal=audio, samplerate=sr, winlen=0.01, winstep=0.005,nfft=1024)
        newdata = []
        i=0
        while i<mfcc_speech.shape[0]:
            row = []
            if i == mfcc_speech.shape[0]-1:
                row.append(mfcc_speech[i])
                row = np.append(row, mfcc_speech[i])
            else:
                sub = mfcc_speech[i] - mfcc_speech[i+1]
                row.append(mfcc_speech[i])
                row = np.append(row, sub)
            newdata.append(row)
            i = i+2
#             newdata = np.array(newdata)
            
        X = np.append(X, newdata)
#         print(os.path.join(dirname, filename), 'MFCC data shape: ', newdata.shape) 
        
        
X = np.array(X)
length = int(X.size/26)
X = np.reshape(X, (length, 26))
print('Total Training datasize:',X.shape)

Total Training datasize: (166753, 26)


In [4]:
Y=[]

for dirname, _, filenames in os.walk('txtfile/'):  ##Read the Y labels for the data
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        data= pd.read_csv(os.path.join(dirname, filename))
        temp = np.array(data.frequency)
#         print(os.path.join(dirname, filename), 'Frames:', temp.size)
        Y = np.append(Y,temp)
print('Label file Y loaded')

minY = np.amin(Y)
for i in range(Y.size):
    Y[i] = Y[i] - minY
maxY = np.amax(Y)
print(minY, maxY)
Y = Y[0:Y.size-1]

print('Total labels:', Y.shape)
print('Total frames:', X.shape)

Label file Y loaded
31.0 1951.0
Total labels: (166753,)
Total frames: (166753, 26)


In [10]:
print('X data shape:',X.shape)
print('Y label shape:', Y.shape)
# for i in range(Y.size):
#     if Y[i] > 400:
#         Y[i] = 400
        
X_new = X[40000:160000]
Y_new = Y[40000:160000]

X data shape: (166753, 26)
Y label shape: (166753,)


In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X_new,Y_new,test_size=0.2)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
SamplePerFrame = 480

(96000, 26) (96000,)
(24000, 26) (24000,)


In [12]:
import tensorflow as tf

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(1024, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(2048, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(2048, activation=tf.nn.relu)) 
model.add(tf.keras.layers.Dense(1955, activation=tf.nn.softmax)) 

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.build((1,26))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (1, 26)                   0         
                                                                 
 dense_5 (Dense)             (1, 512)                  13824     
                                                                 
 dense_6 (Dense)             (1, 1024)                 525312    
                                                                 
 dense_7 (Dense)             (1, 2048)                 2099200   
                                                                 
 dense_8 (Dense)             (1, 2048)                 4196352   
                                                                 
 dense_9 (Dense)             (1, 1955)                 4005795   
                                                                 
Total params: 10,840,483
Trainable params: 10,840,483


In [13]:
model.fit(x_train, y_train, epochs=10) #15

model.save('MyModel')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: MyModel\assets


In [14]:
print('Testing data')
val_loss, val_acc = model.evaluate(x_test,y_test)

Testing data


In [15]:
sr, audio = wavfile.read('test_dataset/test.wav')

new_audio = zeropadding(audio, SamplePerWindow)

mfcc_speech = python_speech_features.mfcc(signal=audio, samplerate=sr, winlen=0.01, winstep=0.005,nfft=1024)
newdata = []
i=0
while i<mfcc_speech.shape[0]:
    row = []
    if i == mfcc_speech.shape[0]-1:
        row.append(mfcc_speech[i])
        row = np.append(row, mfcc_speech[i])
    else:
        sub = mfcc_speech[i] - mfcc_speech[i+1]
        row.append(mfcc_speech[i])
        row = np.append(row, sub)
    newdata.append(row)
    i = i+2

new_audio = np.array(newdata)

print(audio.size, new_audio.shape)
dim = new_audio.shape[0]

test_output = model.predict(new_audio)
print('predict output:',test_output.shape)
output = []


for i in range(dim):
    #print('max:', np.argmax(test_output[i][1:400]))
    result = np.where(test_output[i][50:500] == np.amax(test_output[i][80:500]))   
    result = result
    #print(i, result)
    output = np.append(output, result) 
    
file1 = open("pitch_demo\MFCC_MyModel_test.csv","w")
for i in range(dim):
    file1.write(str(output[i]))
    file1.write('\n')
file1.close()
    
print(output.shape)

111718 (233, 26)
predict output: (233, 1955)
(233,)


In [16]:
sr, audio = wavfile.read('test_dataset/test1.wav')

new_audio = zeropadding(audio, SamplePerWindow)

mfcc_speech = python_speech_features.mfcc(signal=audio, samplerate=sr, winlen=0.01, winstep=0.005,nfft=1024)
newdata = []
i=0
while i<mfcc_speech.shape[0]:
    row = []
    if i == mfcc_speech.shape[0]-1:
        row.append(mfcc_speech[i])
        row = np.append(row, mfcc_speech[i])
    else:
        sub = mfcc_speech[i] - mfcc_speech[i+1]
        row.append(mfcc_speech[i])
        row = np.append(row, sub)
    newdata.append(row)
    i = i+2

new_audio = np.array(newdata)

print(audio.size, new_audio.shape)
dim = new_audio.shape[0]

test_output = model.predict(new_audio)
print('predict output:',test_output.shape)
output = []


for i in range(dim):
    #print('max:', np.argmax(test_output[i][1:400]))
    result = np.where(test_output[i][50:500] == np.amax(test_output[i][80:500]))   
    result = result
    #print(i, result)
    output = np.append(output, result) 
    
file1 = open("pitch_demo\MFCC_MyModel_test1.csv","w")
for i in range(dim):
    file1.write(str(output[i]))
    file1.write('\n')
file1.close()
    
print(output.shape)

101393 (211, 26)
predict output: (211, 1955)
(211,)
