In [15]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

In [18]:
#DataFlair - Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel, zcr):
        with soundfile.SoundFile(file_name) as sound_file:
            X = sound_file.read(dtype="float32")
            sample_rate=sound_file.samplerate
            if chroma:
                stft=np.abs(librosa.stft(X))
            result=np.array([])
            #Chromagram
            if chroma:
                chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
                result=np.hstack((result, chroma))
            #Mel Frequency Cepstral Coefficents
            if mfcc:
                mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
                result=np.hstack((result, mfccs))
            #Zero Crossing Point
            if zcr:
                mfccs=np.mean(librosa.feature.zero_crossing_rate(y=X).T, axis=0)
                result=np.hstack((result, mfccs))
            #Mel Frequency
            if mel:
                mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
                result=np.hstack((result, mel))
        return result



In [3]:
#DataFlair - Emotions in the RAVDESS dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
#DataFlair - Emotions to observe
observed_emotions=['neutral', 'happy', 'angry']

In [4]:
def load_data(test_size=0.25):
    x,y=[],[]
    for file in glob.glob("./ravdess data/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True, zcr = True)
        x.append(feature)
        y.append(emotion)
    
    return train_test_split(np.array(x), y, test_size=test_size, random_state=45)

In [5]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

In [6]:
#DataFlair - Get the shape of the training and testing datasets
print(x_test.shape)

(120, 181)


In [7]:
print(x_train)

[[6.18730510e-01 6.74243378e-01 7.32388280e-01 ... 9.88873566e-05
  7.44776134e-05 2.91970132e-05]
 [6.11271409e-01 5.46938598e-01 5.46428929e-01 ... 6.05488833e-05
  7.67448184e-05 3.31462757e-05]
 [6.18966405e-01 5.60614117e-01 5.16037408e-01 ... 1.34777307e-03
  5.97548923e-04 2.67410641e-04]
 ...
 [7.27984544e-01 7.65102161e-01 6.37286177e-01 ... 2.28287767e-03
  1.38554941e-03 5.29100683e-04]
 [5.30477191e-01 5.48714752e-01 5.70941440e-01 ... 5.44573887e-03
  2.44326007e-03 2.28718499e-03]
 [6.18425093e-01 6.80970422e-01 6.38472150e-01 ... 9.06038853e-04
  4.81843252e-04 2.89588680e-04]]


In [8]:
#DataFlair - Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 181


In [9]:
#DataFlair - Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(500,), learning_rate='adaptive', max_iter=500)

In [10]:
#DataFlair - Train the model
model.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(500,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
#DataFlair - Predict for the test set
y_pred=model.predict(x_test)

In [12]:
#DataFlair - Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
#DataFlair - Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 86.67%


In [13]:
result = pd.DataFrame({"actual":y_test, "predict":y_pred})

In [14]:
result

Unnamed: 0,actual,predict
0,happy,happy
1,angry,angry
2,angry,angry
3,happy,happy
4,angry,angry
...,...,...
115,angry,angry
116,neutral,neutral
117,happy,happy
118,angry,angry


In [3]:
import pyaudio
import wave
import wave
import struct
import sys
import numpy 

CHUNK = 4096
FORMAT = pyaudio.paInt32
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 4
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)

print("* recording")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    decoded = numpy.frombuffer(data, dtype = 'Float32');
    frames.append(decoded)

print("* done recording")


stream.stop_stream()
stream.close()
p.terminate()

wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))

wf.close()

* recording




* done recording


In [4]:
import matplotlib.pyplot as plt
import numpy as np
import wave
import sys
from pydub import AudioSegment

sound = AudioSegment.from_wav("./output.wav")
sound = sound.set_channels(1)
sound.export("./output_mono.wav", format="wav")

record =[]
for file in glob.glob("./398059000062861.wav"):
    feature=extract_feature(file, mfcc=True, chroma=True, mel=True, zcr =True)
    record.append(feature)
record = np.array(record)




NameError: name 'glob' is not defined

In [5]:
import emoji
prediction = model.predict(record)
if prediction[0] == 'happy':
    emo = emoji.emojize(':smiley:', use_aliases=True)
elif prediction[0] == 'angry':
    emo = emoji.emojize(':rage:', use_aliases=True)
else:
    emo = emoji.emojize(':neutral_face:', use_aliases=True)
print (prediction[0], emo)


NameError: name 'model' is not defined

In [1]:
import speech_recognition as sr
r = sr.Recognizer()

with sr.AudioFile("./output.wav") as source:
    audio = r.record(source)
    s = r.recognize_google(audio)
    print("Text: "+s)


RequestError: recognition connection failed: [Errno 11003] getaddrinfo failed