5) Classify - combines the solutions from parts 1-4 to classify the customer's sentiment and to decide which agent the customer should be assigned to

In [1]:
import nbformat
from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell

def import_from_notebook(notebook_path, function_name):
    """Execute another notebook's code cells in this kernel and return one name from it.

    Every code cell of ``notebook_path`` is run inside the active IPython
    shell's user namespace, so all top-level statements of that notebook
    (prints, model loading, demo calls, ...) execute as a side effect and
    every name it defines becomes visible in this notebook as well.

    Parameters
    ----------
    notebook_path : str
        Path to the ``.ipynb`` file to execute.
    function_name : str
        Name to look up in the namespace after execution.

    Returns
    -------
    object
        Whatever ``function_name`` is bound to after the cells have run.

    Raises
    ------
    KeyError
        If no executed cell defined ``function_name``.
    """
    with open(notebook_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)

    shell = InteractiveShell.instance()

    # Cell sources may contain IPython-only syntax (%magics, !shell commands)
    # that plain exec() rejects with a SyntaxError; let IPython translate each
    # cell into ordinary Python first.
    to_python = shell.input_transformer_manager.transform_cell
    code = "\n".join(
        to_python(cell.source) for cell in nb.cells if cell.cell_type == 'code'
    )
    exec(code, shell.user_ns)

    try:
        return shell.user_ns[function_name]
    except KeyError:
        # Same exception type as a plain dict lookup, but say what was missing.
        raise KeyError(
            f"{function_name!r} was not defined by any code cell of {notebook_path!r}"
        ) from None

In [2]:
# Pull `is_caller_angry` out of the text-analysis notebook. import_from_notebook
# executes every code cell of that notebook in this kernel, so any top-level
# code it contains also runs here and its output appears below this cell.
is_caller_angry = import_from_notebook('Text Semantic Analysis.ipynb', 'is_caller_angry')

pygame 2.5.2 (SDL 2.28.3, Python 3.11.7)
Hello from the pygame community. https://www.pygame.org/contribute.html



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transcribing. Please wait...


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Transcript: {'text': " I'm dissatisfied with the technician who helped me yesterday. I'm unhappy with your service. I actually have a problem with the way you handled my request. This is unacceptable. I would like to speak with your supervisor. Fuck you! I hate you for treating me like shit!"}
User is angry.
Transcribing. Please wait...
Transcript: {'text': " I'm dissatisfied with the technician who helped me yesterday. I'm unhappy with your service. I actually have a problem with the way you handled my request. This is unacceptable. I would like to speak with your supervisor. Fuck you! I hate you for treating me like shit!"}
User is angry.


In [3]:
# Import necessary libraries
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import load_model
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load a fixed-length excerpt of a single audio file
def load_audio_file(file_path, duration=2.5, offset=0.6):
    """Load part of one audio file with librosa.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.
    duration : float, optional
        Passed to ``librosa.load`` as ``duration``. Defaults to 2.5, the
        value this notebook has always used.
    offset : float, optional
        Passed to ``librosa.load`` as ``offset``. Defaults to 0.6, the value
        this notebook has always used.

    Returns
    -------
    tuple
        ``(data, sampling_rate)`` exactly as returned by ``librosa.load``.

    Notes
    -----
    NOTE(review): the defaults presumably mirror the excerpt used when the
    emotion model was trained; confirm before overriding them for inference.
    """
    data, sampling_rate = librosa.load(file_path, duration=duration, offset=offset)
    return data, sampling_rate

# Build the flat feature vector for one audio signal
def extract_features(data, sample_rate):
    """Concatenate five averaged librosa descriptors into one 1-D array.

    Each descriptor is transposed and averaged along axis 0, i.e. along the
    last axis of the librosa output (presumably the time-frame axis), and the
    results are joined in this order: zero-crossing rate, chroma STFT, MFCC,
    RMS, mel spectrogram.

    Parameters
    ----------
    data : array-like
        Audio samples, as returned by ``librosa.load``.
    sample_rate : int
        Sampling rate belonging to ``data``.

    Returns
    -------
    numpy.ndarray
        1-D array holding all averaged descriptors back to back.
    """
    def _average(frames):
        # Collapse the last axis of a librosa feature matrix to its mean.
        return np.mean(frames.T, axis=0)

    zero_crossings = _average(librosa.feature.zero_crossing_rate(y=data))

    # Chroma is derived from the magnitude of the short-time Fourier transform.
    magnitude = np.abs(librosa.stft(data))
    chroma = _average(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    cepstral = _average(librosa.feature.mfcc(y=data, sr=sample_rate))
    energy = _average(librosa.feature.rms(y=data))
    mel_bands = _average(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty array keeps the result dtype identical to stacking
    # the pieces one at a time onto an initially empty vector.
    return np.hstack((np.array([]), zero_crossings, chroma, cepstral, energy, mel_bands))

# Scale one feature vector and give it the trailing axis the model input uses
def preprocess_data(features, scaler):
    """Standardise a single feature vector and append a length-1 axis.

    Parameters
    ----------
    features : numpy.ndarray
        Feature values for one sample; flattened to a single row before scaling.
    scaler : object
        Any fitted object exposing ``transform`` (a ``StandardScaler`` in
        this notebook).

    Returns
    -------
    numpy.ndarray
        The scaled row with a new axis inserted at position 2.
    """
    single_row = features.reshape(1, -1)
    scaled_row = scaler.transform(single_row)
    return np.expand_dims(scaled_row, axis=2)

In [4]:
def get_emotion(file_path, model_path='emotion_recognition_model.h5', features_path='features.csv'):
    """Predict the emotion label of one audio file with the saved Keras model.

    Parameters
    ----------
    file_path : str
        Audio file to classify.
    model_path : str, optional
        Saved Keras model to load. Defaults to the file this notebook has
        always used.
    features_path : str, optional
        CSV with the training features; every column except the last is
        treated as a feature and the ``labels`` column supplies the class
        names. Defaults to the file this notebook has always used.

    Returns
    -------
    str
        The category at the arg-max position of the model's prediction.

    Notes
    -----
    NOTE(review): the scaler and the label order are rebuilt from the CSV on
    every call rather than loaded from training artefacts. This reproduces
    training only if the scaler was fitted on exactly these rows and the
    model's output order equals the encoder's sorted categories; confirm.
    """
    # Load the saved model
    model = load_model(model_path)

    # Read the training table once; it feeds both the scaler and the label order.
    training_table = pd.read_csv(features_path)

    # Re-create the scaler used for training (fit only; the transformed
    # training matrix itself is not needed here).
    scaler = StandardScaler()
    scaler.fit(training_table.iloc[:, :-1].to_numpy())

    # Re-create the label encoder used for training
    label_encoder = OneHotEncoder()
    label_encoder.fit(training_table['labels'].to_numpy().reshape(-1, 1))

    # Load and preprocess the audio file
    data, sampling_rate = load_audio_file(file_path)
    features = extract_features(data, sampling_rate)
    processed_features = preprocess_data(features, scaler)

    # Make a prediction
    prediction = model.predict(processed_features)
    predicted_label = np.argmax(prediction, axis=1)

    # Map the predicted index to the corresponding emotion name
    return label_encoder.categories_[0][predicted_label[0]]


In [5]:
# Routing message for every accent label the classifier can emit.
# NOTE(review): the keys (including the spelling 'southatlandtic') presumably
# mirror the accent model's own label names - do not "correct" them without
# checking the model's id2label mapping.
agent_country_dict = {
    accent_label: f'Caller assigned to {demonym} agent.'
    for accent_label, demonym in (
        ('african', 'African'),
        ('australia', 'Australian'),
        ('bermuda', 'Bermudian'),
        ('canada', 'Canadian'),
        ('england', 'English'),
        ('hongkong', 'Hong Kong'),
        ('indian', 'Indian'),
        ('ireland', 'Irish'),
        ('malaysia', 'Malaysian'),
        ('newzealand', 'New Zealand'),
        ('philippines', 'Filipino'),
        ('scotland', 'Scottish'),
        ('singapore', 'Singaporean'),
        ('southatlandtic', 'South Atlantic'),
        ('us', 'US'),
        ('wales', 'Welsh'),
    )
}

In [6]:
def endorse_to_agent(file_path):
    """Classify the caller's English accent and return the routing message.

    The audio file is downmixed to one channel, resampled to the rate the
    feature extractor expects, and run through the accent classification
    model. The predicted label is then looked up in ``agent_country_dict``.

    Parameters
    ----------
    file_path : str
        Audio file of the caller.

    Returns
    -------
    str
        The ``agent_country_dict`` entry for the predicted accent label.

    Raises
    ------
    KeyError
        If the model predicts a label that ``agent_country_dict`` lacks.
    """
    import torch
    import torchaudio
    from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

    # Load the saved model
    # NOTE(review): the model comes from a local directory while the feature
    # extractor comes from the hub repo - confirm both belong to the same
    # checkpoint.
    model_name = "english_accents_classification"
    model = AutoModelForAudioClassification.from_pretrained(model_name)

    # Load the feature extractor
    feature_extractor = AutoFeatureExtractor.from_pretrained("dima806/english_accents_classification")

    # Move the model to the appropriate device (CPU or GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Resample to the rate the extractor is told about below, so the declared
    # and the actual sampling rate can never disagree.
    target_sample_rate = feature_extractor.sampling_rate

    waveform, source_rate = torchaudio.load(file_path)
    if waveform.shape[0] > 1:
        # torchaudio returns (channels, samples); flattening a multi-channel
        # file would glue the channels end to end, so average them instead.
        waveform = waveform.mean(dim=0, keepdim=True)
    if source_rate != target_sample_rate:
        resample = torchaudio.transforms.Resample(source_rate, target_sample_rate)
        waveform = resample(waveform)
    audio = waveform.squeeze(0).numpy()

    # Preprocess the audio
    inputs = feature_extractor(audio, sampling_rate=target_sample_rate, return_tensors="pt")

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits (raw predictions)
    logits = outputs.logits

    # Get predicted class: arg-max over the class axis of the single sample
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_class_label = model.config.id2label[predicted_class_id]

    # Print the predicted class
    print(f"Predicted class ID: {predicted_class_id}")
    print(f"Predicted class label: {predicted_class_label}")

    return agent_country_dict[predicted_class_label]

In [9]:
# Runs the caller through both checks to see whether they are really not angry
def verify_customer_is_calm(file_path):
    """Run the transcript check and the audio emotion check on one call.

    The caller counts as angry when the audio model returns ``'angry'`` or
    the transcript check returns ``'OFFENSIVE-LANGUAGE'``. An angry caller is
    additionally passed to ``endorse_to_agent`` and the routing message is
    printed.

    Parameters
    ----------
    file_path : str
        Audio file of the call.

    Returns
    -------
    int
        ``1`` when the caller is considered angry, ``0`` when calm. Note that
        this is the opposite of what the function name suggests.
    """
    transcript_verdict = is_caller_angry(file_path)
    audio_emotion = get_emotion(file_path)

    caller_is_angry = audio_emotion == 'angry' or transcript_verdict == 'OFFENSIVE-LANGUAGE'
    if not caller_is_angry:
        print('customer is calm')
        return 0

    # Accent routing only happens for callers flagged as angry.
    routing_message = endorse_to_agent(file_path)
    print('customer is angry')
    print(routing_message)
    return 1

In [10]:
# Example run on a single clip. The bare call on the last line makes the
# notebook display the return value (1 = caller flagged as angry, 0 = calm).
file_path = "ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_12/03-01-05-02-01-02-12.wav"
verify_customer_is_calm(file_path)

Transcribing. Please wait...
Transcript: {'text': ' Kids are talking by the door!'}
User is calm.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 627ms/step
Predicted class ID: 4
Predicted class label: england
customer is angry
Caller assigned to English agent.


1