In [None]:
import numpy as np
import pandas as pd
import librosa
import os
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from warnings import simplefilter
from sklearn.metrics import f1_score
from sklearn.utils import resample

In [2]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [3]:
data_train = pd.read_csv('Data_Train_modified.csv')
data_val = pd.read_csv('Data_Val_modified.csv')
data_test = pd.read_csv('Data_Test_original.csv')

In [4]:
tqdm.pandas()
data_train['sentiment'] = data_train['sentiment'].progress_apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutaral')
data_val['sentiment'] = data_val['sentiment'].progress_apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutaral')
data_test['sentiment'] = data_test['sentiment'].progress_apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutaral')

100%|██████████| 16274/16274 [00:00<00:00, 938294.41it/s]
100%|██████████| 1861/1861 [00:00<00:00, 524464.14it/s]
100%|██████████| 4662/4662 [00:00<00:00, 1537735.55it/s]


In [5]:
neutral_count = data_train[data_train["sentiment"] == "neutaral"].shape[0]
data_positive = data_train[data_train["sentiment"] == "positive"]
data_negative = data_train[data_train["sentiment"] == "negative"]

data_positive_downsampled = resample(data_positive, replace=False, n_samples=neutral_count, random_state=42)
data_negative_downsampled = resample(data_negative, replace=False, n_samples=neutral_count, random_state=42)
data_train = pd.concat([data_train[data_train["sentiment"] == "neutaral"], data_positive_downsampled, data_negative_downsampled])

In [6]:
audio_folder = "./Audio/WAV_16000/"
SAMPLE_RATE = 16000
HOP_LENGTH = 512
N_MFCC= 13
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(DEVICE)

In [7]:
def extract_mfcc(file_path, start_time, end_time, n_mfcc):
    y, sr = librosa.load(file_path, sr=None)
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr)
    y_segment = y[start_sample:end_sample]
        
    mfcc = librosa.feature.mfcc(y=y_segment, sr=sr, n_mfcc=n_mfcc)
    mfcc_mean = np.mean(mfcc.T, axis=0)   

    return mfcc_mean.flatten()

def extract_bert_features(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=600).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
        
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]

In [8]:
def add_mfcc(data, n_mfcc):
    data["mfcc"] = data.progress_apply(lambda row: extract_mfcc(os.path.join(audio_folder, row["video"] + ".wav"), row["start_time"], row["end_time"], N_MFCC), axis=1)
    mfcc_columns = [f"mfcc_{i}" for i in range(n_mfcc)]
    data[mfcc_columns] = pd.DataFrame(data["mfcc"].tolist(), index=data.index)
    data.drop(columns=["mfcc"], inplace=True)
    return data

def add_bert_embedings(data):
    simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    data["bert_features"] = data["text"].progress_apply(lambda x: extract_bert_features(x))
    bert_columns = [f"bert_{i}" for i in range(768)]
    data[bert_columns] = pd.DataFrame(data["bert_features"].tolist(), index=data.index)
    data.drop(columns=["bert_features"], inplace=True)
    return data

In [9]:
data_train = add_mfcc(data_train, N_MFCC)
data_val = add_mfcc(data_val, N_MFCC)
data_test = add_mfcc(data_test, N_MFCC)

data_train = add_bert_embedings(data_train)
data_val = add_bert_embedings(data_val)
data_test = add_bert_embedings(data_test)

100%|██████████| 10563/10563 [02:53<00:00, 60.88it/s]
100%|██████████| 1861/1861 [00:29<00:00, 63.18it/s]
100%|██████████| 4662/4662 [01:17<00:00, 60.34it/s]
100%|██████████| 10563/10563 [02:44<00:00, 64.23it/s]
100%|██████████| 1861/1861 [00:27<00:00, 67.75it/s]
100%|██████████| 4662/4662 [01:09<00:00, 67.15it/s]


In [10]:
data_train = pd.concat([data_train, pd.get_dummies(data_train['sentiment'])], axis=1)
data_val = pd.concat([data_val, pd.get_dummies(data_val['sentiment'])], axis=1)
data_test = pd.concat([data_test, pd.get_dummies(data_test['sentiment'])], axis=1)

In [11]:
X_train = data_train.drop(['video', 'start_time', 'end_time', 'sentiment', 'happy', 'sad',	'anger', 'surprise', 'disgust',	'fear', 'text', 'ASR', 'negative', 'neutaral', 'positive'], axis=1)
y_train = data_train[['negative', 'neutaral', 'positive']]

X_val = data_val.drop(['video', 'start_time', 'end_time', 'sentiment', 'happy', 'sad',	'anger', 'surprise', 'disgust',	'fear', 'text', 'ASR', 'negative', 'neutaral', 'positive'], axis=1)
y_val = data_val[['negative', 'neutaral', 'positive']]

X_test = data_test.drop(['video', 'start_time', 'end_time', 'sentiment', 'happy', 'sad',	'anger', 'surprise', 'disgust',	'fear', 'text', 'ASR', 'negative', 'neutaral', 'positive'], axis=1)
y_test = data_test[['negative', 'neutaral', 'positive']]

In [12]:
class CustomDataset(Dataset):
    def __init__(self, inputs, targets, device):
        self.inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
        self.targets = torch.tensor(targets, dtype=torch.float32).to(device)
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        return self.inputs[index], self.targets[index]

class DeepNeural(nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(2, 2)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(2, 2)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool1d(2, 2)

        self.linear1 = nn.Linear(input_shape[1] // 8 * 64, 512)
        self.linear2 = nn.Linear(512, 1024)
        self.linear3 = nn.Linear(1024, 64)
        self.linear4 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.pool1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.pool2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.pool3(x))

        x = torch.flatten(x, start_dim=1)

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = self.linear4(x)
        return F.softmax(x, dim=1)

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

trainset = CustomDataset(X_train.to_numpy(), y_train.to_numpy(), device)
trainloader = DataLoader(trainset, batch_size=128, shuffle=True)

valset = CustomDataset(X_val.to_numpy(), y_val.to_numpy(), device)
valloader = DataLoader(valset, batch_size=128, shuffle=False)

testset = CustomDataset(X_test.to_numpy(), y_test.to_numpy(), device)
testloader = DataLoader(testset, batch_size=128, shuffle=False)

model = DeepNeural(X_train.shape, y_train.shape[1]).to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
loss_function = nn.CrossEntropyLoss()

In [14]:
train_losses = []
val_losses = []

for epoch in range(25):
    model.train()
    train_loss = 0
    for data in trainloader:
        X, y = data
        X = X.unsqueeze(1)
        output = model(X)
        loss = loss_function(output, y)
        train_loss += loss.item()

        model.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= len(trainloader)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for data in valloader:
            X, y = data
            X = X.unsqueeze(1)
            output = model(X)
            val_loss += loss_function(output, y).item()
    
    val_loss /= len(valloader)

    print(f"Epoch {epoch+1}, Training Loss: {train_loss:.4f} , Validation Loss: {val_loss:.4f}")

Epoch 1, Training Loss: 1.0833 , Validation Loss: 1.0310
Epoch 2, Training Loss: 1.0037 , Validation Loss: 0.9697
Epoch 3, Training Loss: 0.9611 , Validation Loss: 0.9732
Epoch 4, Training Loss: 0.9382 , Validation Loss: 0.9300
Epoch 5, Training Loss: 0.9263 , Validation Loss: 0.9658
Epoch 6, Training Loss: 0.9123 , Validation Loss: 0.9126
Epoch 7, Training Loss: 0.9036 , Validation Loss: 0.9015
Epoch 8, Training Loss: 0.8968 , Validation Loss: 0.9357
Epoch 9, Training Loss: 0.8868 , Validation Loss: 0.9115
Epoch 10, Training Loss: 0.8848 , Validation Loss: 0.9556
Epoch 11, Training Loss: 0.8774 , Validation Loss: 0.9168
Epoch 12, Training Loss: 0.8746 , Validation Loss: 0.9063
Epoch 13, Training Loss: 0.8604 , Validation Loss: 0.9079
Epoch 14, Training Loss: 0.8578 , Validation Loss: 0.9088
Epoch 15, Training Loss: 0.8472 , Validation Loss: 0.9163
Epoch 16, Training Loss: 0.8400 , Validation Loss: 0.9059
Epoch 17, Training Loss: 0.8320 , Validation Loss: 0.9325
Epoch 18, Training Loss

In [15]:
def evaluate_f1(model, dataloader, device):
    model.eval()  # Переводим модель в режим оценки
    all_preds = []
    all_targets = []
    
    with torch.no_grad():  # Отключаем градиенты для экономии памяти
        for data in valloader:
            X, y = data
            X = X.to(device).unsqueeze(1)  # Возможно, потребуется изменить размерность
            y = y.to(device)

            output = model(X)
            preds = torch.argmax(output, dim=1)  # Получаем предсказанные индексы классов
            
            all_preds.extend(preds.cpu().numpy())  # Переносим на CPU и в numpy
            all_targets.extend(torch.argmax(y, dim=1).cpu().numpy())

        
        all_preds = np.array(all_preds).flatten()
        all_targets = np.array(all_targets).flatten()
        f1 = f1_score(all_targets, all_preds, average='weighted')  # Вычисляем F1-score

        print("F1-score:", f1)

evaluate_f1(model, valloader, device)

F1-score: 0.6267584585111223


MFCC 90: F1-score: 0.619552539933813 \
MFCC 80: F1-score: 0.6096668476012584 \
MFCC 70: F1-score: 0.6105239623728058 \
MFCC 60: F1-score: 0.6261661736068509 \
MFCC 50: F1-score: 0.6107048637994825 \
MFCC 40: F1-score: 0.6010986178719756 \
MFCC 30: F1-score: 0.5804465885872316 \
MFCC 20: F1-score: 0.6065589058584931 \
MFCC 13: F1-score: 0.6267584585111223