# Сбор и предварительная обработка датасета

In [2]:
import numpy as np
import pandas as pd

## Dailydialog

### Собираем датасет из сырых данных

In [3]:
with open('data/dailydialog/dialogues_text.txt', 'r', encoding='utf-8') as f:
    dialogues = [line.strip().split('__eou__')[:-1] for line in f.readlines()]  # remove last empty after __eou__

with open('data/dailydialog/dialogues_topic.txt', 'r', encoding='utf-8') as f:
    topics = [int(line.strip()) for line in f.readlines()]

with open('data/dailydialog/dialogues_act.txt', 'r', encoding='utf-8') as f:
    acts = [list(map(int, line.strip().split())) for line in f.readlines()]

with open('data/dailydialog/dialogues_emotion.txt', 'r', encoding='utf-8') as f:
    emotions = [list(map(int, line.strip().split())) for line in f.readlines()]

# Prepare dataframe
data = []

for dialog_id, (utterances, topic, act_list, emotion_list) in enumerate(zip(dialogues, topics, acts, emotions)):
    previous_person_message = None
    for i, (message, act, emotion) in enumerate(zip(utterances, act_list, emotion_list)):
        entry = {
            "dialog_id": dialog_id,
            "person_message": message.strip(),
            "previous_person_message": previous_person_message,
            "topic": topic,    # {1: Ordinary Life, 2: School Life, 3: Culture & Education, 4: Attitude & Emotion, 5: Relationship, 6: Tourism , 7: Health, 8: Work, 9: Politics, 10: Finance}
            "act": act,        # { 1: inform，2: question, 3: directive, 4: commissive }
            "emotion": emotion # { 0: no emotion, 1: anger, 2: disgust, 3: fear, 4: happiness, 5: sadness, 6: surprise}
        }
        previous_person_message = message.strip()
        data.append(entry)


df = pd.DataFrame(data)
df.to_csv('data/dailydialog/processed/dialogues_topic.csv', index=False)
df.head(10)

Unnamed: 0,dialog_id,person_message,previous_person_message,topic,act,emotion
0,0,The kitchen stinks .,,1,3,2
1,0,I'll throw out the garbage .,The kitchen stinks .,1,4,0
2,1,"So Dick , how about getting some coffee for to...",,1,3,4
3,1,Coffee ? I don ’ t honestly like that kind of ...,"So Dick , how about getting some coffee for to...",1,4,2
4,1,"Come on , you can at least try a little , besi...",Coffee ? I don ’ t honestly like that kind of ...,1,3,0
5,1,What ’ s wrong with that ? Cigarette is the th...,"Come on , you can at least try a little , besi...",1,1,1
6,1,"Not for me , Dick .",What ’ s wrong with that ? Cigarette is the th...,1,1,0
7,2,Are things still going badly with your housegu...,,1,2,0
8,2,Getting worse . Now he ’ s eating me out of ho...,Are things still going badly with your housegu...,1,1,1
9,2,"Leo , I really think you ’ re beating around t...",Getting worse . Now he ’ s eating me out of ho...,1,3,0


### Преобразование текста в числовой формат (word2vec, bert)

### word2vec

In [47]:
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Загрузка датасета
df = pd.read_csv("data/dailydialog/processed/dialogues_topic.csv")

# Объединяем person_message и previous_person_message
# df['text_combined'] = df['previous_person_message'].fillna('') + ' ' + df['person_message'] TODO: check
df['text_combined'] = df['person_message']

# Токенизация
sentences = df['text_combined'].apply(lambda x: x.lower().split()).tolist()

# Обучаем модель Word2Vec
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Функция усреднения эмбеддингов
def get_avg_w2v(sentence, model):
    words = sentence.lower().split()
    valid_words = [w for w in words if w in model.wv]
    if not valid_words:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[valid_words], axis=0)

# Применим к каждому тексту
text_vectors = np.array([get_avg_w2v(t, w2v_model) for t in df['text_combined']])

# Кодируем topic и act
encoder = OneHotEncoder(sparse_output=False)
cat_features_w2v = encoder.fit_transform(df[['topic', 'act']])

# Объединяем признаки
# X_w2v = np.hstack([text_vectors, cat_features_w2v]) // TODO: check
X_w2v = text_vectors
y_w2v = df['emotion'].values

# Разделим на 80% train и 20% test
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v,
    y_w2v,
    test_size=0.2,
    random_state=42,
    stratify=y_w2v
)


#### bert

In [49]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

# Загрузка датасета
df = pd.read_csv("data/dailydialog/processed/dialogues_topic.csv")
# df['text_combined'] = df['previous_person_message'].fillna('') + ' ' + df['person_message'] // TODO: check
df['text_combined'] = df['person_message']

# Подключаем BERT
model_name = "Kostya165/rubert_emotion_slicer" # or bert-base-uncased
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
bert_model.to(device)
bert_model.eval()

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        outputs = bert_model(**inputs.to(device))  # только модель на MPS
    cls_embedding = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()  # возвращаем на CPU
    return cls_embedding

bert_vectors = []
for t in tqdm(df['text_combined'].tolist(), desc="Embedding BERT"):
    bert_vectors.append(get_bert_embedding(t))

bert_vectors = np.array(bert_vectors)

Using device: mps


Embedding BERT:   0%|          | 430/102979 [00:08<32:02, 53.35it/s]


KeyboardInterrupt: 

In [42]:
from sklearn.model_selection import train_test_split

bert_vectors1 = np.squeeze(bert_vectors, axis=1)  # убираем лишнюю размерность

# Кодируем topic и act
encoder = OneHotEncoder(sparse_output=False)
cat_features_bert = encoder.fit_transform(df[['topic', 'act']])

# Объединяем признаки
# X_bert = np.hstack([bert_vectors1, cat_features_bert]) TODO: check
X_bert = bert_vectors1
y_bert = df['emotion'].values

# Разделим на 80% train и 20% test
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    X_bert,
    y_bert,
    test_size=0.2,
    random_state=42,
    stratify=y_bert
)


### Обучаем и сравниваем классификаторы

#### word2vec

In [48]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score


def fit_predict_with_progress(model, X_train, y_train, X_test, desc):
    tqdm.write(f"Обучение: {desc}")
    model.fit(X_train, y_train)
    tqdm.write(f"Предсказание: {desc}")
    return model.predict(X_test)

logreg_w2v = LogisticRegression(max_iter=1000)
y_pred_logreg_w2v = fit_predict_with_progress(logreg_w2v, X_train_w2v, y_train_w2v, X_test_w2v, "Word2Vec + LogisticRegression")

rf_w2v = RandomForestClassifier()
y_pred_rf_w2v = fit_predict_with_progress(rf_w2v, X_train_w2v, y_train_w2v, X_test_w2v, "Word2Vec + RandomForest")

dummy = DummyClassifier(strategy="most_frequent")
y_pred_dummy_w2v = fit_predict_with_progress(dummy, X_train_w2v, y_train_w2v, X_test_w2v, "Word2Vec + DummyClassifier")

print("Word2Vec - Logistic Regression Accuracy:", accuracy_score(y_test_w2v, y_pred_logreg_w2v))
print("Word2Vec - Random Forest Accuracy:", accuracy_score(y_test_w2v, y_pred_rf_w2v))
print("Word2Vec - Dummy Accuracy:", accuracy_score(y_test_w2v, y_pred_dummy_w2v))

print("Word2Vec - Logistic Regression f1:", f1_score(y_test_w2v, y_pred_logreg_w2v, average='macro'))
print("Word2Vec - Random Forest f1:", f1_score(y_test_w2v, y_pred_rf_w2v, average='macro'))
print("Word2Vec - Dummy f1:", f1_score(y_test_w2v, y_pred_dummy_w2v, average='macro'))

print("Word2Vec - Logistic Regression recall:", recall_score(y_test_w2v, y_pred_logreg_w2v, average='macro'))
print("Word2Vec - Random Forest recall:", recall_score(y_test_w2v, y_pred_rf_w2v, average='macro'))
print("Word2Vec - Dummy recall:", recall_score(y_test_w2v, y_pred_dummy_w2v, average='macro'))

print("\n=== Classification Report: Word2Vec + Logistic Regression ===")
print(classification_report(y_test_w2v, y_pred_logreg_w2v))

print("\n=== Classification Report: Word2Vec + Random Forest ===")
print(classification_report(y_test_w2v, y_pred_rf_w2v))

print("\n=== Classification Report: Word2Vec + Dummy ===")
print(classification_report(y_test_w2v, y_pred_dummy_w2v))


Обучение: Word2Vec + LogisticRegression
Предсказание: Word2Vec + LogisticRegression
Обучение: Word2Vec + RandomForest
Предсказание: Word2Vec + RandomForest
Обучение: Word2Vec + DummyClassifier
Предсказание: Word2Vec + DummyClassifier
Word2Vec - Logistic Regression Accuracy: 0.8434647504369781
Word2Vec - Random Forest Accuracy: 0.8499223150126238
Word2Vec - Dummy Accuracy: 0.8309380462225675
Word2Vec - Logistic Regression f1: 0.20843588266673657
Word2Vec - Random Forest f1: 0.3503168813619682
Word2Vec - Dummy f1: 0.12966624995264614
Word2Vec - Logistic Regression recall: 0.1897338541777263
Word2Vec - Random Forest recall: 0.2895348736348187
Word2Vec - Dummy recall: 0.14285714285714285

=== Classification Report: Word2Vec + Logistic Regression ===
              precision    recall  f1-score   support

           0       0.86      0.98      0.91     17114
           1       0.13      0.01      0.02       204
           2       0.00      0.00      0.00        71
           3       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### bert

In [43]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score
from tqdm import tqdm

# Обёртка для отслеживания этапов
def fit_predict_with_progress(model, X_train, y_train, X_test, desc):
    tqdm.write(f"Обучение: {desc}")
    model.fit(X_train, y_train)
    tqdm.write(f"Предсказание: {desc}")
    return model.predict(X_test)

logreg_bert = LogisticRegression(max_iter=10000)
y_pred_logreg_bert = fit_predict_with_progress(logreg_bert, X_train_bert, y_train_bert, X_test_bert, "BERT + LogisticRegression")

rf_bert = RandomForestClassifier()
y_pred_rf_bert = fit_predict_with_progress(rf_bert, X_train_bert, y_train_bert, X_test_bert, "BERT + RandomForest")

dummy = DummyClassifier(strategy="most_frequent")
y_pred_dummy_bert = fit_predict_with_progress(dummy, X_train_bert, y_train_bert, X_test_bert, "BERT + DummyClassifier")

print("BERT - Logistic Regression Accuracy:", accuracy_score(y_test_bert, y_pred_logreg_bert))
print("BERT - Random Forest Accuracy:", accuracy_score(y_test_bert, y_pred_rf_bert))
print("BERT - Dummy Accuracy:", accuracy_score(y_test_bert, y_pred_dummy_bert))

print("BERT - Logistic Regression f1:", f1_score(y_test_bert, y_pred_logreg_bert, average='macro'))
print("BERT - Random Forest f1:", f1_score(y_test_bert, y_pred_rf_bert, average='macro'))
print("BERT - Dummy f1:", f1_score(y_test_bert, y_pred_dummy_bert, average='macro'))

print("BERT - Logistic Regression recall:", recall_score(y_test_bert, y_pred_logreg_bert, average='macro'))
print("BERT - Random Forest recall:", recall_score(y_test_bert, y_pred_rf_bert, average='macro'))
print("BERT - Dummy recall:", recall_score(y_test_bert, y_pred_dummy_bert, average='macro'))

print("\n=== Classification Report: BERT + Logistic Regression ===")
print(classification_report(y_test_bert, y_pred_logreg_bert))

print("\n=== Classification Report: BERT + Random Forest ===")
print(classification_report(y_test_bert, y_pred_rf_bert))

print("\n=== Classification Report: BERT + Dummy ===")
print(classification_report(y_test_bert, y_pred_dummy_bert))


Обучение: BERT + LogisticRegression
Предсказание: BERT + LogisticRegression
Обучение: BERT + RandomForest
Предсказание: BERT + RandomForest
Обучение: BERT + DummyClassifier
Предсказание: BERT + DummyClassifier
BERT - Logistic Regression Accuracy: 0.8347737424742668
BERT - Random Forest Accuracy: 0.8402602447077102
BERT - Dummy Accuracy: 0.8309380462225675
BERT - Logistic Regression f1: 0.1544869409978658
BERT - Random Forest f1: 0.2723920908598581
BERT - Dummy f1: 0.12966624995264614
BERT - Logistic Regression recall: 0.15585365814630284
BERT - Random Forest recall: 0.2248226889825826
BERT - Dummy recall: 0.14285714285714285

=== Classification Report: BERT + Logistic Regression ===
              precision    recall  f1-score   support

           0       0.84      0.99      0.91     17114
           1       0.00      0.00      0.00       204
           2       0.00      0.00      0.00        71
           3       0.00      0.00      0.00        35
           4       0.56      0.10    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
