In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import string

# Fetch the NLTK data packages this notebook relies on. Presumably 'punkt_tab'
# backs word_tokenize, 'stopwords' backs stopwords.words and 'wordnet' backs
# WordNetLemmatizer — verify against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Load the raw dataset; later cells read its 'text' and 'Emotion' columns.
df = pd.read_csv("emotion_sentimen_dataset.csv")

print(df.head())

# Blank lines used purely as a visual separator between printed sections.
print("""




""")

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() appended a stray "None" line.
df.info()

# Class balance of the target column.
print(df['Emotion'].value_counts())

print("""




""")



In [None]:
def clean_text(text):
    """Normalize one raw text sample.

    Steps, in order: lower-case, drop URLs, drop @mentions and #hashtags
    (including the word attached to them), drop punctuation, drop digits,
    collapse whitespace runs and trim.

    Args:
        text: The raw text. Anything that is not a ``str`` (``None``, a pandas
            NaN float for a missing cell, a number, ...) is treated as missing.

    Returns:
        The normalized string; ``''`` for missing / non-string input.
    """
    # Missing cells arrive from pandas as float NaN, which has no .lower();
    # map every non-string value to an empty document instead of crashing.
    if not isinstance(text, str):
        return ''

    # Lower-case first so the case-sensitive URL pattern below also catches
    # upper-case schemes such as "HTTP://".
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|\#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Store the normalized text in a new column; the raw 'text' column is kept.
df['cleaned_text'] = df['text'].apply(clean_text)

# Show raw and cleaned text side by side for the first rows.
print(df[['text','cleaned_text']].head())

In [None]:

# English stop words held in a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text``, drop stop words, lemmatize, and re-join with spaces.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: An already cleaned string (see ``clean_text``).

    Returns:
        The remaining lemmas joined by single spaces.
    """
    kept_lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )
    return ' '.join(kept_lemmas)

# Token-level preprocessing runs on the cleaned column, not on the raw text.
df['preprocessed_text'] = df['cleaned_text'].apply(preprocess_text)
# Print all three stages side by side to eyeball the effect of each step.
print(df[['text', 'cleaned_text', 'preprocessed_text']])


In [None]:
# Bar chart of class frequencies, most frequent emotion first.
emotion_order = df['Emotion'].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Emotion', order=emotion_order, ax=ax)
ax.set_title('Распределение эмоций в датасете')
ax.set_xlabel('Эмоция')
ax.set_ylabel('Количество')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
def get_top_n_words(corpus,n=10):
    """Return the ``n`` most frequent whitespace-separated words in ``corpus``.

    Args:
        corpus: Iterable of documents (list, generator, pandas Series, ...).
            Entries that are not strings (``None``, NaN from a missing cell)
            are skipped instead of raising.
        n: Number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. Ties keep the
        order in which the words were first seen.
    """
    word_counts = Counter()
    # Count document by document: avoids building one huge joined string for
    # the whole corpus and lets non-string entries be skipped safely.
    for document in corpus:
        if isinstance(document, str):
            word_counts.update(document.split())
    return word_counts.most_common(n)

# Top-10 words per emotion, keyed by emotion in order of first appearance.
emotion_words = {
    emotion: get_top_n_words(
        df.loc[df['Emotion'] == emotion, 'preprocessed_text'],
        n=10,
    )
    for emotion in df['Emotion'].unique()
}

# One "word: count" line per entry, with a blank gap after each emotion.
for emotion, words in emotion_words.items():
    print(f"Most popular words for emotion - {emotion}:")
    for word, count in words:
        print(f"{word}: {count}")
    print('\n')

In [7]:
# Persist the frame (including the cleaned/preprocessed columns) so the modelling
# cells can start from this file; index=False keeps the row index out of the CSV.
df.to_csv('preprocessed_emotion_dataset.csv', index=False)

In [16]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import joblib

In [None]:
# Reload the preprocessed data written earlier. NOTE(review): rows whose
# preprocessed text was empty presumably come back as NaN — the NaN check and
# fillna in the next cell suggest so; confirm.
df = pd.read_csv('preprocessed_emotion_dataset.csv')
print(df.head())

In [None]:
# Features are the preprocessed texts, targets are the emotion names.
X = df['preprocessed_text']
y = df['Emotion']

# Report missing texts, then replace them with empty documents so the vectorizer accepts them.
print(f"Количество NaN в данных: {X.isna().sum()}")
X = X.fillna('')

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# TF-IDF capped at 5000 features; min_df / max_df bound the document frequency of kept terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features = 5000,
    min_df = 5,
    max_df = 0.8
)

# Fit the vocabulary on the training split only, then reuse it unchanged for the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Baseline classifier on the TF-IDF features; class_weight='balanced' is passed
# to counter the class imbalance, random_state fixes the seed.
logreg = LogisticRegression(
    max_iter = 1000,
    random_state = 42,
    class_weight = 'balanced'
)

logreg.fit(X_train_tfidf, y_train)

# Hard predictions on the held-out split, reused by the confusion-matrix cell.
y_pred_logreg = logreg.predict(X_test_tfidf)

print("\nРезультаты для Логистической Регрессии:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_logreg))

In [None]:
from sklearn.metrics import roc_auc_score

# Class-probability matrix for the test split (one column per class).
y_pred_proba = logreg.predict_proba(X_test_tfidf)

# roc_auc_score defaults to average='macro', so the plain one-vs-rest score and
# the explicit macro score are the same quantity: compute it once and reuse it.
roc_auc_macro = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
roc_auc_ovr = roc_auc_macro
print(f"\nROC AUC (OvR): {roc_auc_ovr:.4f}")
print(f"ROC AUC (macro): {roc_auc_macro:.4f}")

# Weighted variant: per-class AUCs weighted by class support.
roc_auc_weighted = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
print(f"ROC AUC (weighted): {roc_auc_weighted:.4f}")


In [None]:
# Confusion matrix over every label seen in the full target column, in sorted order.
unique_emotions = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred_logreg, labels=unique_emotions)

# Row-normalize (each row = true class). A label with no true samples in the
# test split has a zero row total; leave that row at 0 instead of producing
# NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure(figsize=(12, 10))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=unique_emotions, yticklabels=unique_emotions)
plt.title('Нормализованная матрица ошибок (Логистическая Регрессия)')
plt.xlabel('Предсказанные метки')
plt.ylabel('Истинные метки')
plt.tight_layout()
plt.show()


In [None]:
# Disabled hyper-parameter search, kept for reference. It is stored as comments
# rather than as a bare string literal so the cell no longer evaluates the text
# and echoes it as cell output. Uncomment to run.
#
# param_distr = {
#     'C': [0.1,1,10],
#     'solver': ['liblinear','saga'],
#     'penalty': ['l1','l2']
# }
#
# random_search = RandomizedSearchCV(
#     LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
#     param_distributions=param_distr,
#     n_iter = 5,
#     cv = 5,
#     scoring='f1_macro',
#     n_jobs= -1,
#     random_state=42
# )
#
# random_search.fit(X_train_tfidf,y_train)
# print("\nЛучшие параметры для Логистической Регрессии:")
# print(random_search.best_params_)
# print(f"Лучший F1-macro: {random_search.best_score_:.4f}")

In [None]:
# Disabled refit with the best parameters found by the randomized search; it
# needs a fitted `random_search` object. Stored as comments rather than as a
# bare string literal so the cell no longer echoes the text as its output.
#
# best_logreg = LogisticRegression(
#     **random_search.best_params_,
#     max_iter=1000,
#     random_state=42,
#     class_weight = 'balanced'
# )
# best_logreg.fit(X_train_tfidf,y_train)
# y_pred_best_logreg = best_logreg.predict(X_test_tfidf)
# print("\nРезультаты для оптимизированной Логистической Регрессии:")
# print(f"Accuracy: {accuracy_score(y_test, y_pred_best_logreg):.4f}")
# print(classification_report(y_test, y_pred_best_logreg))

In [None]:

# Bundle the already-fitted vectorizer and classifier; the pipeline itself is
# never fitted here, it only chains the two fitted objects for inference.
# NOTE(review): the vectorizer was fitted on df['preprocessed_text'], so callers
# presumably need to clean/preprocess text the same way before predicting.
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('model', logreg)
])


# Serialize the whole pipeline to a single file.
joblib.dump(pipeline, 'emotion_classification_model.pkl')
print("Модель сохранена в файл 'emotion_classification_model.pkl'")

In [None]:
# Hand-written sentences used as a quick sanity check of the saved pipeline.
test_examples = [
    "I'm so happy to see you today!",
    "This makes me really angry and frustrated.",
    "I'm feeling quite sad and depressed after what happened.",
    "I'm worried about the upcoming exam.",
    "I absolutely hate when people do this to me.",
    "I love spending time with my family.",
    "I'm feeling neutral about the whole situation."
]

for text in test_examples:
    # The TF-IDF vocabulary was learned from cleaned, stop-word-free, lemmatized
    # text, so a raw sentence must go through the same two steps before it
    # reaches the pipeline; otherwise inference sees a different token
    # distribution than training did.
    model_input = preprocess_text(clean_text(text))
    emotion = pipeline.predict([model_input])[0]

    if hasattr(pipeline['model'], 'predict_proba'):
        probas = pipeline.predict_proba([model_input])[0]
        # Pair each class name with its probability and keep the three largest.
        emotion_probas = dict(zip(pipeline['model'].classes_, probas))
        top_emotions = sorted(emotion_probas.items(), key=lambda x: x[1], reverse=True)[:3]

        print(f"Текст: '{text}'")
        print(f"Предсказанная эмоция: {emotion}")
        print("Топ-3 вероятностей:")
        for emotion_name, prob in top_emotions:
            print(f"  - {emotion_name}: {prob:.4f} ({prob*100:.1f}%)")
        print()

In [None]:
def analyze_emotion(text):
    """Predict and print the emotion of one raw sentence with the TF-IDF pipeline.

    The sentence is first run through ``clean_text`` and ``preprocess_text``:
    the vectorizer was fitted on text preprocessed that way, so feeding it raw
    input would not match the training distribution.

    Args:
        text: Raw, user-supplied sentence.

    Returns:
        The predicted emotion label.
    """
    model_input = preprocess_text(clean_text(text))
    emotion = pipeline.predict([model_input])[0]

    # Echo the original text, not the preprocessed form.
    print(f"Текст: '{text}'")
    print(f"Определенная эмоция: {emotion}")

    if hasattr(pipeline['model'], 'predict_proba'):
        probas = pipeline.predict_proba([model_input])[0]
        emotion_probas = dict(zip(pipeline['model'].classes_, probas))
        top_emotions = sorted(emotion_probas.items(), key=lambda x: x[1], reverse=True)[:3]

        print("Топ-3 вероятных эмоций:")
        for emotion_name, prob in top_emotions:
            print(f"  - {emotion_name}: {prob:.4f} ({prob*100:.1f}%)")

    return emotion

# Interactive prompt: this cell blocks on input() until the user types 'выход'
# (compared case-insensitively), so it will stall an unattended "Run All".
print("\nВведите текст для анализа эмоций (для выхода введите 'выход'):")
while True:
    user_input = input("> ")
    if user_input.lower() == 'выход':
        break
    analyze_emotion(user_input)


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
import datetime
import re
import nltk
from nltk.corpus import stopwords

In [2]:
# Reload the preprocessed data for the BERT part of the notebook.
df = pd.read_csv('preprocessed_emotion_dataset.csv')

# Distinct emotion names in order of first appearance; their position is the class id.
labels = df['Emotion'].unique()
num_labels = len(labels)

# Emotion name -> integer class id.
label_dict = {label: index for index, label in enumerate(labels)}

df['label'] = df['Emotion'].map(label_dict)

In [None]:
def reduce_dataset(df, reduction_factor=8):
    """Down-sample ``df`` class by class and return a shuffled copy.

    Per class in the ``'Emotion'`` column:

    * rare classes (fewer rows than 1% of the frame) keep half of their rows,
      but never fewer than one, so no class disappears;
    * other classes keep ``count / reduction_factor`` rows with a floor of 500,
      capped at the class size so a class with fewer than 500 rows is kept
      whole instead of making ``sample`` raise.

    Sampling and the final shuffle use ``random_state=42``. The class
    distribution before and after is printed.

    Args:
        df: Frame with an ``'Emotion'`` column. It is not modified.
        reduction_factor: Divisor applied to the size of non-rare classes.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index.
    """
    original_size = len(df)

    class_counts = df['Emotion'].value_counts()
    print("Исходное распределение классов:")
    print(class_counts)

    rare_threshold = original_size * 0.01

    # Collect the per-class samples and concatenate once at the end; growing a
    # frame with concat inside the loop is quadratic.
    sampled_parts = []
    for emotion, count in class_counts.items():
        if count == 0:
            # e.g. an unused category of a categorical column: nothing to sample.
            continue

        if count < rare_threshold:
            sample_size = max(1, int(count * 0.5))
        else:
            # Cap at the class size: sampling without replacement cannot
            # return more rows than the class has.
            sample_size = min(int(count), max(500, int(count / reduction_factor)))

        emotion_df = df[df['Emotion'] == emotion]
        sampled_parts.append(emotion_df.sample(n=sample_size, random_state=42))

    if sampled_parts:
        reduced_df = pd.concat(sampled_parts)
    else:
        # No rows at all: keep the columns so the summary below still works.
        reduced_df = df.iloc[0:0]

    reduced_df = reduced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Исходный размер: {original_size}, Новый размер: {len(reduced_df)}")
    print("Новое распределение классов:")
    print(reduced_df['Emotion'].value_counts())

    return reduced_df

# Replace df with its down-sampled version. This overwrites df, so re-running
# the cell shrinks the data again — reload the CSV first when re-executing.
df = reduce_dataset(df, reduction_factor=12)


In [None]:
# Drop rows with no preprocessed text, then make a stratified 80/20 split on the integer labels.
df = df.dropna(subset=['preprocessed_text'])
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])

# Pretrained tokenizer loaded by name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fixed sequence length (in tokens) every text is padded / truncated to.
MAX_LEN = 128

def tokenize_text(texts, tokenizer, max_len=MAX_LEN):
    """Encode ``texts`` into fixed-length id and attention-mask tensors.

    The tokenizer is called once on the whole batch with
    ``padding='max_length'``. This replaces the deprecated
    ``pad_to_max_length=True`` flag and matches the padding argument used by
    ``predict_emotion_bert`` at inference time.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...).
        tokenizer: A callable Hugging Face-style tokenizer.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        ``(input_ids, attention_masks)`` as returned by the tokenizer with
        ``return_tensors='pt'``; one row per text.
    """
    encoded = tokenizer(
        list(texts),  # materialize so a pandas Series is passed as plain strings
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Encode both splits with the same tokenizer and sequence length.
train_inputs, train_masks = tokenize_text(X_train, tokenizer)
test_inputs, test_masks = tokenize_text(X_test, tokenizer)

# Integer class ids as tensors, row-aligned with the encoded inputs above.
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

In [12]:
class EmotionDataset(Dataset):
    """Map-style dataset yielding one encoded sample per index.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``, taken from the same position of the three containers.
    """

    def __init__(self, inputs, masks, labels):
        """Store the three row-aligned containers.

        Args:
            inputs: Token ids, one row per sample.
            masks: Attention masks, one row per sample.
            labels: Class ids, one entry per sample.

        Raises:
            ValueError: If the three containers differ in length. Checking here
                fails fast instead of surfacing as an IndexError mid-epoch.
        """
        if not (len(inputs) == len(masks) == len(labels)):
            raise ValueError(
                "inputs, masks and labels must have the same length, got "
                f"{len(inputs)}, {len(masks)} and {len(labels)}"
            )
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        """Number of samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of its three parts."""
        return {
            'input_ids': self.inputs[idx],
            'attention_mask': self.masks[idx],
            'labels': self.labels[idx]
        }
    
# Mini-batch size shared by the training and test loaders.
batch_size = 32

# Training batches are reshuffled on every pass over the data.
train_dataset = EmotionDataset(train_inputs,train_masks,train_labels)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

# No shuffle argument is passed for the evaluation loader.
test_dataset = EmotionDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size
)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используется устройство: {device}")

In [None]:


# Pretrained BERT encoder with a fresh classification head of num_labels outputs.
# id2label / label2id are stored in the model config so the directory written by
# save_pretrained carries the emotion names and can be used without label_dict.
model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=num_labels,
    id2label={int(index): str(name) for name, index in label_dict.items()},
    label2id={str(name): int(index) for name, index in label_dict.items()},
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(device)

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

# Number of passes over the training data; also sizes the LR schedule below.
epochs = 4

# One scheduler step is taken per training batch.
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate over total_steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps=total_steps
)

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring class equals the true label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of true class ids; flattened before comparison.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)


def format_time(elapsed):
    """Render a duration in seconds as a ``timedelta`` string (``h:mm:ss``).

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second.

    Returns:
        ``str(datetime.timedelta(...))`` of the rounded duration.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def _train_one_epoch(model, train_dataloader, optimizer, scheduler, device, epoch_t0):
    """Run one optimization pass over ``train_dataloader``; return the mean batch loss."""
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress line every 40 batches (not for the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - epoch_t0)
            print(f'  Батч {step}  из  {len(train_dataloader)}. Прошло: {elapsed}')

        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        model.zero_grad()

        # Passing labels makes the model return the loss along with the logits.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    return total_train_loss / len(train_dataloader)


def _evaluate(model, eval_dataloader, device):
    """Return ``(mean loss, accuracy)`` of ``model`` over ``eval_dataloader``.

    Both numbers are sample-weighted: averaging per-batch values would give a
    smaller final batch the same weight as a full one.
    """
    model.eval()

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)

            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels
            )

            n_samples = b_labels.size(0)
            total_loss += outputs.loss.item() * n_samples
            predictions = torch.argmax(outputs.logits, dim=1)
            total_correct += (predictions == b_labels).sum().item()
            total_samples += n_samples

    return total_loss / total_samples, total_correct / total_samples


def train_model(model, train_dataloader, optimizer, scheduler, device, epochs=4, eval_dataloader=None):
    """Fine-tune ``model`` and evaluate it after every epoch.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose output exposes ``.loss``
            and ``.logits``.
        train_dataloader: Yields dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device the batches are moved to.
        epochs: Number of passes over ``train_dataloader``.
        eval_dataloader: Loader evaluated after each epoch. When omitted, the
            notebook-level ``test_dataloader`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A list with one dict per epoch holding the keys ``'epoch'``,
        ``'Training Loss'``, ``'Valid. Loss'``, ``'Valid. Accur.'``,
        ``'Training Time'`` and ``'Validation Time'``.

    Note:
        The model is left in eval mode on return, because evaluation is the
        last step of every epoch.
    """
    if eval_dataloader is None:
        # Backward-compatible fallback to the global defined in the notebook.
        eval_dataloader = test_dataloader

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):
        print(f"\n======== Эпоха {epoch_i + 1} / {epochs} ========")
        print('Обучение...')

        t0 = time.time()
        avg_train_loss = _train_one_epoch(
            model, train_dataloader, optimizer, scheduler, device, t0
        )
        training_time = format_time(time.time() - t0)

        print(f"\n  Среднее значение потери: {avg_train_loss:.2f}")
        print(f"  Время обучения: {training_time}")

        print("\nОценка модели на тестовом наборе...")

        t0 = time.time()
        avg_val_loss, avg_val_accuracy = _evaluate(model, eval_dataloader, device)
        validation_time = format_time(time.time() - t0)

        print(f"  Точность: {avg_val_accuracy:.2f}")
        print(f"  Потери валидации: {avg_val_loss:.2f}")
        print(f"  Время валидации: {validation_time}")

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("\nОбучение завершено!")
    print(f"Общее время обучения: {format_time(time.time() - total_t0)}")

    return training_stats



In [None]:
# Train with the same `epochs` value that sized the scheduler's total_steps;
# a separate literal here could drift from it and leave the learning-rate
# schedule too short or too long for the actual run.
training_stats = train_model(
    model,
    train_dataloader,
    optimizer,
    scheduler,
    device,
    epochs=epochs
)

# One row per epoch, indexed by epoch number, for plotting.
stats_df = pd.DataFrame(training_stats)
stats_df = stats_df.set_index('epoch')

plt.figure(figsize=(12, 6))

# Left panel: training vs. evaluation loss per epoch.
plt.subplot(1, 2, 1)
plt.plot(stats_df['Training Loss'], 'b-o', label='Training')
plt.plot(stats_df['Valid. Loss'], 'g-o', label='Validation')
plt.title('Training & Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Right panel: evaluation accuracy per epoch.
plt.subplot(1, 2, 2)
plt.plot(stats_df['Valid. Accur.'], 'r-o')
plt.title('Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

plt.tight_layout()
plt.show()

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
model_save_path = './bert_emotion_model/'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Модель сохранена в {model_save_path}")

In [None]:
def predict_emotion_bert(text, model, tokenizer, label_map, device, max_len=128):
    """Classify one raw sentence with the fine-tuned model.

    The text is normalized with the same steps, in the same order, as
    ``clean_text``: lower-case first, then drop URLs, @mentions/#hashtags,
    punctuation and digits, then collapse whitespace.

    NOTE(review): the training texts come from the 'preprocessed_text' column,
    which presumably also had stop words removed and tokens lemmatized; that
    step is not repeated here — confirm whether it should be.

    Args:
        text: Raw input sentence.
        model: Classifier called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose output exposes ``.logits``.
        tokenizer: Tokenizer providing ``encode_plus``.
        label_map: Mapping from integer class id to emotion name.
        device: Device the encoded tensors are moved to.
        max_len: Length the sequence is padded / truncated to.

    Returns:
        ``{'emotion': <top label>, 'probabilities': {label: prob, ...}}`` with
        the probabilities sorted from most to least likely.
    """
    # Lower-case before the regexes: the URL pattern is case-sensitive, so an
    # upper-case "HTTP://..." would otherwise survive as a garbage token.
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|\#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Make sure dropout is disabled even if the caller left the model in train mode.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        logits = outputs.logits

    # Single input, so take row 0 of the softmax over classes.
    probs = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]
    pred_class = int(np.argmax(probs))

    emotion = label_map[pred_class]

    emotion_probs = {label_map[i]: float(probs[i]) for i in range(len(probs))}

    # Most likely emotion first; dict insertion order preserves the ranking.
    emotion_probs = dict(sorted(emotion_probs.items(), key=lambda x: x[1], reverse=True))

    return {
        'emotion': emotion,
        'probabilities': emotion_probs
    }

# Invert label_dict (emotion name -> id) into id -> emotion name for decoding predictions.
label_map_inverse = {v: k for k, v in label_dict.items()}

# Hand-written sentences used as a quick sanity check of the fine-tuned model.
test_examples = [
    "I'm so happy to see you today!",
    "This makes me really angry and frustrated.",
    "I'm feeling quite sad and depressed after what happened.",
    "I'm worried about the upcoming exam.",
    "I absolutely hate when people do this to me.",
    "I love spending time with my family.",
    "I'm feeling neutral about the whole situation."
]

for text in test_examples:
    result = predict_emotion_bert(text, model, tokenizer, label_map_inverse, device)
    
    print(f"Текст: '{text}'")
    print(f"Предсказанная эмоция: {result['emotion']}")
    print("Топ-3 вероятностей:")
    # 'probabilities' is already sorted most-likely-first, so the first three items are the top 3.
    top3 = list(result['probabilities'].items())[:3]
    for emotion, prob in top3:
        print(f"  - {emotion}: {prob:.4f} ({prob*100:.1f}%)")
    print()


In [None]:
def analyze_emotion_interactive():
    """Prompt for sentences and print the BERT prediction for each one.

    Reads lines with ``input`` until the user types 'выход' (compared
    case-insensitively). Uses the notebook-level ``model``, ``tokenizer``,
    ``label_map_inverse`` and ``device``.
    """
    print("\nВведите текст для анализа эмоций (для выхода введите 'выход'):")

    user_input = input("> ")
    while user_input.lower() != 'выход':
        result = predict_emotion_bert(user_input, model, tokenizer, label_map_inverse, device)

        print(f"Определенная эмоция: {result['emotion']}")
        print("Топ-3 вероятных эмоций:")
        # The probabilities arrive sorted, so the first three entries are the top 3.
        ranked = list(result['probabilities'].items())
        for emotion, prob in ranked[:3]:
            print(f"  - {emotion}: {prob:.4f} ({prob*100:.1f}%)")
        print()

        user_input = input("> ")

# Start the interactive prompt; this cell blocks until 'выход' is entered.
analyze_emotion_interactive()
