In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the labelled corpus from 'mainData.csv' in the current working directory.
# NOTE(review): the rendered preview has an 'Unnamed: 0' column, which looks like
# a saved index; only 'text' and 'emotion' are read later in this notebook.
data = pd.read_csv('mainData.csv')
# Bare last expression: the notebook renders a preview of the frame.
data

Unnamed: 0,text,emotion
0,[RU] Эта игра причинила мне боль.,['sadness']
1,"[RU] Ты правильно поступаешь, если тебе все ра...",['neutral']
2,"[RU] Чувак, я обожаю Reddit.",['love']
3,"[RU] [ИМЯ] не было рядом с ними, он был рядом ...",['neutral']
4,"[RU] Верно? Учитывая, что это такой важный док...",['gratitude']
...,...,...
479720,"[KZ] ""Сен онымен на хабаре болсаң, менің өтіні...","['desire', 'caring', 'nervousness']"
479721,"[KZ] ""Оның ""танкасы бар"" болса да, заң алдында...","['approval', 'determination', 'anger']"
479722,"[KZ] ""Ол шындықты айтып тұр ма, или өтірік пе,...","['curiosity', 'nervousness', 'confusion']"
479723,"[KZ] Университетте оқып жүргенде, біз общягада...","['nostalgia', 'caring', 'joy']"


In [3]:
def clean_individual_emotion_token(token_str):
    """Normalize one raw emotion token to a bare lowercase label.

    Args:
        token_str: A fragment of a stringified label list, e.g. ``"['joy'"``.

    Returns:
        The token with every square bracket, single quote and double quote
        removed, surrounding whitespace stripped, and lowercased. Anything
        that is not a ``str`` yields ``""``.
    """
    if not isinstance(token_str, str):
        return ""

    # One deletion table removes all list/quote punctuation in a single pass.
    list_punctuation = str.maketrans("", "", "[]'\"")
    bare_label = token_str.translate(list_punctuation)

    return bare_label.strip().lower()

In [4]:
def robust_emotion_processor_updated(entry):
    """Turn one raw ``emotion`` cell into a de-duplicated list of labels.

    Args:
        entry: Either a stringified list such as ``"['joy', 'caring']"``,
            an actual list/tuple of tokens, or a missing value.

    Returns:
        A list of cleaned, lowercase labels in first-seen order with
        duplicates and empty tokens removed. Missing values and any other
        unsupported type give ``[]``.
    """
    # Dispatch on type first. Calling pd.isna() on a list returns an array,
    # and using that array in an `if` raises ValueError for lists with more
    # than one element, so sequences must never reach a scalar NaN check.
    if isinstance(entry, (list, tuple)):
        raw_tokens = entry
    elif isinstance(entry, str):
        raw_tokens = entry.split(',')
    else:
        # None, NaN, pd.NA and every other scalar carry no labels.
        return []

    cleaned_tokens = (clean_individual_emotion_token(token) for token in raw_tokens)

    # Drop empty tokens on both paths (the list path used to keep them, which
    # could register "" as a class), then de-duplicate preserving order:
    # dict keys keep insertion order.
    return list(dict.fromkeys(token for token in cleaned_tokens if token))

In [5]:
# Parse every raw 'emotion' cell into a list of label strings. This adds a
# column to the global `data` frame in place.
data['emotion_list_processed'] = data['emotion'].apply(robust_emotion_processor_updated)

# Fit a binarizer on the whole corpus only to discover the label space; the
# resulting `num_labels` sizes the classification head built later.
mlb_for_model_config = MultiLabelBinarizer()
mlb_for_model_config.fit(data['emotion_list_processed'])
num_labels = len(mlb_for_model_config.classes_)
print(f"Number of unique labels (num_labels): {num_labels}")
# NOTE(review): the printed classes contain both 'approv' and 'approval' --
# looks like a truncated label in the raw data; confirm and merge upstream.
print(f"Classes: {mlb_for_model_config.classes_}")

# Register the language tags as special tokens so each tag is kept as a single
# token instead of being split into word pieces.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
special_tokens_dict = {'additional_special_tokens': ['[KZ]', '[RU]', '[EN]']}
# Last expression of the cell: its return value is what the notebook displays.
tokenizer.add_special_tokens(special_tokens_dict)

Number of unique labels (num_labels): 55
Classes: ['admiration' 'amusement' 'anger' 'annoyance' 'anticipation' 'anxiety'
 'approv' 'approval' 'caring' 'caution' 'concern' 'confidence' 'confusion'
 'contentment' 'creativity' 'curiosity' 'deceit' 'desire' 'determination'
 'disappointment' 'disapproval' 'disbelief' 'discomfort' 'disgust'
 'embarrassment' 'envy' 'excitement' 'fear' 'frustration' 'gratitude'
 'grief' 'hope' 'joy' 'love' 'nervousness' 'neutral' 'nostalgia'
 'optimism' 'panic' 'passion' 'pride' 'realization' 'reassurance' 'regret'
 'relief' 'remorse' 'sadness' 'satisfaction' 'shame' 'shock' 'surprise'
 'suspicion' 'tradition' 'trust' 'urgency']


3

In [6]:
def clean_text(text):
    """Normalize one raw sample while keeping its leading language tag.

    The text is lowercased, URL-like tokens (anything starting with ``http``
    or ``www``) are removed, and whitespace runs are collapsed. A leading
    ``[KZ]``, ``[RU]`` or ``[EN]`` tag is kept verbatim (upper case) at the
    front of the result.

    Args:
        text: The raw sample; non-string values are stringified.

    Returns:
        ``"<TAG> <cleaned text>"`` when a leading tag was found, otherwise
        just the cleaned text. Missing values (``None`` / float NaN) give
        ``""`` instead of the literal strings ``"None"`` / ``"nan"``.
    """
    # `text != text` is true only for NaN; avoids feeding "nan" to the model.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Tolerate whitespace before the tag so " [KZ] ..." is still recognised.
    match = re.match(r"\s*\[(KZ|RU|EN)\]", text)
    if match:
        lang_tag = f"[{match.group(1)}]"
        # Cut off only the leading tag. str.replace() would also delete any
        # later occurrence of the same tag inside the sentence itself.
        text_wo_tag = text[match.end():]
    else:
        lang_tag = ""
        text_wo_tag = text

    # NOTE(review): lowercasing feeds a *cased* checkpoint
    # ('bert-base-multilingual-cased' is loaded elsewhere in this notebook) --
    # confirm this is intended.
    text_wo_tag = text_wo_tag.lower()
    # "http\S+" already covers https URLs, so no separate alternative is needed.
    text_wo_tag = re.sub(r"http\S+|www\S+", '', text_wo_tag)
    text_wo_tag = re.sub(r"\s+", " ", text_wo_tag).strip()

    return f"{lang_tag} {text_wo_tag}" if lang_tag else text_wo_tag

def get_language_weight(text):
    """Return the per-sample loss weight for a cleaned text.

    Texts that start with the ``[KZ]`` tag get weight 2.0; everything else
    gets 1.0.
    """
    is_kazakh = text.startswith('[KZ]')
    return 2.0 if is_kazakh else 1.0

def preprocess_multilingual_multilabel_cleaned(data):
    """Build model inputs, multi-hot targets and sample weights from a frame.

    Args:
        data: DataFrame with a 'text' column and either an
            'emotion_list_processed' column (lists of labels) or a raw
            'emotion' column to derive it from. The frame is modified in
            place: 'cleaned_text_internal', 'weights_internal' and, if
            missing, 'emotion_list_processed' are added. Pass a copy to keep
            the original untouched.

    Returns:
        A 4-tuple ``(encodings, labels, internal_mlb, weights_tensor)``:
        the tokenizer output for the cleaned texts (PyTorch tensors, truncated
        to at most 128 tokens, no token type ids), the float multi-hot label
        tensor, the ``MultiLabelBinarizer`` fitted on this frame, and the
        float tensor of per-row weights from ``get_language_weight``.
    """
    data['cleaned_text_internal'] = data['text'].apply(clean_text)
    data['weights_internal'] = data['cleaned_text_internal'].apply(get_language_weight)
    weights_tensor = torch.tensor(data['weights_internal'].values, dtype=torch.float)

    if 'emotion_list_processed' not in data.columns:
        print("Warning: 'emotion_list_processed' column not found in input to preprocess_multilingual_multilabel_cleaned. Creating it now.")
        # The fallback previously called `robust_emotion_processor_lambda`,
        # which is not defined anywhere in this notebook (NameError); use the
        # same processor that builds this column at notebook level.
        data['emotion_list_processed'] = data['emotion'].apply(robust_emotion_processor_updated)

    # NOTE(review): the binarizer is re-fitted on whatever frame is passed in,
    # so calling this on a subset can yield fewer classes than the `num_labels`
    # the model head was sized with -- confirm callers always pass the full data.
    internal_mlb = MultiLabelBinarizer()
    y_transformed = internal_mlb.fit_transform(data['emotion_list_processed'])

    # Uses the notebook-level `tokenizer`; padding=True pads every row to the
    # longest sequence in this call, capped by max_length.
    encodings = tokenizer(
        data['cleaned_text_internal'].tolist(),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
        return_token_type_ids=False
    )

    return encodings, torch.tensor(y_transformed, dtype=torch.float), internal_mlb, weights_tensor

In [7]:
# Scratch check: run clean_text over every row to eyeball the normalized texts.
la = data['text'].apply(clean_text)

In [8]:
# Display the cleaned-text Series produced by the scratch check.
la

0                         [RU] эта игра причинила мне боль.
1         [RU] ты правильно поступаешь, если тебе все ра...
2                              [RU] чувак, я обожаю reddit.
3         [RU] [имя] не было рядом с ними, он был рядом ...
4         [RU] верно? учитывая, что это такой важный док...
                                ...                        
479720    [KZ] "сен онымен на хабаре болсаң, менің өтіні...
479721    [KZ] "оның "танкасы бар" болса да, заң алдында...
479722    [KZ] "ол шындықты айтып тұр ма, или өтірік пе,...
479723    [KZ] университетте оқып жүргенде, біз общягада...
479724    [KZ] "қораға кіргеніңді түсінген кезде кеш бол...
Name: text, Length: 479725, dtype: object

In [9]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from transformers import AutoModelForSequenceClassification


In [10]:
class MultilingualEmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer outputs with labels and weights.

    Each item is a dict holding the ``idx``-th slice of every entry in
    ``encodings`` plus a ``'labels'`` entry and a ``'weight'`` entry.
    """

    def __init__(self, encodings, labels, weights):
        # All three are stored as given and are indexed with the same idx.
        self.encodings, self.labels, self.weights = encodings, labels, weights

    def __len__(self):
        # The number of samples is defined by the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample.update(labels=self.labels[idx], weight=self.weights[idx])
        return sample


In [11]:
# Build the classifier on the multilingual BERT checkpoint with one output per
# label discovered earlier (`num_labels`), configured for multi-label targets.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
                                                           num_labels=num_labels,
                                                           problem_type="multi_label_classification")
# The tokenizer gained the language-tag special tokens, so the embedding
# matrix must be resized to the new vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# Pass a copy because the preprocessing function adds columns to its input.
# `labels` and `weights` here are the full-dataset tensors that the split cell
# indexes into.
encodings, labels, mlb_returned, weights = preprocess_multilingual_multilabel_cleaned(data.copy())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [12]:
# Show the label order of the binarizer returned by preprocessing; the columns
# of the label tensor follow this order.
print(mlb_returned.classes_)

['admiration' 'amusement' 'anger' 'annoyance' 'anticipation' 'anxiety'
 'approv' 'approval' 'caring' 'caution' 'concern' 'confidence' 'confusion'
 'contentment' 'creativity' 'curiosity' 'deceit' 'desire' 'determination'
 'disappointment' 'disapproval' 'disbelief' 'discomfort' 'disgust'
 'embarrassment' 'envy' 'excitement' 'fear' 'frustration' 'gratitude'
 'grief' 'hope' 'joy' 'love' 'nervousness' 'neutral' 'nostalgia'
 'optimism' 'panic' 'passion' 'pride' 'realization' 'reassurance' 'regret'
 'relief' 'remorse' 'sadness' 'satisfaction' 'shame' 'shock' 'surprise'
 'suspicion' 'tradition' 'trust' 'urgency']


In [11]:
# Split row positions (not the tensors themselves) so that encodings, labels
# and weights are all partitioned by exactly the same 90/10 assignment.
indices = [*range(len(labels))]
train_idx, val_idx = train_test_split(indices, random_state=42, test_size=0.1)

# Slice every tokenizer output tensor with the position lists.
train_encodings = {name: tensor[train_idx] for name, tensor in encodings.items()}
val_encodings = {name: tensor[val_idx] for name, tensor in encodings.items()}

train_labels, val_labels = labels[train_idx], labels[val_idx]

train_weights, val_weights = weights[train_idx], weights[val_idx]

In [12]:
# Wrap each partition so a batch carries inputs, labels and per-sample weights.
train_dataset = MultilingualEmotionDataset(train_encodings, train_labels, train_weights)
val_dataset = MultilingualEmotionDataset(val_encodings, val_labels, val_weights)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [None]:
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss

optimizer = AdamW(model.parameters(), lr=2e-5)
# reduction='none' keeps one loss value per (sample, label) so the training
# loop can apply the per-sample weights before averaging.
criterion = BCEWithLogitsLoss(reduction='none')

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Number of full passes over train_loader.
epochs = 10

In [None]:
# Fine-tune the classifier with a language-weighted binary cross-entropy.
# The batch tensors are deliberately named `batch_labels` / `batch_weights`:
# the notebook-global `labels` and `weights` hold the full-dataset tensors that
# the train/validation split indexes, and rebinding them here would make a
# later re-run of the split cell silently operate on a single batch.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device).float()
        batch_weights = batch['weight'].to(device).float()

        # No labels are passed to the model, so it returns logits only and the
        # loss is computed below with the externally defined criterion.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # criterion uses reduction='none': average over the label axis to get
        # one loss per sample, scale by that sample's weight, then average
        # over the batch.
        raw_loss = criterion(logits, batch_labels)
        weighted_loss = (raw_loss.mean(dim=1) * batch_weights).mean()

        weighted_loss.backward()
        optimizer.step()

        total_loss += weighted_loss.item()

        if (i + 1) % 1000 == 0:
            print(f"Epoch {epoch+1}, Batch {i+1} - Loss: {weighted_loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Train loss: {avg_train_loss:.4f}")


In [None]:
from sklearn.metrics import f1_score

# Score the model on the validation loader with micro-averaged F1 at a fixed
# 0.5 probability threshold.
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the notebook-global `labels` (the
        # full-dataset tensor used by the split cell) is not overwritten.
        # The targets are only collected for scoring, so they are not moved
        # to the device.
        batch_labels = batch['labels'].float()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Sigmoid gives an independent probability per label (multi-label).
        preds = torch.sigmoid(logits).cpu().numpy()
        target = batch_labels.cpu().numpy()

        all_preds.extend(preds)
        all_targets.extend(target)

pred_labels = (np.array(all_preds) >= 0.5).astype(int)
# Stack the per-row targets into one integer indicator matrix so both
# arguments to f1_score have the same type and shape.
true_labels = np.array(all_targets).astype(int)
f1 = f1_score(true_labels, pred_labels, average='micro')
print(f"Validation Micro F1: {f1:.4f}")


In [None]:
import joblib
import os

# Write the fine-tuned model and the tokenizer (including the added language
# tags) into the same 'mbert' directory.
model.save_pretrained('mbert')
# Last expression of the cell: its return value is displayed by the notebook.
tokenizer.save_pretrained('mbert')


In [None]:

# Same directory name that the model and tokenizer were saved to.
model_directory = 'mbert'
mlb_filename = 'mlb.joblib'
mlb_path = os.path.join(model_directory, mlb_filename)

# Store the binarizer returned by preprocessing alongside the model files; its
# classes_ order defines which output column corresponds to which emotion.
joblib.dump(mlb_returned, mlb_path)