In [3]:
import re
from pymorphy3 import MorphAnalyzer
from sklearn.preprocessing import MinMaxScaler
import json

In [4]:
morph = MorphAnalyzer()

def clean_text(text: str) -> str:
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatize(text: str) -> str:
    return ' '.join([morph.parse(word)[0].normal_form for word in text.split()])

In [5]:
def preprocess_user(user: dict) -> dict:
    combined_text = ' '.join(post['text'] for post in user['posts'] if post['text'])
    cleaned_text = clean_text(combined_text)
    lemmatized_text = lemmatize(cleaned_text)
    
    return {
        # "user_id": user["user_id"],
        "label": user["label"],
        "text": lemmatized_text,
        "meta": {
            "sex": user.get("sex", 0),
            "followers_count": user.get("followers_count", 0),
            "alcohol": user.get("alcohol", 0) or 0,
            "smoking": user.get("smoking", 0) or 0,
            "life_main": user.get("life_main", 0) or 0,
            "people_main": user.get("people_main", 0) or 0,
        }
    }

In [6]:
with open('dataset/data.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

users = [preprocess_user(user) for user in raw_data]
texts = [user['text'] for user in users]
labels = [user['label'] for user in users]
meta_features = [list(user['meta'].values()) for user in users]

In [7]:
scaler = MinMaxScaler()
meta_scaled = scaler.fit_transform(meta_features)

In [52]:
import torch
from torch.utils.data import Dataset
import numpy as np

class DepressionDataset(Dataset):
    def __init__(self, texts, metas, labels, ft_model, max_len=500):
        self.texts = texts
        self.metas = metas
        self.labels = labels
        self.ft_model = ft_model
        self.max_len = max_len
        self.embedding_dim = ft_model.get_dimension()
        self.cache = {}

    def __len__(self):
        return len(self.texts)

    def get_embedding(self, tokens):
        vecs = []
        for token in tokens:
            if token not in self.cache:
                self.cache[token] = self.ft_model.get_word_vector(token)
            vecs.append(self.cache[token])
        
        # padding/truncation
        if len(vecs) > self.max_len:
            vecs = vecs[:self.max_len]
        else:
            vecs += [np.zeros(self.embedding_dim)] * (self.max_len - len(vecs))
        
        return np.array(vecs)


    def __getitem__(self, idx):
        tokens = self.texts[idx].split()
        text_embed = self.get_embedding(tokens)
        meta = np.array(self.metas[idx], dtype=np.float32)
        label = self.labels[idx]
        return {
            'text': torch.tensor(text_embed, dtype=torch.float32),
            'meta': torch.tensor(meta, dtype=torch.float32),
            'label': torch.tensor(label, dtype=torch.float32)
        }

In [53]:
from sklearn.model_selection import train_test_split

texts_train, texts_val, metas_train, metas_val, labels_train, labels_val = train_test_split(
    texts, meta_scaled, labels, test_size=0.2, stratify=labels, random_state=42
)

In [54]:
import fasttext

ft = fasttext.load_model("cc.ru.300.bin")



In [55]:
train_dataset = DepressionDataset(texts_train, metas_train, labels_train, ft_model=ft)
val_dataset = DepressionDataset(texts_val, metas_val, labels_val, ft_model=ft)

In [56]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


ml

In [57]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTMWithAttention(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, meta_dim, output_dim=1, dropout=0.5):
        super(BiLSTMWithAttention, self).__init__()
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention_weights = nn.Linear(hidden_dim * 2, 1)

        self.meta_fc = nn.Sequential(
            nn.Linear(meta_dim, 16),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2 + 16, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, output_dim)
        )

    def attention_net(self, lstm_output):
        attn_scores = self.attention_weights(lstm_output).squeeze(-1)
        attn_weights = F.softmax(attn_scores, dim=1).unsqueeze(-1)
        context = torch.sum(lstm_output * attn_weights, dim=1)
        return context

    def forward(self, text_embeddings, meta_features):
        lstm_out, _ = self.lstm(text_embeddings)
        text_rep = self.attention_net(lstm_out)
        meta_rep = self.meta_fc(meta_features)
        combined = torch.cat((text_rep, meta_rep), dim=1)
        out = self.classifier(combined)
        return torch.sigmoid(out).squeeze(1)


# def train_model(model, dataloader, optimizer, criterion, device):
#     model.train()
#     total_loss = 0
#     for batch in dataloader:
#         text_embeds = batch['text'].to(device)
#         meta = batch['meta'].to(device)
#         labels = batch['label'].to(device)

#         optimizer.zero_grad()
#         outputs = model(text_embeds, meta)
#         loss = criterion(outputs, labels.float())
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(dataloader)


# def evaluate_model(model, dataloader, criterion, device):
#     model.eval()
#     total_loss = 0
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for batch in dataloader:
#             text_embeds = batch['text'].to(device)
#             meta = batch['meta'].to(device)
#             labels = batch['label'].to(device)

#             outputs = model(text_embeds, meta)
#             loss = criterion(outputs, labels.float())
#             total_loss += loss.item()

#             preds = (outputs > 0.5).long()
#             correct += (preds == labels).sum().item()
#             total += labels.size(0)
#     return total_loss / len(dataloader), correct / total


In [58]:
# Параметры
embedding_dim = 300  # размерность FastText
hidden_dim = 128
meta_dim = 6  # количество мета признаков
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTMWithAttention(embedding_dim, hidden_dim, meta_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# Например: 900 положительных из 85914
num_pos = 900
num_neg = 85000
pos_weight = torch.tensor([num_neg / num_pos], dtype=torch.float32).to(device)

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        text = batch['text'].to(device)
        meta = batch['meta'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(text, meta)  # выход до сигмоиды!
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            text = batch['text'].to(device)
            meta = batch['meta'].to(device)
            labels = batch['label'].to(device)

            logits = model(text, meta)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = (torch.sigmoid(logits) > 0.5).long()
            correct += (preds == labels.long()).sum().item()
            total += labels.size(0)
    return total_loss / len(dataloader), correct / total


In [59]:
for epoch in range(10):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Epoch 1: Train Loss=1.3729, Val Loss=1.3716, Val Acc=0.9887
Epoch 2: Train Loss=1.3710, Val Loss=1.3716, Val Acc=0.9833
Epoch 3: Train Loss=1.3717, Val Loss=1.3716, Val Acc=0.9858
Epoch 4: Train Loss=1.3716, Val Loss=1.3716, Val Acc=0.9894
Epoch 5: Train Loss=1.3716, Val Loss=1.3716, Val Acc=0.9895


KeyboardInterrupt: 