# Часть 3. Векторизация текста

**Цель:** преобразовать предобработанные тексты в числовые признаки для последующего обучения моделей.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Folder that holds the train / val / test CSV files read and written by this notebook.
DATA_DIR = Path("../data/processed")

In [3]:
# Locations of the three data splits inside DATA_DIR.
train_path = DATA_DIR / "data_train.csv"
val_path   = DATA_DIR / "data_val.csv"
test_path  = DATA_DIR / "data_test.csv"

In [4]:
# Load each split into its own DataFrame.
train = pd.read_csv(train_path)
val   = pd.read_csv(val_path)
test  = pd.read_csv(test_path)

In [5]:
# Replace missing texts with "" and force str so every row can be split and
# vectorized downstream without NaN/float surprises.
for df in (train, val, test):
    df["processed_text"] = df["processed_text"].fillna("").astype(str)

In [6]:
train.shape, val.shape, test.shape

((2420, 14), (428, 14), (503, 14))

In [7]:
# Raw text arrays shared by every vectorizer below.
X_train_text = train["processed_text"].values
X_val_text   = val["processed_text"].values
X_test_text  = test["processed_text"].values

In [8]:
# Labels are optional: a split without a "target" column yields None.
y_train = train["target"].values if "target" in train.columns else None
y_val   = val["target"].values if "target" in val.columns else None
y_test  = test["target"].values if "target" in test.columns else None

In [9]:
len(X_train_text), len(X_val_text), len(X_test_text)

(2420, 428, 503)

## BoW
BoW формирует вектор признаков из частот слов, без учёта их порядка.
Векторизатор обучаем только на обучающей выборке, затем применяем его к val/test.

In [10]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [11]:
# Bag-of-words vectorizer with document-frequency bounds on the vocabulary.
bow = CountVectorizer(
    min_df=2,          # can be tuned to drop rare words
    max_df=0.95
)

In [12]:
# Fit on train only; val/test are transformed with the already fitted
# vectorizer so no information from them leaks into the vocabulary.
X_train_bow = bow.fit_transform(X_train_text)
X_val_bow   = bow.transform(X_val_text)
X_test_bow  = bow.transform(X_test_text)

In [56]:
len(bow.vocabulary_)

6682

In [13]:
X_train_bow.shape, X_val_bow.shape, X_test_bow.shape

((2420, 6682), (428, 6682), (503, 6682))

Сохранение

In [14]:
# Output folders for the fitted vectorizers and the feature matrices.
VEC_DIR  = Path("../models/vectorizers")
FEAT_DIR = Path("../data/features")

# Create the folders up front: the save calls below write straight into them
# and fail with FileNotFoundError when a parent directory is missing.
VEC_DIR.mkdir(parents=True, exist_ok=True)
FEAT_DIR.mkdir(parents=True, exist_ok=True)

In [15]:
# Persist the fitted BoW vectorizer for later reuse.
joblib.dump(bow, VEC_DIR / "bow.joblib")

['..\\models\\vectorizers\\bow.joblib']

In [16]:
# Store the sparse BoW matrices of every split as .npz files.
sparse.save_npz(FEAT_DIR / "X_train_bow.npz", X_train_bow)
sparse.save_npz(FEAT_DIR / "X_val_bow.npz",   X_val_bow)
sparse.save_npz(FEAT_DIR / "X_test_bow.npz",  X_test_bow)

## TF-IDF

TF-IDF снижает вклад слов, которые встречаются часто, и повышает вклад информативных слов.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF vectorizer over unigrams and bigrams, with the same
# document-frequency bounds as the BoW vectorizer.
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2)  # bigrams often give a boost on text data
)

In [19]:
# Fit on train only, then reuse the fitted vectorizer for val/test.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_val_tfidf   = tfidf.transform(X_val_text)
X_test_tfidf  = tfidf.transform(X_test_text)

In [57]:
len(tfidf.vocabulary_)

13202

In [20]:
X_train_tfidf.shape

(2420, 13202)

Сохранение

In [21]:
# Persist the fitted TF-IDF vectorizer ...
joblib.dump(tfidf, VEC_DIR / "tfidf.joblib")

# ... and the sparse TF-IDF matrices of every split.
sparse.save_npz(FEAT_DIR / "X_train_tfidf.npz", X_train_tfidf)
sparse.save_npz(FEAT_DIR / "X_val_tfidf.npz",   X_val_tfidf)
sparse.save_npz(FEAT_DIR / "X_test_tfidf.npz",  X_test_tfidf)

## Word2Vec

Word2Vec обучается на корпусе текстов и строит вектор для каждого слова.
Для получения вектора текста используется усреднение векторов его слов.

In [24]:
from gensim.models import Word2Vec

In [29]:
# Texts are already preprocessed, so whitespace splitting yields the tokens.
# Only the training split is used as the Word2Vec corpus.
corpus_tokens = [t.split() for t in X_train_text]

# NOTE(review): no `seed` is passed and workers=4, so retraining presumably
# gives slightly different vectors from run to run — confirm whether
# reproducibility matters here.
w2v_model = Word2Vec(
    sentences=corpus_tokens,
    vector_size=300,
    window=5,
    min_count=2,
    workers=4,
    epochs=30,
    sg=1  # skip-gram
)

In [30]:
len(w2v_model.wv.key_to_index), w2v_model.vector_size

(7092, 300)

In [31]:
def w2v(texts, model):
    """Embed each text as the mean of the vectors of its known tokens.

    Args:
        texts: Sequence of whitespace-tokenizable strings.
        model: Object exposing ``vector_size`` and a ``wv`` lookup that
            supports ``token in wv`` and ``wv[token]`` (the trained Word2Vec
            model in this notebook).

    Returns:
        float32 array of shape ``(len(texts), model.vector_size)``. A text
        with no in-vocabulary tokens is represented by the zero vector.
    """
    n_features = model.vector_size
    # Rows start as zeros, so texts without known tokens need no extra work.
    embeddings = np.zeros((len(texts), n_features), dtype=np.float32)

    for row, text in enumerate(texts):
        known = [model.wv[tok] for tok in text.split() if tok in model.wv]
        if not known:
            continue
        embeddings[row] = np.mean(known, axis=0)

    return embeddings

In [33]:
# Averaged Word2Vec features for every split, all from the train-fitted model.
X_train_w2v = w2v(X_train_text, w2v_model)
X_val_w2v   = w2v(X_val_text, w2v_model)
X_test_w2v  = w2v(X_test_text, w2v_model)

X_train_w2v.shape

(2420, 300)

In [53]:
print(w2v_model.wv['tesla'])

[-1.85362082e-02 -3.02317172e-01 -1.22372098e-01  2.46591628e-01
 -2.78923631e-01 -6.26711845e-02  5.60295805e-02  5.07379174e-01
  1.61740825e-01 -3.66258062e-02 -1.16450027e-01 -2.47824714e-01
 -1.67768728e-02  1.70084536e-02 -3.20238292e-01 -7.39222690e-02
 -3.75880390e-01 -1.10278532e-01  3.42577428e-01 -1.58759326e-01
  3.22359465e-02 -4.88416478e-02  1.77635565e-01 -1.32499784e-01
 -8.26681554e-02  2.62693435e-01 -1.45722210e-01  1.14757076e-01
 -3.05073522e-02 -1.51523739e-01  1.39749408e-01 -9.01349783e-02
 -8.47139359e-02 -3.25300172e-02 -8.29679668e-02  1.98650092e-01
 -3.54386419e-02  2.39063948e-01  1.00757487e-01 -2.01146200e-01
 -4.94157746e-02  4.64006662e-02  1.12735324e-01 -1.35847358e-02
 -1.23815618e-01  1.17701907e-02  2.33827047e-02  1.90789416e-01
 -1.76305488e-01  2.03776091e-01 -1.00713842e-01 -2.33924817e-02
 -2.93975770e-01 -1.22901618e-01 -1.12255901e-01 -5.24049699e-02
  5.67219779e-02  3.86181027e-01  1.46009251e-01  1.42199382e-01
 -2.16965050e-01 -1.13056

In [54]:
# nearest words in the embedding space
w2v_model.wv.most_similar('apple')

[('intelligence', 0.6442187428474426),
 ('airtag', 0.5924407839775085),
 ('watch', 0.5889187455177307),
 ('homepod', 0.571516752243042),
 ('линейка', 0.5145259499549866),
 ('fcc', 0.5070827603340149),
 ('подписывать', 0.5065293908119202),
 ('сентябрьский', 0.5042416453361511),
 ('music', 0.5007003545761108),
 ('макроданные', 0.49501731991767883)]

In [55]:
# nearest words in the embedding space
w2v_model.wv.most_similar('смартфон')

[('samsung', 0.49828338623046875),
 ('разъём', 0.4805389642715454),
 ('шифроваться', 0.4763013422489166),
 ('pixel', 0.4714619815349579),
 ('canon', 0.4686709940433502),
 ('складный', 0.46739572286605835),
 ('троттлинг', 0.4616629481315613),
 ('usb', 0.45923134684562683),
 ('фактор', 0.45651426911354065),
 ('зарядка', 0.4533364176750183)]

In [58]:
# cosine similarity
w2v_model.wv.similarity('apple', 'iphone')

np.float32(0.33037722)

In [34]:
# Output folder for the trained embedding model.
EMB_DIR  = Path("../models/embeddings")

# Create it up front: saving the model fails if the folder does not exist.
EMB_DIR.mkdir(parents=True, exist_ok=True)

In [36]:
# Persist the trained Word2Vec model ...
w2v_model.save(str(EMB_DIR / "word2vec.model"))

# ... and the dense averaged-vector features of every split.
np.save(FEAT_DIR / "X_train_w2v.npy", X_train_w2v)
np.save(FEAT_DIR / "X_val_w2v.npy",   X_val_w2v)
np.save(FEAT_DIR / "X_test_w2v.npy",  X_test_w2v)

## BERT

Используем мультиязычную модель, чтобы корректно работать с русским текстом.
Извлекаем вектор текста:
- cls: вектор токена [CLS]
- mean: среднее по токенам последнего слоя (без учёта паддинга)

In [37]:
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [39]:
# Checkpoint name passed to from_pretrained for both the tokenizer and the model.
MODEL_NAME = "bert-base-multilingual-cased"

In [40]:
# Load the pretrained tokenizer and encoder, move the encoder to `device`,
# and switch it to eval mode because it is used for feature extraction only.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
bert = BertModel.from_pretrained(MODEL_NAME).to(device)
bert.eval()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 299.91it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: bert-base-multilingual-cased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.weight                | U

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [41]:
bert.config.hidden_size

768

In [42]:
# def bert_embed_texts(texts, pooling="cls", batch_size=16, max_len=128):
#     all_vecs = []
#     with torch.no_grad():
#         for i in range(0, len(texts), batch_size):
#             batch = list(texts[i:i+batch_size])
#             enc = tokenizer(
#                 batch,
#                 return_tensors="pt",
#                 padding=True,
#                 truncation=True,
#                 max_length=max_len
#             ).to(device)

#             out = bert(**enc).last_hidden_state 

#             if pooling == "cls":
#                 vec = out[:, 0, :]  # [B, H]
#             elif pooling == "mean":
#                 attn = enc["attention_mask"].unsqueeze(-1)  # [B, T, 1]
#                 summed = (out * attn).sum(dim=1)
#                 counts = attn.sum(dim=1).clamp(min=1)
#                 vec = summed / counts
#             else:
#                 raise ValueError("pooling must be 'cls' or 'mean'")

#             all_vecs.append(vec.cpu().numpy())

#     return np.vstack(all_vecs)

# # пример на маленьком куске (можешь убрать ограничение потом)
# X_train_bert = bert_embed_texts(X_train_text[:500], pooling="cls", batch_size=16, max_len=128)
# X_train_bert.shape


In [43]:
def bert_embeddings(texts, batch_size=16, max_len=128, pooling="cls"):
    """Encode texts into fixed-size vectors with the notebook-level BERT model.

    Relies on the module-level ``tokenizer``, ``bert`` and ``device`` objects.

    Args:
        texts: Sequence (list or np.array) of strings.
        batch_size: Number of texts tokenized and run through the model at once.
        max_len: Maximum tokenized length; longer texts are truncated.
        pooling: ``"cls"`` takes the last-layer hidden state at position 0
            (the [CLS] token); ``"mean"`` averages last-layer hidden states
            over non-padding positions only.

    Returns:
        np.ndarray of shape [N, hidden_size]. For empty ``texts`` an empty
        [0, hidden_size] array is returned.

    Raises:
        ValueError: If ``pooling`` is neither ``"cls"`` nor ``"mean"``.
    """
    # Validate first so a typo fails immediately instead of after the first
    # (expensive) forward pass.
    if pooling not in ("cls", "mean"):
        raise ValueError("pooling must be 'cls' or 'mean'")

    # np.vstack raises on an empty list, so handle "no texts" explicitly.
    # assumes the model produces float32 outputs — TODO confirm if weights
    # are ever loaded in another dtype.
    if len(texts) == 0:
        return np.empty((0, bert.config.hidden_size), dtype=np.float32)

    vecs = []

    with torch.no_grad():  # inference only, no gradients needed
        for i in range(0, len(texts), batch_size):
            batch = list(texts[i:i+batch_size])

            enc = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt"
            ).to(device)

            last_hidden = bert(**enc).last_hidden_state  # [B, T, H]

            if pooling == "cls":
                emb = last_hidden[:, 0, :]  # [B, H]
            else:
                # Average over real tokens only: the attention mask zeroes
                # out padding positions before summing.
                mask = enc["attention_mask"].unsqueeze(-1)  # [B, T, 1]
                # clamp avoids division by zero for an all-padding row.
                emb = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

            vecs.append(emb.cpu().numpy())

    return np.vstack(vecs)

In [44]:
# [CLS]-pooled BERT embeddings for every split, with identical batch size
# and truncation length.
X_train_bert = bert_embeddings(X_train_text, pooling="cls", batch_size=16, max_len=128)
X_val_bert   = bert_embeddings(X_val_text,   pooling="cls", batch_size=16, max_len=128)
X_test_bert  = bert_embeddings(X_test_text,  pooling="cls", batch_size=16, max_len=128)

In [45]:
print(X_train_bert.shape, X_val_bert.shape, X_test_bert.shape)

(2420, 768) (428, 768) (503, 768)


In [46]:
# Store the [CLS] embeddings; the "_cls" suffix records the pooling used.
np.save(FEAT_DIR / "X_train_bert_cls.npy", X_train_bert)
np.save(FEAT_DIR / "X_val_bert_cls.npy",   X_val_bert)
np.save(FEAT_DIR / "X_test_bert_cls.npy",  X_test_bert)

## Оценка тональности по тональному словарю 

Дополнительно оцениваем тональность текста поста по словарю.
В качестве интерпретируемого базового признака использован русскоязычный тональный словарь RuSentiLex.

In [47]:
# Location of the RuSentiLex lexicon text file.
LEX_PATH = Path("../data/external/rusentilex.txt")

In [48]:
def load_rusentilex_txt(path):
    """Parse a RuSentiLex text file into positive / negative lemma sets.

    Each data line is split on commas; the first five fields are read as
    word, pos, lemma, sentiment, source. Blank lines, lines starting with
    "!" and lines with fewer than five fields are skipped.

    Args:
        path: Lexicon location, as a ``pathlib.Path`` or a plain string.

    Returns:
        Tuple ``(pos_set, neg_set, df)``: the sets of lower-cased single-word
        lemmas labelled "positive" / "negative", and the DataFrame of all
        parsed single-word entries (any sentiment label).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # Accept string paths too: the checks below use Path-only methods.
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Нет файла: {path.resolve()}")

    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("!"):
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 5:
                continue
            # Any fields past the fifth are ignored.
            rows.append(parts[:5])

    df = pd.DataFrame(rows, columns=["word", "pos", "lemma", "sentiment", "source"])

    df["lemma"] = df["lemma"].astype(str).str.lower().str.strip()
    df["sentiment"] = df["sentiment"].astype(str).str.lower().str.strip()

    # Drop multi-word lemmas: only entries without whitespace are kept.
    df = df[~df["lemma"].str.contains(r"\s+", regex=True)]
    pos_set = set(df.loc[df["sentiment"] == "positive", "lemma"])
    neg_set = set(df.loc[df["sentiment"] == "negative", "lemma"])

    return pos_set, neg_set, df

In [49]:
# Build the positive / negative lemma sets used by the lexicon-based score.
pos_set, neg_set, rusenti_df = load_rusentilex_txt(LEX_PATH)

print("pos:", len(pos_set), "neg:", len(neg_set))
rusenti_df.head()

pos: 2790 neg: 7867


Unnamed: 0,word,pos,lemma,sentiment,source
0,аборт,Noun,аборт,negative,fact
1,абортивный,Adj,абортивный,negative,fact
2,абракадабра,Noun,абракадабра,negative,opinion
3,абсурд,Noun,абсурд,negative,opinion
4,абсурдность,Noun,абсурдность,negative,opinion


In [50]:
def lexicon_score(text, pos_set, neg_set):
    """Return the normalized sentiment balance of a whitespace-tokenized text.

    Args:
        text: Text to score; it is converted with ``str()`` before splitting.
        pos_set: Collection of positive tokens.
        neg_set: Collection of negative tokens.

    Returns:
        (positive hits - negative hits) / token count, or 0.0 when the text
        contains no tokens. A token present in both collections cancels out.
    """
    tokens = str(text).split()
    total = len(tokens)
    if total == 0:
        return 0.0

    balance = 0
    for tok in tokens:
        # +1 for a positive hit, -1 for a negative hit, 0 for neither or both.
        balance += (tok in pos_set) - (tok in neg_set)
    return balance / total

In [51]:
# Add the lexicon-based sentiment score as a new column to every split.
for df in (train, val, test):
    df["sent_score"] = df["processed_text"].apply(lambda s: lexicon_score(s, pos_set, neg_set))

train[["processed_text", "sent_score"]].head(10)

Unnamed: 0,processed_text,sent_score
0,apple новый mac следующий неделя вица президен...,-0.038462
1,имб вайбкодер рабочий промт долгий проект gpt ...,-0.003546
2,мощный генератор видео veo открыть пользовател...,0.050847
3,внезапно приложение sora лидер американский ap...,0.054054
4,valve сразу игровой гаджет консоль vr очки гей...,0.034483
5,apple новый macbook air чип главное раз быстры...,0.0
6,google бюджетный pixel смартфон получить сразу...,-0.027778
7,курсовой минута grok появиться генерация pdf ф...,0.0
8,поездка сапсан новогодний ночь цена распростра...,0.0
9,проблема будущее мужчина попытаться угнать бес...,-0.090909


In [52]:
# Write the splits, now including "sent_score", to new files next to the inputs.
train.to_csv(DATA_DIR / "data_train_with_sent.csv", index=False)
val.to_csv(DATA_DIR / "data_val_with_sent.csv", index=False)
test.to_csv(DATA_DIR / "data_test_with_sent.csv", index=False)