In [1]:
import os
import random
import math
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", DEVICE)

Device: cuda


In [4]:
class CharTokenizer:
    def __init__(self, extra_chars=None):
        self.extra_chars = extra_chars or []
        self.pad_token = '<PAD>'
        self.unk_token = '<UNK>'
        self.pad_id = 0
        self.unk_id = 1
        self.char2id = {self.pad_token: self.pad_id, self.unk_token: self.unk_id}
        self.id2char = {self.pad_id: self.pad_token, self.unk_id: self.unk_token}
        self.next_id = 2

    def build_vocab(self, texts, min_freq=1):
        cnt = Counter()
        for t in texts:
            for ch in t:
                cnt[ch] += 1
        for ch, c in cnt.items():
            if c >= min_freq and ch not in self.char2id:
                self.char2id[ch] = self.next_id
                self.id2char[self.next_id] = ch
                self.next_id += 1
        for ch in self.extra_chars:
            if ch not in self.char2id:
                self.char2id[ch] = self.next_id
                self.id2char[self.next_id] = ch
                self.next_id += 1

    def encode(self, text):
        return [self.char2id.get(ch, self.unk_id) for ch in text]

    def decode(self, ids):
        return ''.join(self.id2char.get(i, '?') for i in ids)

    def __len__(self):
        return len(self.char2id)

In [5]:
def boundaries_from_text_with_spaces(text):
    positions = []
    idx = 0
    for ch in text:
        if ch == ' ':
            positions.append(idx)
        else:
            idx += 1
    return positions

In [6]:
def labels_from_text_without_spaces(orig_with_spaces):
    s = orig_with_spaces
    positions = boundaries_from_text_with_spaces(s)
    compact = s.replace(' ', '')
    n = len(compact)
    labels = [0] * n
    for p in positions:
        if p == 0:
            continue
        labels[p-1] = 1
    return compact, labels

In [7]:
s = 'книга в хорошем состоянии'
compact, labels = labels_from_text_without_spaces(s)
print(s)
print(compact)
print(labels)

книга в хорошем состоянии
книгавхорошемсостоянии
[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [8]:
def synthesize_from_corpus(lines, remove_prob=1.0, keep_word_prob=0.0):
    out = []
    for ln in lines:
        ln = ln.strip()
        if not ln:
            continue
        if random.random() < remove_prob:
            compact = ln.replace(' ', '')
            compact2, labels = labels_from_text_without_spaces(ln)
            out.append((compact2, labels))
        else:
            parts = ln.split()
            if len(parts) == 1:
                compact2, labels = labels_from_text_without_spaces(ln)
                out.append((compact2, labels))
                continue
            new_s = parts[0]
            labels_positions = []
            for i in range(1, len(parts)):
                if random.random() < keep_word_prob:
                    new_s += ' ' + parts[i]
                else:
                    new_s += parts[i]
            compact2, labels = labels_from_text_without_spaces(new_s)
            out.append((compact2, labels))
    return out

In [9]:
class BoundaryDataset(Dataset):
    def __init__(self, samples, tokenizer, max_len=256):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s, labels = self.samples[idx]
        ids = self.tokenizer.encode(s)
        if len(ids) > self.max_len:
            ids = ids[:self.max_len]
            labels = labels[:self.max_len]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(labels, dtype=torch.float)

In [10]:
def collate_fn(batch):
    ids_list, labels_list = zip(*batch)
    lengths = [len(x) for x in ids_list]
    max_len = max(lengths)
    padded_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    padded_labels = torch.zeros(len(batch), max_len, dtype=torch.float)
    mask = torch.zeros(len(batch), max_len, dtype=torch.bool)
    for i, (ids, labs) in enumerate(zip(ids_list, labels_list)):
        L = len(ids)
        padded_ids[i, :L] = ids
        padded_labels[i, :L] = labs
        mask[i, :L] = 1
    return padded_ids, padded_labels, mask

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

In [12]:
class LightweightTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=4, dim_feedforward=256, max_len=256, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_enc = PositionalEncoding(d_model, max_len=max_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.out = nn.Linear(d_model, 1)

    def forward(self, ids, src_key_padding_mask=None):
        x = self.token_emb(ids)
        x = self.pos_enc(x)
        x = self.transformer(x, src_key_padding_mask=src_key_padding_mask)
        logits = self.out(x).squeeze(-1)
        return logits

In [13]:
from sklearn.metrics import precision_score, recall_score

In [14]:
bce_loss_fn = nn.BCEWithLogitsLoss(reduction='none')

In [15]:
def compute_masked_loss(logits, labels, mask, pos_weight=1.0):
    loss = bce_loss_fn(logits, labels)
    if pos_weight != 1.0:
        pos_mask = (labels == 1.0).float()
        loss = loss * (1.0 + (pos_weight - 1.0) * pos_mask)
    loss = loss * mask.float()
    return loss.sum() / (mask.float().sum() + 1e-9)

In [16]:
def positions_from_logits(logits_row, mask_row, threshold=0.5):
    probs = torch.sigmoid(logits_row)
    assert probs.shape == mask_row.shape
    preds = (probs >= threshold) & (mask_row.bool())
    return [int(i) for i, v in enumerate(preds.tolist()) if v]

In [17]:
def f1_batch_from_logits(logits, labels, mask, threshold=0.5):
    batch = logits.size(0)
    f1s = []
    for i in range(batch):
        logits_row = logits[i]
        labels_row = labels[i]
        mask_row = mask[i]
        pred_pos = positions_from_logits(logits_row, mask_row, threshold)
        true_pos = [int(j) for j in range(labels_row.shape[0]) if mask_row[j] and float(labels_row[j]) >= 0.5]
        f1s.append(_f1_single(pred_pos, true_pos))
    return sum(f1s) / len(f1s)

In [18]:
def _f1_single(pred, true):
    if not pred and not true:
        return 1.0
    if not pred:
        return 0.0
    if not true:
        return 0.0
    pred_set = set(pred)
    true_set = set(true)
    tp = len(pred_set & true_set)
    prec = tp / len(pred_set) if len(pred_set) > 0 else 0.0
    rec = tp / len(true_set) if len(true_set) > 0 else 0.0
    if prec + rec == 0:
        return 0.0
    return 2 * (prec * rec) / (prec + rec)

In [19]:
def train_model(model, train_loader, val_loader=None, epochs=5, lr=2e-4, pos_weight=3.0, grad_clip=1.0, model_path='model.pt'):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    best_val = -1.0
    model.to(DEVICE)
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch}")
        for ids, labels, mask in pbar:
            ids = ids.to(DEVICE)
            labels = labels.to(DEVICE)
            mask = mask.to(DEVICE)
            src_key_padding_mask = ~mask  # transformer expects True for pads
            logits = model(ids, src_key_padding_mask=src_key_padding_mask)
            loss = compute_masked_loss(logits, labels, mask, pos_weight=pos_weight)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += float(loss.item())
            pbar.set_postfix(loss=total_loss / (pbar.n + 1))
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch} avg_loss={avg_loss:.4f}")

        if val_loader is not None:
            model.eval()
            with torch.no_grad():
                val_f1 = 0.0
                count = 0
                for ids, labels, mask in val_loader:
                    ids = ids.to(DEVICE)
                    labels = labels.to(DEVICE)
                    mask = mask.to(DEVICE)
                    src_key_padding_mask = ~mask
                    logits = model(ids, src_key_padding_mask=src_key_padding_mask)
                    f1 = f1_batch_from_logits(logits.cpu(), labels.cpu(), mask.cpu(), threshold=0.5)
                    val_f1 += f1
                    count += 1
                val_f1 = val_f1 / count if count > 0 else 0.0
            print(f"Validation F1: {val_f1:.4f}")
            if val_f1 > best_val:
                best_val = val_f1
                torch.save(model.state_dict(), model_path)
                print(f"Saved best model with F1={best_val:.4f} to {model_path}")
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    return model

In [20]:
MAX_LEN = 256
BATCH_SIZE = 32
EMBED_DIM = 128
NHEAD = 4
NUM_LAYERS = 4
FF_DIM = 256
EPOCHS = 1
LR = 2e-4
POS_WEIGHT = 3.0
MODEL_PATH = 'light_transformer.pt'

In [21]:
CORPUS_PATH = 'corpus.txt'
TASK_PATH = 'task_data.txt'
OUT_SUBMISSION = 'submission.csv'

In [23]:
import re, random
from pathlib import Path

TRAIN_CSV = 'train.csv'
OUT_CORPUS = 'corpus.txt'

df = pd.read_csv(TRAIN_CSV, usecols=lambda c: c in ('title','description'), dtype=str)
# Из-за ограничений по времени возьму только малый сабсет выборки, это сильно повлияет на качество модели, но моя цель показать
# идею подхода
df = df[:1000]
df = df.fillna('')

def clean_text(s: str) -> str:
    s = str(s)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = re.sub(r'http\\S+|www\\.\\S+', ' ', s)
    s = re.sub(r'\\s+', ' ', s)
    return s.strip()

combined = (df['title'].fillna('') + ' ' + df['description'].fillna('')).map(clean_text)

def is_russian_line(s: str) -> bool:
    return bool(re.search('[а-яА-ЯёЁ]', s))

keywords = ['продам','куплю','продаю','обмен','новый','б/у','торг','продажа','продается','срочно']

out_lines = []
for s in combined:
    if not isinstance(s, str) or len(s) < 3:
        continue
    if not is_russian_line(s):
        continue
    if len(s) > 800:
        s = s[:800]
    low = s.lower()
    if any(k in low for k in keywords) or (20 <= len(low) <= 200 and random.random() < 0.03):
        out_lines.append(s)

seen = set()
corpus_lines = []
for ln in out_lines:
    if ln in seen:
        continue
    seen.add(ln)
    corpus_lines.append(ln)

with open(OUT_CORPUS, 'w', encoding='utf-8') as f:
    for ln in corpus_lines:
        f.write(ln + '\n')

print("Сохранено", len(corpus_lines), "строк в", OUT_CORPUS)

Сохранено 457 строк в corpus.txt


In [24]:
if Path(CORPUS_PATH).exists():
    with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
        corpus_lines = [ln.strip() for ln in f if ln.strip()]
else:
    corpus_lines = [
        'книга в хорошем состоянии',
        'продам айфон 11 в отличном состоянии',
        'новая кофеварка недорого',
        'велосипед для детей 12 дюймов',
        'работа удаленно программист',
        'куплю часы брендовые',
        'посудомоечная машина б/у',
    ]

In [25]:
train_samples = synthesize_from_corpus(corpus_lines * 200, remove_prob=1.0)  # amplify
random.shuffle(train_samples)

In [26]:
split = int(0.9 * len(train_samples))
train_samples_list = train_samples[:split]
val_samples_list = train_samples[split:]

In [27]:
tokenizer = CharTokenizer()
all_texts = [s for s, _ in train_samples_list] + [s for s, _ in val_samples_list]
all_texts += [ln.replace(' ', '') for ln in corpus_lines]
tokenizer.build_vocab(all_texts, min_freq=1)

In [28]:
train_ds = BoundaryDataset(train_samples_list, tokenizer, max_len=MAX_LEN)
val_ds = BoundaryDataset(val_samples_list, tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [29]:
model = LightweightTransformer(vocab_size=len(tokenizer), d_model=EMBED_DIM, nhead=NHEAD, num_layers=NUM_LAYERS, dim_feedforward=FF_DIM, max_len=MAX_LEN)
print(model)

LightweightTransformer(
  (token_emb): Embedding(174, 128, padding_idx=0)
  (pos_enc): PositionalEncoding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (out): Linear(in_features=128, out_features=1, bias=True)
)


In [30]:
model = train_model(model, train_loader, val_loader=val_loader, epochs=EPOCHS, lr=LR, pos_weight=POS_WEIGHT, model_path=MODEL_PATH)

Epoch 1: 100%|█████████████████████████████████████████████████████████| 6610/6610 [03:00<00:00, 36.60it/s, loss=0.278]
  output = torch._nested_tensor_from_mask(


Epoch 1 avg_loss=0.2780
Validation F1: 0.9886
Saved best model with F1=0.9886 to light_transformer.pt


In [32]:
rows = []
with open("task_data.txt", encoding="utf-8") as f:
    header = next(f).rstrip("\n")

    for lineno, line in enumerate(f, start=2):
        line = line.rstrip("\n")
        if not line:
            continue
        parts = line.split(",", 1)
        if len(parts) == 2:
            rows.append(parts)
        else:
            rows.append([parts[0], ""])

df = pd.DataFrame(rows, columns=["id", "text"])
texts = df['text'].astype(str).tolist()

In [33]:
def infer_positions_for_texts(texts, model, tokenizer, max_len=256, threshold=0.5):
    model.to(DEVICE)
    model.eval()
    preds = []
    with torch.no_grad():
        for t in texts:
            s = t.strip()
            ids = tokenizer.encode(s)
            if len(ids) > max_len:
                ids = ids[:max_len]
            ids_t = torch.tensor([ids], dtype=torch.long).to(DEVICE)
            mask = torch.ones(1, ids_t.size(1), dtype=torch.bool).to(DEVICE)
            src_key_padding_mask = ~mask
            logits = model(ids_t, src_key_padding_mask=src_key_padding_mask)
            logits = logits[0].cpu()
            pos = positions_from_logits(logits, torch.ones_like(logits, dtype=torch.bool), threshold=threshold)
            preds.append(pos)
    return preds

In [34]:
preds = infer_positions_for_texts(texts, model, tokenizer, max_len=MAX_LEN, threshold=0.5)
print('Preds example:', preds[:5])

Preds example: [[4, 6, 9], [2, 5, 10], [5, 6, 11, 28], [4, 12, 13], [4]]


In [36]:
df['predicted_positions'] = [str(p) for p in preds]
df = df.drop('text', axis=1)
OUT = 'submission.csv'
df.to_csv(OUT, index=False)
print('Saved submission to', OUT)

Saved submission to submission2.csv


In [116]:
def decode_spaces(text, spaces):
    spaces = set(spaces)
    _str = []
    for i, c in enumerate(text):
        _str.append(c)
        if i in spaces:
            _str.append(" ")
    res_str = "".join(_str)
    return res_str