In [1]:
# Install dependencies
!pip install pandas openpyxl sentence-transformers scikit-learn tqdm



In [2]:
# Paste the entire chunker.py code here
import argparse
import os
import re
import hashlib
import json
import logging
from tqdm import tqdm
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# --- Logging setup ---
def setup_logger(output_dir: str):
    """Configure root logging to a file plus the console and return the chunker logger.

    Args:
        output_dir: Directory that receives ``chunker.log``. An empty string
            means the current working directory. The directory is created if
            it does not exist yet.

    Returns:
        The ``logging.Logger`` named ``"chunker"``.
    """
    if output_dir:
        # FileHandler cannot create missing parent directories on its own.
        os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "chunker.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            # mode="w" truncates the log from any previous run.
            logging.FileHandler(log_path, mode="w", encoding="utf-8"),
            logging.StreamHandler(),
        ],
        # Without force=True, basicConfig does nothing when the root logger
        # already has handlers (e.g. installed by the host environment or by
        # an earlier call), and the handlers above would never be attached.
        force=True,
    )
    return logging.getLogger("chunker")


# --- Load NOISE_PATTERNS from a config file (in the same folder) ---
def load_noise_patterns(config_path: str = "noise_patterns.json"):
    """Return the list of regex noise patterns, preferring a JSON config file.

    The config is expected to be a JSON object whose ``"noise_patterns"`` key
    holds a list of regex strings. The built-in defaults are returned when the
    file is missing, unreadable, not valid JSON, not an object, lacks the key,
    or when the value is not a list of compilable regex strings.

    Args:
        config_path: Path to the JSON config file.

    Returns:
        A list of regex pattern strings.
    """
    default_patterns = [
        r'\?oirutpspid=[^&\s]*',
        r'\?oirutpspsc=[^&\s]*',
        r'\?oirutpspjs=[^&\s]*',
        r'#\S*',
        r'tel\.', r'Tel\.', r'—Ç–µ–ª\.', r'–¢–µ–ª\.', r'—Ç–µ–ª:', r'–¢–µ–ª:',
        r'¬©\s*–ê–ª—å—Ñ–∞-–ë–∞–Ω–∫',
        r'–ü–æ–ª—å–∑—É—è—Å—å —Å–∞–π—Ç–æ–º',
        r'—Å–æ–≥–ª–∞—à–∞–µ—Ç–µ—Å—å —Å –ø–æ–ª–∏—Ç–∏–∫–æ–π –∫–æ–Ω—Ñ–∏–¥–µ–Ω—Ü–∏–∞–ª—å–Ω–æ—Å—Ç–∏',
        r'–ö–æ–Ω—Ç–∞–∫—Ç—ã', r'–ü–æ–¥–¥–µ—Ä–∂–∫–∞', r'–ö–∞—Ä—Ç–∞ —Å–∞–π—Ç–∞',
        r'–ú–æ—Å–∫–≤–∞', r'–†–æ—Å—Å–∏—è',
        r'199\d‚Äì202\d',
        r'https?://[^\s]+\.(?:png|jpg|jpeg|gif|pdf)',
        r'\{[^{}]*\}',  # JSON fragments
    ]

    if not os.path.exists(config_path):
        print(f"‚ö†Ô∏è –ö–æ–Ω—Ñ–∏–≥ {config_path} –Ω–µ –Ω–∞–π–¥–µ–Ω, –∏—Å–ø–æ–ª—å–∑—É–µ–º —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–µ –ø–∞—Ç—Ç–µ—Ä–Ω—ã")
        return default_patterns

    def _fallback():
        # Single place for the "config unusable" warning and default return.
        print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –∫–æ–Ω—Ñ–∏–≥–∞ {config_path}, –∏—Å–ø–æ–ª—å–∑—É–µ–º —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã–µ –ø–∞—Ç—Ç–µ—Ä–Ω—ã")
        return default_patterns

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return _fallback()

    if not isinstance(config, dict):
        return _fallback()

    patterns = config.get("noise_patterns", default_patterns)
    if not isinstance(patterns, list) or not all(isinstance(p, str) for p in patterns):
        # A bare string here would later be iterated character by character.
        return _fallback()

    try:
        # Fail here rather than on the first document that gets cleaned.
        for pattern in patterns:
            re.compile(pattern)
    except re.error:
        return _fallback()

    return patterns


# Noise regexes used by clean_text(); resolved once at import time, so this
# line reads noise_patterns.json from the current working directory (if it
# exists) as a side effect of importing/running the module.
NOISE_PATTERNS = load_noise_patterns()


def clean_text(s: str) -> str:
    """Strip noise patterns and HTML markup from ``s`` and normalise whitespace.

    Non-string input yields an empty string. Processing order: every regex in
    ``NOISE_PATTERNS`` is removed (case-insensitively), then ``<script>`` and
    ``<style>`` blocks and any remaining tags are replaced by a space, runs of
    whitespace are collapsed, and repeated punctuation (``.,;!?``) is reduced
    to a single character.
    """
    if not isinstance(s, str):
        return ""

    cleaned = s
    # NOTE(review): noise patterns run before tag stripping, so a pattern can
    # consume part of a tag (e.g. inside an attribute) — confirm this order is
    # intended.
    for noise in NOISE_PATTERNS:
        cleaned = re.sub(noise, '', cleaned, flags=re.IGNORECASE)

    block_flags = re.IGNORECASE | re.DOTALL
    substitutions = (
        (r'<script.*?</script>', ' ', block_flags),  # inline scripts
        (r'<style.*?</style>', ' ', block_flags),    # inline stylesheets
        (r'<[^>]+>', ' ', 0),                        # any remaining tag
        (r'\s+', ' ', 0),                            # collapse whitespace
        (r'([.,;!?])\1+', r'\1', 0),                 # "!!!" -> "!"
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()


def semantic_chunk_by_similarity(text: str, model, max_chunk_len: int = 400, threshold: float = 0.5,
                                 min_chunk_len: int = 100):
    """
    Split ``text`` into chunks of semantically related sentences.

    The text is split into sentences on blank lines and on a period followed
    by a space (the period stays with its sentence). Each following sentence
    is compared, by cosine similarity of embeddings, with the *first* sentence
    of the chunk currently being built. It is appended when the similarity is
    above ``threshold`` and the combined length stays below ``max_chunk_len``;
    otherwise the chunk is closed and the sentence starts a new one.

    Args:
        text: Cleaned input text.
        model: Object with an ``encode(list_of_str)`` method returning one
            embedding vector per sentence (an array-like of shape
            ``(n_sentences, dim)``).
        max_chunk_len: Upper bound (in characters) used when deciding whether
            to merge a sentence into the current chunk. A single sentence
            longer than this is kept whole, not split.
        threshold: Minimum cosine similarity required to merge.
        min_chunk_len: Chunks shorter than this many characters are dropped.

    Returns:
        A list of chunk strings, in document order.
    """
    # Local import keeps the file's top-level import block untouched.
    import numpy as np

    # The lookbehind splits on the space *after* a period, so sentences keep
    # their terminating punctuation.
    sentences = [s.strip() for s in re.split(r'\n\s*\n|(?<=\.) ', text) if s.strip()]
    if not sentences:
        return []

    if len(sentences) == 1:
        # Nothing to compare against: the lone sentence is the only candidate.
        chunks = sentences
    else:
        embeddings = np.asarray(model.encode(sentences), dtype=float)
        # Normalise once so each cosine similarity is a plain dot product.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # zero vectors get similarity 0, not NaN
        unit = embeddings / norms

        chunks = []
        current_chunk = sentences[0]
        anchor = unit[0]  # embedding of the current chunk's first sentence
        for sentence, vector in zip(sentences[1:], unit[1:]):
            similarity = float(anchor @ vector)
            if similarity > threshold and len(current_chunk) + len(sentence) < max_chunk_len:
                current_chunk += " " + sentence
            else:
                chunks.append(current_chunk)
                current_chunk = sentence
                anchor = vector
        chunks.append(current_chunk)

    return [c for c in chunks if len(c) >= min_chunk_len]


def hash_text(text: str) -> str:
    """Return the hex MD5 digest of ``text``, used as a deduplication key.

    The text is encoded as UTF-8; characters that cannot be encoded are
    silently dropped before hashing.
    """
    payload = text.encode('utf-8', errors='ignore')
    digest = hashlib.md5()
    digest.update(payload)
    return digest.hexdigest()


def main(args):
    """Read documents, clean and chunk their text, and write CSV + JSONL outputs.

    Args:
        args: Object with the attributes
            ``input`` (path to a ``.csv`` or ``.xlsx`` file with the columns
            ``web_id``, ``url``, ``kind``, ``title``, ``text``),
            ``output`` (path of the CSV to write; the JSONL is written next to
            it with the same base name),
            ``model_name``, ``max_chunk_len`` and ``threshold``.
            Optional attributes: ``model`` (an already loaded embedding model
            to reuse) and ``device`` (device passed to ``SentenceTransformer``
            when the model is loaded here; ``None`` lets the library choose).

    Raises:
        ImportError: ``openpyxl`` is missing when reading ``.xlsx``.
        ValueError: The input file is neither ``.csv`` nor ``.xlsx``.
        SystemExit: Required columns are missing from the input.
    """
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    logger = setup_logger(output_dir)

    logger.info(f"–ß—Ç–µ–Ω–∏–µ {args.input}...")

    # Pick the reader from the file extension.
    input_lower = args.input.lower()
    if input_lower.endswith('.xlsx'):
        try:
            df = pd.read_excel(args.input, engine="openpyxl")
        except ImportError as err:
            raise ImportError("–î–ª—è —á—Ç–µ–Ω–∏—è .xlsx –Ω—É–∂–µ–Ω –ø–∞–∫–µ—Ç 'openpyxl'. –£—Å—Ç–∞–Ω–æ–≤–∏—Ç–µ –µ–≥–æ: pip install openpyxl") from err
    elif input_lower.endswith('.csv'):
        df = pd.read_csv(args.input)
    else:
        raise ValueError(f"–§–∞–π–ª {args.input} –Ω–µ —è–≤–ª—è–µ—Ç—Å—è .csv –∏–ª–∏ .xlsx")

    required_cols = {"web_id", "url", "kind", "title", "text"}
    if not required_cols.issubset(set(df.columns)):
        raise SystemExit(f"–û–∂–∏–¥–∞–ª–∏—Å—å –∫–æ–ª–æ–Ω–∫–∏: {required_cols}. –ï—Å—Ç—å: {set(df.columns)}")

    logger.info(f"–û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º {len(df)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")

    # Reuse a caller-supplied model when present; otherwise load it once here.
    # The device is no longer hard-coded to 'cuda', so CPU-only hosts work.
    model = getattr(args, "model", None)
    if model is None:
        logger.info(f"–ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ {args.model_name}...")
        model = SentenceTransformer(args.model_name, device=getattr(args, "device", None))

    rows_csv = []
    rows_jsonl = []

    web_ids = set()
    seen_chunk_ids = set()

    for _, r in tqdm(df.iterrows(), total=len(df), desc="–ß–∞–Ω–∫–∏–Ω–≥"):
        # Validate the row before doing any expensive work.
        web_id_raw = r.get("web_id")
        if pd.isna(web_id_raw):
            logger.warning(f"–ü—Ä–æ–ø—É—Å–∫ —Å—Ç—Ä–æ–∫–∏: web_id –ø—É—Å—Ç–æ–π: {r.name}")
            continue
        try:
            web_id = int(web_id_raw)
        except (ValueError, TypeError):
            logger.warning(f"–ü—Ä–æ–ø—É—Å–∫ —Å—Ç—Ä–æ–∫–∏: web_id –Ω–µ –∫–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ—Ç—Å—è –≤ int: {web_id_raw}")
            continue

        title = r.get("title")
        url = r.get("url")
        kind = r.get("kind")
        raw_text = r.get("text")

        # url is optional; title, kind and text are mandatory.
        if pd.isna(title) or pd.isna(kind) or pd.isna(raw_text):
            logger.warning(f"–ü—Ä–æ–ø—É—Å–∫ —Å—Ç—Ä–æ–∫–∏: –æ–¥–∏–Ω –∏–∑ –æ–±—è–∑–∞—Ç–µ–ª—å–Ω—ã—Ö –ø–æ–ª–µ–π –ø—É—Å—Ç: {r.name}")
            continue

        title = str(title).strip()
        url = str(url).strip() if pd.notna(url) else ""
        kind = str(kind).strip()
        raw_text = str(raw_text).strip()

        if len(raw_text) < 100:
            logger.info(f"–ü—Ä–æ–ø—É—Å–∫ —Å—Ç—Ä–æ–∫–∏: —Ç–µ–∫—Å—Ç —Å–ª–∏—à–∫–æ–º –∫–æ—Ä–æ—Ç–∫–∏–π: {r.name}")
            continue

        # Clean, then chunk.
        clean_full_text = clean_text(raw_text)
        chunks = semantic_chunk_by_similarity(
            clean_full_text,
            model,
            max_chunk_len=args.max_chunk_len,
            threshold=args.threshold
        )

        for i, chunk_text in enumerate(chunks):
            chunk_id = f"{web_id}__{i}"

            # A repeated web_id in the input would regenerate the same ids.
            if chunk_id in seen_chunk_ids:
                logger.warning(f"–ü—Ä–æ–ø—É—Å–∫ –¥—É–±–ª–∏–∫–∞—Ç–∞ chunk_id: {chunk_id}")
                continue
            seen_chunk_ids.add(chunk_id)

            # CSV row: "text" holds only the chunk text, without kind/title.
            rows_csv.append({
                "web_id": web_id,
                "chunk_id": chunk_id,
                "title": title,
                "url": url,
                "kind": kind,
                "text": chunk_text
            })
            # JSONL row: same content, different key order.
            rows_jsonl.append({
                "chunk_id": chunk_id,
                "text": chunk_text,
                "url": url,
                "title": title,
                "web_id": web_id,
                "kind": kind
            })

        # Counted even when the document produced no chunks.
        web_ids.add(web_id)

    # Deduplicate by chunk text, keeping the first occurrence.
    seen_hashes = set()
    unique_csv = []
    unique_jsonl = []
    for csv_row, jsonl_row in zip(rows_csv, rows_jsonl):
        h = hash_text(csv_row["text"])
        if h in seen_hashes:
            continue
        seen_hashes.add(h)
        unique_csv.append(csv_row)
        unique_jsonl.append(jsonl_row)

    logger.info(f"–£–¥–∞–ª–µ–Ω–æ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤: {len(rows_csv) - len(unique_csv)}")

    # --- STATISTICS ---
    chunks_count = len(unique_csv)
    total_len = sum(len(row["text"]) for row in unique_csv)
    avg_len = total_len / chunks_count if chunks_count > 0 else 0
    uniq_web_ids = len(web_ids)

    logger.info(f"üìä –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:")
    logger.info(f"   - –ß–∞–Ω–∫–æ–≤: {chunks_count}")
    logger.info(f"   - –°—Ä–µ–¥–Ω—è—è –¥–ª–∏–Ω–∞: {avg_len:.2f}")
    logger.info(f"   - –£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö web_id: {uniq_web_ids}")

    # --- SAVE CSV ---
    out_df = pd.DataFrame(unique_csv, columns=["web_id", "chunk_id", "title", "url", "kind", "text"])
    out_df.to_csv(args.output, index=False)
    logger.info(f"chunks.csv: {len(out_df)} —á–∞–Ω–∫–æ–≤")

    # --- APPEND STATISTICS TO THE END OF THE CSV ---
    # NOTE(review): these "#key,value" lines are not CSV comments for a plain
    # pd.read_csv(); readers see them as extra data rows and must filter them.
    with open(args.output, "a", encoding="utf-8") as f:
        f.write(f"\n#chunks_count,{chunks_count}\n")
        f.write(f"#avg_len,{avg_len:.2f}\n")
        f.write(f"#uniq_web_ids,{uniq_web_ids}\n")
    logger.info(f"‚úÖ –ú–µ—Ç—Ä–∏–∫–∏ –¥–æ–±–∞–≤–ª–µ–Ω—ã –≤ –∫–æ–Ω–µ—Ü {args.output}")

    # --- SAVE JSONL ---
    # Swap only the real extension: str.replace(".csv", ...) would return the
    # CSV path itself (and overwrite the CSV) when the output has no ".csv"
    # suffix, and would also rewrite ".csv" occurring elsewhere in the path.
    jsonl_path = os.path.splitext(args.output)[0] + ".jsonl"
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for item in unique_jsonl:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
        # --- APPEND STATISTICS TO THE END OF THE JSONL ---
        # NOTE(review): the last line is a stats object, not a chunk record;
        # consumers must skip records without "chunk_id".
        stats = {
            "chunks_count": chunks_count,
            "avg_len": round(avg_len, 2),
            "uniq_web_ids": uniq_web_ids
        }
        f.write(json.dumps(stats, ensure_ascii=False) + "\n")
    logger.info(f"chunks.jsonl: {len(unique_jsonl)} —á–∞–Ω–∫–æ–≤ ‚Üí –¥–ª—è Generator")
    logger.info(f"‚úÖ –ú–µ—Ç—Ä–∏–∫–∏ –¥–æ–±–∞–≤–ª–µ–Ω—ã –≤ –∫–æ–Ω–µ—Ü {jsonl_path}")

In [3]:
# Cell: run main() on the GPU when one is available

import torch

# Pick the compute device once; fall back to CPU when no GPU is visible.
if torch.cuda.is_available():
    print(f"‚úÖ –ò—Å–ø–æ–ª—å–∑—É–µ–º GPU: {torch.cuda.get_device_name(0)}")
    device = 'cuda'
else:
    print("‚ö†Ô∏è GPU –Ω–µ–¥–æ—Å—Ç—É–ø–Ω–∞, –∏—Å–ø–æ–ª—å–∑—É–µ–º CPU")
    device = 'cpu'

class Args:
    # Stand-in for the argparse namespace that main() expects.
    input = "websites_updated.csv"  # or .xlsx
    output = "chunks.csv"
    model_name = "all-MiniLM-L6-v2"
    max_chunk_len = 400
    threshold = 0.5

args = Args()

# Load the model on the detected device.
model = SentenceTransformer(args.model_name, device=device)

# Expose the detected device and the loaded model on args so that a main()
# which reads these optional attributes can reuse them instead of loading the
# model a second time; a main() that does not read them is unaffected.
args.device = device
args.model = model

# Run the pipeline.
main(args)

‚úÖ –ò—Å–ø–æ–ª—å–∑—É–µ–º GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
–ß–∞–Ω–∫–∏–Ω–≥: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1938/1938 [02:37<00:00, 12.30it/s]


In [4]:
# Check the result
df_result = pd.read_csv("chunks.csv")
# main() appends "#chunks_count", "#avg_len" and "#uniq_web_ids" lines after
# the data; read_csv parses them as rows whose web_id starts with "#". Drop
# them so head() and shape describe real chunks only.
is_metric_row = df_result["web_id"].astype(str).str.startswith("#")
df_result = df_result[~is_metric_row].reset_index(drop=True)
print(df_result.head())
print(df_result.shape)
print()
print("–ü—Ä–∏–º–µ—Ä —á–∞–Ω–∫–∞:")
# Guard against an empty result so the cell does not raise IndexError.
if not df_result.empty:
    print(df_result.iloc[0]['text'][:300] + "...")

  web_id chunk_id                                              title  \
0      1     1__0  –ê–ª—å—Ñ–∞-–ë–∞–Ω–∫ - –∫—Ä–µ–¥–∏—Ç–Ω—ã–µ –∏ –¥–µ–±–µ—Ç–æ–≤—ã–µ –∫–∞—Ä—Ç—ã, –∫—Ä–µ–¥...   
1      2     2__0                      –ê-–ö–ª—É–±. –î–µ–Ω—å–≥–∏ –∏–º–µ—é—Ç –∑–Ω–∞—á–µ–Ω–∏–µ   
2      2     2__1                      –ê-–ö–ª—É–±. –î–µ–Ω—å–≥–∏ –∏–º–µ—é—Ç –∑–Ω–∞—á–µ–Ω–∏–µ   
3      2     2__2                      –ê-–ö–ª—É–±. –î–µ–Ω—å–≥–∏ –∏–º–µ—é—Ç –∑–Ω–∞—á–µ–Ω–∏–µ   
4      2     2__3                      –ê-–ö–ª—É–±. –î–µ–Ω—å–≥–∏ –∏–º–µ—é—Ç –∑–Ω–∞—á–µ–Ω–∏–µ   

                           url  kind  \
0         https://alfabank.ru/  html   
1  https://alfabank.ru/a-club/  html   
2  https://alfabank.ru/a-club/  html   
3  https://alfabank.ru/a-club/  html   
4  https://alfabank.ru/a-club/  html   

                                                text  
0  –ü–µ—Ä—Å–æ–Ω–∞–ª—å–Ω—ã–µ —É—Å–ª–æ–≤–∏—è –≤—ã —Å–º–æ–∂–µ—Ç–µ —É–∑–Ω–∞—Ç—å –ø–æ—Å–ª–µ –æ...  
1  –†–∞—Å—à–∏—Ä–µ–Ω–Ω—ã–π –∏–Ω—Å—Ç—Ä—É–º–µ–