# In [None]:
# =============================================================================
# HEPSIBURADA ADDRESS MATCHING HYBRID PIPELINE - FULL TRAINING FROM SCRATCH
# Architecture: BGE-M3 + BM25 + BERT Reranker
# =============================================================================

import os
import sys
import re
import time
import pickle
import pathlib
import zipfile
import glob
import gc
import warnings
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
from contextlib import contextmanager
from typing import Dict, List, Tuple
import bm25s

# CUDA Memory Optimization
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
class Config:
    """Central settings for the hybrid address-matching pipeline.

    Groups the model identifiers, retrieval and training hyperparameters and
    artifact filenames used by the pipeline. Filenames are relative; use
    :meth:`get_path` to resolve them inside ``DRIVE_SAVE_PATH``.
    """

    def __init__(self):
        # Directory the raw train.csv / test.csv are read from.
        self.BASE_PATH = "/content/"
        # Directory every cached artifact and output is written to.
        self.DRIVE_SAVE_PATH = "/content/drive/MyDrive/Hepsiburada_AddressMatching_Scratch/"
        self.BGE_MODEL_NAME = 'BAAI/bge-m3'
        self.BERT_MODEL_NAME = "dbmdz/bert-base-turkish-cased"

        # New model directories for full training
        self.BERT_MODEL_DIR_BEST = "bert_full_train_best"
        self.BERT_MODEL_DIR_LATEST = "bert_full_train_latest"

        # Retrieval Hyperparameters (number of neighbours requested per query)
        self.K_NEIGHBORS_BGE = 30
        self.K_NEIGHBORS_BM25 = 30
        self.K_NEIGHBORS_HARD_MINING = 20

        # Batch Sizes
        self.BATCH_SIZE_EMBEDDING = 256
        self.BATCH_SIZE_BERT_TRAIN = 64
        self.BATCH_SIZE_GPU_SEARCH = 256

        # Training Settings
        self.NUM_EPOCHS = 2
        self.VALIDATION_SPLIT = 0.15
        self.RANDOM_STATE = 42

        # Filenames
        self.PROCESSED_TRAIN_FILE = "train_processed.csv"
        self.PROCESSED_TEST_FILE = "test_processed.csv"
        self.EMBEDDINGS_FILE = "bge_m3_embeddings.npy"
        self.BM25_INDEX_DIR = "bm25s_index"
        # Plain literal: the previous f-string prefix had no placeholders.
        self.BERT_CANDIDATES_FILE = "bert_candidates_hybrid.pkl"
        self.BERT_TRAIN_FILE = "bert_train_dataset_full.csv"
        self.SUBMISSION_FILE = "submission_scratch.csv"

    def get_path(self, filename: str) -> str:
        """Return *filename* joined onto ``DRIVE_SAVE_PATH``."""
        return os.path.join(self.DRIVE_SAVE_PATH, filename)


# Module-level settings instance shared by the rest of the file.
config = Config()

# =============================================================================
# TEXT PREPROCESSING
# =============================================================================
def turkish_to_lower(text: str) -> str:
    """Lowercase *text* with Turkish casing for the dotted/dotless I pair.

    The input is coerced with ``str()``. Capital 'İ' becomes 'i' and capital
    'I' becomes 'ı' before the generic ``str.lower`` is applied to the rest.
    """
    turkish_capitals = str.maketrans({'İ': 'i', 'I': 'ı'})
    as_text = str(text)
    return as_text.translate(turkish_capitals).lower()

# Regex pattern -> canonical replacement token for common address words.
# Every alternative is wrapped in \b so only whole words are rewritten.
ABBREVIATIONS = {
    # neighbourhood
    r"\bmh\b|\bmah\b|\bmahalle\b": "mahallesi",
    # avenue
    r"\bcd\b|\bcad\b|\bcaddesi\b": "caddesi",
    # street
    r"\bsk\b|\bsok\b|\bsokağı\b": "sokak",
    # boulevard
    r"\bblv\b|\bbulvarı\b": "bulvar",
    # apartment building
    r"\bapt\b|\baprt\b|\bapartmanı\b": "apartman",
    # housing complex
    r"\bst\b|\bsitesi\b": "site",
    # number
    r"\bno\b|\baptno\b": "numara",
    # floor
    r"\bk\b|\bkatı\b|\bkat\b": "kat",
}

def hybrid_preprocess(text: str) -> str:
    """Normalise a raw address string for embedding and lexical search.

    Steps, in order: Turkish-aware lowercasing, splitting digits from adjacent
    letters, expanding the patterns in ``ABBREVIATIONS``, replacing every
    character outside letters/digits/``-``/``/``/whitespace with a space, and
    collapsing whitespace runs.

    Returns:
        The cleaned string, or ``""`` when *text* is null according to
        ``pd.isnull``.
    """
    if pd.isnull(text):
        return ""

    cleaned = turkish_to_lower(text)

    # Put a space at digit->letter and then letter->digit boundaries.
    for boundary in (r"(\d+)([a-zA-Zçğıöşü]+)", r"([a-zA-Zçğıöşü]+)(\d+)"):
        cleaned = re.sub(boundary, r"\1 \2", cleaned)

    # Expand abbreviations in dict order; pad so the token never fuses with neighbours.
    for pattern in ABBREVIATIONS:
        cleaned = re.sub(pattern, f" {ABBREVIATIONS[pattern]} ", cleaned)

    # Drop everything that is not an allowed character.
    cleaned = re.sub(r"[^a-zçğıöşü0-9\-\/\s]", " ", cleaned)

    single_spaced = re.sub(r'\s+', ' ', cleaned)
    return single_spaced.strip()

# =============================================================================
# GPU OPTIMIZED SEARCH ENGINE
# =============================================================================
class VectorSearch:
    """Brute-force cosine-similarity top-k search over a fixed embedding matrix.

    The matrix is moved to CUDA when available (CPU otherwise), cast to
    float32 and L2-normalised once at construction time.
    """

    def __init__(self, embeddings: np.ndarray):
        """Index *embeddings*, a 2-D array with one vector per row."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Normalising rows up front lets search() use a plain matrix product
        # as the cosine similarity.
        self.embeddings_norm = torch.nn.functional.normalize(
            torch.from_numpy(embeddings).to(self.device, dtype=torch.float32)
        )

    def search(self, queries: np.ndarray, k: int, batch_size: int) -> np.ndarray:
        """Return the indices of the most similar indexed rows for each query.

        Args:
            queries: 2-D array of query vectors, one per row.
            k: Number of neighbours requested. It is capped at the number of
                indexed rows, because ``torch.topk`` rejects a larger ``k``.
            batch_size: Number of queries scored per matrix product.

        Returns:
            Integer array of shape ``(len(queries), min(k, n_indexed))`` whose
            rows are ordered by decreasing similarity.
        """
        k = min(k, self.embeddings_norm.shape[0])
        if len(queries) == 0:
            # np.vstack([]) raises, so answer the empty case explicitly.
            return np.empty((0, k), dtype=np.int64)

        all_indices = []
        for start in range(0, len(queries), batch_size):
            batch = torch.from_numpy(queries[start:start + batch_size]).to(
                self.device, dtype=torch.float32
            )
            batch_norm = torch.nn.functional.normalize(batch)
            sims = torch.mm(batch_norm, self.embeddings_norm.T)
            all_indices.append(torch.topk(sims, k)[1].cpu().numpy())
        return np.vstack(all_indices)

# =============================================================================
# PHASE 1: DATA PREP & RETRIEVAL
# =============================================================================
class Phase1Engine:
    """Phase 1 of the pipeline: data preparation, embedding and retrieval.

    Every step caches its artifact under ``cfg.DRIVE_SAVE_PATH`` and is
    skipped on later runs when that artifact already exists.
    """

    def __init__(self, cfg: Config):
        self.cfg = cfg
        # BGE-M3 encoder; created on first use so that a fully cached run
        # never has to load the model.
        self._bge_model = None

    def run(self) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
        """Execute all Phase 1 steps.

        Mounts Google Drive, then loads (or preprocesses) the data, loads (or
        computes) the train embeddings, loads (or searches) the test
        candidates and makes sure the reranker training CSV exists.

        Returns:
            ``(df_train, df_test, candidates)`` where ``candidates`` maps each
            test ``id`` to a list of positional row indices into ``df_train``.
        """
        from google.colab import drive
        drive.mount('/content/drive')
        os.makedirs(self.cfg.DRIVE_SAVE_PATH, exist_ok=True)

        try:
            df_train, df_test = self._load_data()
            train_embs = self._load_or_build_train_embeddings(df_train)
            candidates = self._load_or_build_candidates(df_train, df_test, train_embs)
            self._ensure_hard_negative_dataset(df_train, train_embs)
        finally:
            # Free encoder memory before the reranker is trained.
            self._release_bge_model()
        return df_train, df_test, candidates

    def _get_bge_model(self):
        """Return the BGE-M3 encoder, instantiating it on first call."""
        if self._bge_model is None:
            from FlagEmbedding import BGEM3FlagModel
            self._bge_model = BGEM3FlagModel(self.cfg.BGE_MODEL_NAME, use_fp16=True)
        return self._bge_model

    def _release_bge_model(self) -> None:
        """Drop the encoder reference and release cached GPU memory."""
        self._bge_model = None
        gc.collect()
        torch.cuda.empty_cache()

    def _load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load the processed train/test frames, creating them from raw CSVs if needed."""
        train_path = self.cfg.get_path(self.cfg.PROCESSED_TRAIN_FILE)
        test_path = self.cfg.get_path(self.cfg.PROCESSED_TEST_FILE)

        # Require both files: a lone train file would otherwise crash on the
        # test read instead of falling back to preprocessing.
        if os.path.exists(train_path) and os.path.exists(test_path):
            print("--- Loading Preprocessed Data ---")
            df_train = pd.read_csv(train_path).fillna('')
            df_test = pd.read_csv(test_path).fillna('')
            return df_train, df_test

        print("--- Preprocessing Raw Data ---")
        df_train = pd.read_csv(os.path.join(self.cfg.BASE_PATH, "train.csv"))
        df_test = pd.read_csv(os.path.join(self.cfg.BASE_PATH, "test.csv"))
        df_train['clean_address'] = df_train['address'].apply(hybrid_preprocess)
        df_test['clean_address'] = df_test['address'].apply(hybrid_preprocess)
        df_train.to_csv(train_path, index=False)
        df_test.to_csv(test_path, index=False)
        return df_train, df_test

    def _load_or_build_train_embeddings(self, df_train: pd.DataFrame) -> np.ndarray:
        """Return dense train embeddings, computing and caching them if absent."""
        emb_path = self.cfg.get_path(self.cfg.EMBEDDINGS_FILE)
        if os.path.exists(emb_path):
            return np.load(emb_path)

        print("--- Generating Train Embeddings ---")
        model = self._get_bge_model()
        step = self.cfg.BATCH_SIZE_EMBEDDING
        texts = df_train['clean_address'].tolist()
        chunks = [
            model.encode(texts[start:start + step])['dense_vecs']
            for start in tqdm(range(0, len(texts), step))
        ]
        train_embs = np.vstack(chunks).astype('float32')
        np.save(emb_path, train_embs)
        return train_embs

    def _load_or_build_candidates(self, df_train: pd.DataFrame, df_test: pd.DataFrame,
                                  train_embs: np.ndarray) -> Dict:
        """Return ``{test id: [train row indices]}`` from the union of BGE and BM25 hits."""
        cand_path = self.cfg.get_path(self.cfg.BERT_CANDIDATES_FILE)
        if os.path.exists(cand_path):
            # NOTE(security): pickle executes arbitrary code on load; only
            # safe because this file is produced by this pipeline itself.
            with open(cand_path, 'rb') as f:
                return pickle.load(f)

        print("--- Searching Candidates (Hybrid: BGE + BM25) ---")
        test_texts = df_test['clean_address'].tolist()

        # BGE Search
        test_embs = self._get_bge_model().encode(test_texts)['dense_vecs']
        v_search = VectorSearch(train_embs)
        bge_idx = v_search.search(test_embs, k=self.cfg.K_NEIGHBORS_BGE,
                                  batch_size=self.cfg.BATCH_SIZE_GPU_SEARCH)

        # BM25 Search
        bm25 = bm25s.BM25()
        bm25.index([d.split() for d in df_train['clean_address']])
        bm25_idx, _ = bm25.retrieve([q.split() for q in test_texts],
                                    k=self.cfg.K_NEIGHBORS_BM25)

        # enumerate() gives the positional row, which is what the result
        # matrices are indexed by; iterrows() yields index *labels* and only
        # coincides with positions for a default RangeIndex. Plain ints keep
        # the pickle free of NumPy scalars, and sorting makes it deterministic.
        candidates = {
            test_id: sorted({int(j) for j in bge_idx[pos]} | {int(j) for j in bm25_idx[pos]})
            for pos, test_id in enumerate(df_test['id'])
        }
        with open(cand_path, 'wb') as f:
            pickle.dump(candidates, f)
        return candidates

    def _ensure_hard_negative_dataset(self, df_train: pd.DataFrame, train_embs: np.ndarray) -> None:
        """Write the balanced sentence-pair CSV for the reranker unless it already exists."""
        hn_path = self.cfg.get_path(self.cfg.BERT_TRAIN_FILE)
        if os.path.exists(hn_path):
            return

        print("--- Generating Hard Negative Dataset for BERT ---")
        v_search = VectorSearch(train_embs)
        indices = v_search.search(train_embs, k=self.cfg.K_NEIGHBORS_HARD_MINING,
                                  batch_size=self.cfg.BATCH_SIZE_GPU_SEARCH)
        hn_df = self._build_pair_frame(indices, df_train['label'].values,
                                       df_train['clean_address'].values)

        pos, neg = hn_df[hn_df['label'] == 1], hn_df[hn_df['label'] == 0]
        # Sample the same count from both classes. Asking for len(pos)
        # negatives raises when near neighbours mostly share the label and
        # negatives are the minority class.
        n_per_class = min(len(pos), len(neg))
        seed = self.cfg.RANDOM_STATE
        balanced_df = pd.concat([
            pos.sample(n_per_class, random_state=seed),
            neg.sample(n_per_class, random_state=seed),
        ]).sample(frac=1, random_state=seed)
        balanced_df.to_csv(hn_path, index=False)

    @staticmethod
    def _build_pair_frame(indices: np.ndarray, labels: np.ndarray, addrs: np.ndarray) -> pd.DataFrame:
        """Pair every row with each of its neighbours (self excluded), labelled 1 on equal labels."""
        n_rows, k = indices.shape
        anchors = np.repeat(np.arange(n_rows), k)
        neighbors = indices.ravel()
        keep = anchors != neighbors
        anchors, neighbors = anchors[keep], neighbors[keep]
        return pd.DataFrame({
            'sentence1': addrs[anchors],
            'sentence2': addrs[neighbors],
            'label': (labels[anchors] == labels[neighbors]).astype(int),
        })

# =============================================================================
# PHASE 2: BERT RERANKER TRAINING & SUBMISSION
# =============================================================================
class Phase2Engine:
    """Phase 2 of the pipeline: fine-tune the BERT pair classifier and rerank candidates."""

    def __init__(self, cfg: Config):
        self.cfg = cfg
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.BERT_MODEL_NAME)

    def train_reranker(self):
        """Fine-tune the sequence-pair classifier on the cached pair CSV.

        Reads ``cfg.BERT_TRAIN_FILE``, makes a stratified train/validation
        split, trains with the Hugging Face ``Trainer`` and saves the model
        to the two configured directories.

        Returns:
            The trained model object.
        """
        print("--- Starting BERT Reranker Training from Scratch ---")
        df = pd.read_csv(self.cfg.get_path(self.cfg.BERT_TRAIN_FILE)).dropna()
        train_df, val_df = train_test_split(
            df,
            test_size=self.cfg.VALIDATION_SPLIT,
            stratify=df['label'],
            random_state=self.cfg.RANDOM_STATE,
        )

        def tokenize_fn(ex):
            return self.tokenizer(ex['sentence1'], ex['sentence2'], truncation=True,
                                  max_length=128, padding='max_length')

        train_ds = Dataset.from_pandas(train_df).map(tokenize_fn, batched=True)
        val_ds = Dataset.from_pandas(val_df).map(tokenize_fn, batched=True)

        model = AutoModelForSequenceClassification.from_pretrained(self.cfg.BERT_MODEL_NAME, num_labels=2)

        args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=self.cfg.NUM_EPOCHS,
            learning_rate=2e-5,
            per_device_train_batch_size=self.cfg.BATCH_SIZE_BERT_TRAIN,
            gradient_accumulation_steps=8,
            # Mixed precision needs a GPU; fall back to fp32 when none is present.
            fp16=torch.cuda.is_available(),
            # Tie training randomness to the same seed as the data split.
            seed=self.cfg.RANDOM_STATE,
            eval_strategy="steps",
            eval_steps=2000,
            save_strategy="steps",
            save_steps=2000,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            report_to="none"
        )

        def compute_metrics(p):
            preds = np.argmax(p.predictions, axis=1)
            return {'f1': f1_score(p.label_ids, preds), 'acc': accuracy_score(p.label_ids, preds)}

        trainer = Trainer(
            model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
            tokenizer=self.tokenizer, compute_metrics=compute_metrics
        )
        trainer.train()

        # Save results
        # NOTE(review): with load_best_model_at_end=True the Trainer is
        # expected to have restored the best checkpoint by now, so both
        # directories presumably receive the same weights - confirm.
        trainer.save_model(self.cfg.get_path(self.cfg.BERT_MODEL_DIR_BEST))
        model.save_pretrained(self.cfg.get_path(self.cfg.BERT_MODEL_DIR_LATEST))
        self.tokenizer.save_pretrained(self.cfg.get_path(self.cfg.BERT_MODEL_DIR_LATEST))
        return model

    def create_submission(self, model, df_train, df_test, candidates):
        """Rerank each test row's candidates and write the submission CSV.

        For every test id the classifier scores the test address against each
        candidate train address; the label of the highest-scoring candidate is
        the prediction. Test ids without a prediction get the most frequent
        train label.

        Args:
            model: Trained sequence-pair classifier.
            df_train: Train frame with ``label`` and ``clean_address`` columns.
            df_test: Test frame with ``id`` and ``clean_address`` columns.
            candidates: Mapping from test id to positional indices into ``df_train``.
        """
        print("--- Creating Final Submission ---")
        # Pick the device the same way VectorSearch does instead of assuming CUDA.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.eval().to(device)
        train_labels, train_addrs = df_train['label'].values, df_train['clean_address'].values
        test_map = dict(zip(df_test.id, df_test.clean_address))

        results = []
        with torch.no_grad():
            for tid, c_indices in tqdm(candidates.items(), desc="Reranking"):
                if not c_indices:
                    continue
                t_addr = test_map[tid]
                c_addrs = [train_addrs[i] for i in c_indices]

                inputs = self.tokenizer(
                    [t_addr] * len(c_addrs), c_addrs,
                    return_tensors='pt', padding=True, truncation=True, max_length=128,
                ).to(device)
                # Probability of the "same label" class for every candidate.
                scores = model(**inputs).logits.softmax(dim=-1)[:, 1]
                best_match_idx = c_indices[scores.argmax().item()]
                results.append({'id': tid, 'label': train_labels[best_match_idx]})

        final_df = pd.merge(df_test[['id']], pd.DataFrame(results), on='id', how='left')
        final_df['label'] = final_df['label'].fillna(df_train['label'].mode()[0]).astype(int)
        out_path = self.cfg.get_path(self.cfg.SUBMISSION_FILE)
        final_df.to_csv(out_path, index=False)
        # Report the full path that was written, not just the bare filename.
        print(f"Submission saved to: {out_path}")

# =============================================================================
# RUN PIPELINE
# =============================================================================
# Script entry point: runs retrieval, reranker training and submission
# generation in sequence, then reports the elapsed wall-clock time.
if __name__ == "__main__":
    start_time = time.time()

    # 1. Retrieval Phase: returns the train frame, the test frame and the
    #    per-test-id candidate lists.
    p1 = Phase1Engine(config)
    d_train, d_test, cands = p1.run()

    # 2. Training Phase: fine-tunes the reranker on the pair CSV that
    #    Phase 1 wrote.
    p2 = Phase2Engine(config)
    best_model = p2.train_reranker()

    # 3. Final Ranking: scores each candidate list and writes the submission CSV.
    p2.create_submission(best_model, d_train, d_test, cands)

    # Elapsed time is converted from seconds to minutes.
    print(f"✅ Full process completed in {(time.time() - start_time)/60:.2f} minutes.")