In [None]:
import pandas as pd
import os

In [None]:
# Training input (tagged titles, TSV) and the folder that receives the converted files.
INPUT_TSV = "/content/sample_data/Tagged_Titles_Train.tsv"
OUT_DIR = "/content/sample_data/eBay_ML_Challenge_2025/conll"

# Create the output folder up front; an already existing folder is not an error.
os.makedirs(name=OUT_DIR, exist_ok=True)


In [None]:
# Load the tagged training TSV with every column kept as text.
# keep_default_na=False together with na_values=[''] means only empty cells
# are parsed as NaN; strings such as "NA" or "null" stay ordinary text.
df = (
    pd.read_csv(
        INPUT_TSV,
        sep="\t",
        header=0,
        dtype=str,
        na_values=[""],
        keep_default_na=False,
    )
    # Normalize headers: trim, lower-case, spaces -> underscores.
    .rename(columns=lambda name: name.strip().lower().replace(" ", "_"))
    # Rows without a tag get the explicit marker "Blank".
    .assign(tag=lambda frame: frame["tag"].fillna("Blank"))
)

In [None]:
# Write the training data in CoNLL style: one "token tag" pair per line and a
# blank line after each title (one title == one sentence).
out_path = os.path.join(OUT_DIR, "train_data.conll")
sentence_count = 0
with open(out_path, "w", encoding="utf-8") as fout:
    # sort=False: do not reorder the groups by key.
    grouped = df.groupby(["record_number", "category", "title"], sort=False)
    for _key, group in grouped:
        for tok, tg in zip(group["token"], group["tag"]):
            fout.write(f"{tok} {tg}\n")
        fout.write("\n")
        sentence_count += 1

# Report the number of sentences actually written (one per group) instead of the
# number of unique record numbers, which can differ from the groups in the file.
print(f"Wrote {sentence_count} sentences to {out_path}")

In [None]:
import spacy
from spacy.tokens import DocBin

def conll_to_spacy_every_token(input_path: str, output_path: str, lang: str = "de"):
    """
    Convert a whitespace-separated .conll file of `token LABEL` lines into spaCy's
    binary DocBin format, treating each token as its own labelled entity span.

    Blank lines mark sentence boundaries; each sentence becomes one Doc whose text
    is the sentence's tokens joined by single spaces.

    Args:
        input_path: Path of the UTF-8 .conll file to read.
        output_path: Path the DocBin is written to.
        lang: Language code passed to `spacy.blank` (default "de").

    Raises:
        ValueError: If a non-blank line does not contain both a token and a label.
    """
    # Blank pipeline (used only to build Docs) and the container for all Docs
    nlp = spacy.blank(lang)
    doc_bin = DocBin()
    doc_count = 0
    tokens, labels = [], []

    def flush_sentence():
        """Turn the buffered tokens/labels into one Doc, add it to the DocBin, reset the buffers."""
        nonlocal doc_count
        if not tokens:
            return
        # Rebuild the sentence text; tokens are separated by exactly one space
        doc = nlp.make_doc(" ".join(tokens))
        spans = []
        char_offset = 0
        # One span per input token, located by its character offsets in the text
        for token_text, label in zip(tokens, labels):
            start = char_offset
            end = start + len(token_text)
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            # char_span yields None when no span can be built for the offsets
            if span is not None:
                spans.append(span)
            char_offset = end + 1  # skip the joining space
        doc.ents = spans
        doc_bin.add(doc)
        doc_count += 1
        tokens.clear()
        labels.clear()

    # Read the .conll file line by line
    with open(input_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Sentence boundary: emit what has been collected so far
                flush_sentence()
                continue
            parts = stripped.split()
            if len(parts) < 2:
                # Fail with the location instead of an opaque IndexError
                raise ValueError(
                    f"{input_path}: line {line_no}: expected 'token LABEL', got {stripped!r}"
                )
            tokens.append(parts[0])
            labels.append(parts[1])
    # The file may not end with a blank line; emit the last sentence too
    flush_sentence()

    # Save to disk
    doc_bin.to_disk(output_path)
    print(f"✅ Saved {doc_count} documents to {output_path}")

# Convert the CoNLL training file into the binary .spacy file used for training.
conll_to_spacy_every_token(
    input_path="/content/sample_data/eBay_ML_Challenge_2025/conll/train_data.conll",
    output_path="/content/sample_data/eBay_ML_Challenge_2025/conll/train_data.spacy",
)

In [None]:
# Use the %pip magic explicitly (bare `pip` only works while automagic is on);
# %pip installs into the environment of the running kernel.
%pip install spacy-transformers

In [None]:
# Train the pipeline described by config.cfg on GPU 0 and write the results to the roberta/ folder.
# NOTE(review): --paths.dev points at the same .spacy file as --paths.train, so the scores
# reported on the dev set are measured on the training data — consider a held-out dev file.
!python -m spacy train /content/sample_data/config.cfg --output "/content/sample_data/eBay_ML_Challenge_2025/roberta/" --paths.train "/content/sample_data/eBay_ML_Challenge_2025/conll/train_data.spacy" --paths.dev "/content/sample_data/eBay_ML_Challenge_2025/conll/train_data.spacy" --gpu-id 0

In [None]:
# --- Inference configuration ---
# Trained pipeline to load for prediction.
MODEL = "/content/sample_data/eBay_ML_Challenge_2025/roberta/model-best"

# Source listings. Note: this re-binds INPUT_TSV, which earlier named the training TSV.
INPUT_TSV = "/content/sample_data/Listing_Titles.tsv"

# Window of rows to predict on (both ends inclusive).
START_ROW = 5001
END_ROW = 30000

# Intermediate and final output files.
FILTERED_DATA = "/content/sample_data/filtered_data.tsv"
OUTPUT_TSV = "/content/sample_data/predictions.tsv"
OUTPUT_COMB = "/content/sample_data/predictions_combined.tsv"
OUTPUT_SWAP = "/content/sample_data/final_predictions.tsv"

In [None]:
import pandas as pd
import spacy
from tqdm import tqdm

# Lines to drop: file lines 1 .. START_ROW-1 (line 0 is the header line).
skiprows = [line_no for line_no in range(1, START_ROW)]
# Lines to keep: START_ROW through END_ROW, both inclusive.
nrows = (END_ROW + 1) - START_ROW

# Read only the requested window; the column names are supplied explicitly.
df = pd.read_csv(
    INPUT_TSV,
    sep="\t",
    engine="python",
    encoding="utf-8",
    header=0,
    names=["record_number", "category", "title"],
    skiprows=skiprows,
    nrows=nrows,
)

# Keep a copy of the selected rows on disk (no header, no index column).
df.to_csv(FILTERED_DATA, sep="\t", header=False, index=False, encoding="utf-8")

In [None]:
import spacy
from spacy_transformers import Transformer
# 1) Require GPU 0; this is done before the pipeline is loaded below.
spacy.require_gpu(0)
# 2) Load the trained pipeline from the MODEL path; `nlp` is used by the prediction loop.
nlp = spacy.load(MODEL)

In [None]:
import pandas as pd
import spacy
from tqdm import tqdm

# Collected prediction rows: [record_number, category, entity text, entity label].
results = []

# Entity labels to keep per listing category; entities with any other label are dropped.
# NOTE(review): "label_cat1" / "label_cat2" look like placeholders — confirm the real label names.
label_filters = {
    1: {"label_cat1"},
    2: {"label_cat2"},
}

# Pair every title with its (record_number, category) so nlp.pipe can batch the
# texts and hand the metadata back next to each Doc (as_tuples=True).
titles_with_context = zip(df.title, zip(df.record_number, df.category))

for doc, (rec, cat) in tqdm(
    nlp.pipe(titles_with_context, as_tuples=True),
    total=len(df),
    desc="Predicting entities"
):
    # Labels allowed for this row's category (none for unknown categories)
    allowed = label_filters.get(cat, set())
    for ent in doc.ents:
        if ent.label_ in allowed:
            results.append([rec, cat, ent.text, ent.label_])

In [None]:
# 5) Persist the collected predictions as a header-less TSV, one entity per row
prediction_columns = ["record_number", "category", "entity", "tag"]
out_df = pd.DataFrame(data=results, columns=prediction_columns)
out_df.to_csv(OUTPUT_TSV, sep="\t", header=False, index=False, encoding="utf-8")
print(f"✅ Saved {len(out_df)} predictions to {OUTPUT_TSV}")

In [None]:
def _merge_blank_rows(rows):
    """Fold every "Blank"-tagged row into the preceding row of the same record.

    Args:
        rows: Iterable of (record_number, category, text, tag) tuples in file order.

    Returns:
        A list of [record_number, category, text, tag] lists. The text of a
        "Blank" row is appended (space-separated) to the previous kept row when
        that row has the same record_number and category; otherwise the row is
        kept unchanged as its own entry.
    """
    merged = []
    for rec, cat, text, tag in rows:
        text = str(text).strip()
        # Only continue an entity inside the same listing; never glue text onto
        # the last entity of a different record.
        same_record = bool(merged) and merged[-1][0] == rec and merged[-1][1] == cat
        if tag == "Blank" and same_record:
            merged[-1][2] += " " + text
        else:
            merged.append([rec, cat, text, tag])
    return merged


# 1) Load the raw predictions (no header; four columns in file order:
#    record number, category, entity text, tag). Note the names used here:
#    the "tag" column holds the entity text and "label" holds the tag.
df = pd.read_csv(
    OUTPUT_TSV,
    sep="\t",
    header=None,
    names=["record_number", "category", "tag", "label"],
    dtype={"record_number": str, "category": str, "tag": str, "label": str},
    encoding="utf-8",
    keep_default_na=False
)

# 2) Combine Blank-tag rows into the previous entity of the same record
combined = _merge_blank_rows(df.itertuples(index=False))

# 3) Write out, preserving record_number & category, no header
#    (here the "label" column holds the entity text, "tag" the tag)
out = pd.DataFrame(combined, columns=["record_number", "category", "label", "tag"])
out.to_csv(
    OUTPUT_COMB,
    sep="\t",
    index=False,
    header=False,
    encoding="utf-8",
)
# Report the file that was actually written
print(f"Wrote {len(out)} merged predictions to {OUTPUT_COMB}")

In [None]:
# 1. Read the merged predictions back as plain strings (header-less TSV).
df = pd.read_csv(
    OUTPUT_COMB,
    sep="\t",
    header=None,
    names=["rec", "cat", "lbl", "tg"],
    dtype=str,
    keep_default_na=False,
)

# 2. Swap the last two columns so that tg comes before lbl.
df = df.loc[:, ["rec", "cat", "tg", "lbl"]]

# 3. Save the reordered table as the final TSV (no header, no index).
df.to_csv(OUTPUT_SWAP, sep="\t", header=False, index=False, encoding="utf-8")