# Evaluation: baseline classifier and labeling scaffold
- Load the cleaned abstracts (and the cleaned references, if that file is present).
- Sample a subset to label inclusion/exclusion.
- Fit a simple TF-IDF + logistic regression baseline.
- Save labeled data for reuse.

In [None]:
import json
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# All data files live in the `Data` folder one level above the working directory.
project_root = Path('..').resolve()
data_dir = project_root.joinpath('Data')
abs_path = data_dir.joinpath('cochrane_pubmed_abstracts_clean.parquet')
refs_path = data_dir.joinpath('cochrane_pubmed_references_clean.parquet')
labels_path = data_dir.joinpath('labeled_cochrane_sample.parquet')

# The abstracts file is required; the references file is optional and is
# replaced by an empty frame when it is not on disk.
abs_df = pd.read_parquet(abs_path)
if refs_path.exists():
    refs_df = pd.read_parquet(refs_path)
else:
    refs_df = pd.DataFrame()
abs_df.head()

## Create or load a labeled subset
If the file at `labels_path` exists, it is reused. Otherwise, a random sample of abstracts is drawn and its `label` column is left empty for you to fill in manually (0 = exclude, 1 = include).

In [None]:
import numpy as np

# Maximum number of abstracts drawn for manual labeling; adjust as needed.
SAMPLE_SIZE = 500
if not labels_path.exists():
    # No saved sample yet: draw a reproducible random subset (capped at the
    # number of available abstracts) and add an empty label column.
    n_rows = min(SAMPLE_SIZE, len(abs_df))
    labeled = abs_df.sample(n=n_rows, random_state=42).copy()
    labeled['label'] = np.nan  # TODO: fill 0/1 for exclude/include
else:
    # A sample was saved earlier; reuse it instead of drawing a new one.
    labeled = pd.read_parquet(labels_path)
labeled.head()

### TODO: Label the sample
Fill the `label` column (0 = exclude, 1 = include) for the sampled rows. You can export to CSV to label externally and re-import.

In [None]:
# Optional: if the labels were filled in outside this notebook, reload them first.
# labeled = pd.read_parquet(labels_path)  # or read_csv if you exported to CSV

# Keep only the rows whose label has been filled in.
has_label = labeled['label'].notna()
labeled_labeled = labeled.loc[has_label].copy()
print('Labeled rows:', len(labeled_labeled))
# With the missing values gone, the label column can be cast to integers.
labeled_labeled['label'] = labeled_labeled['label'].astype(int)
labeled_labeled.head()

In [None]:
# Train/test split and baseline model
# Preconditions are checked up front so that labeling problems are reported
# with an actionable message instead of an error from inside scikit-learn.
if labeled_labeled.empty:
    raise ValueError("No labeled rows found. Please label the sample (0/1) before training.")

# TfidfVectorizer cannot process missing documents, so rows without abstract
# text are left out of the modelling data (the labeled frame itself is untouched).
model_df = labeled_labeled.dropna(subset=['abstract'])
n_missing = len(labeled_labeled) - len(model_df)
if n_missing:
    print(f'Skipping {n_missing} labeled rows without abstract text.')

# The stratified split needs every class at least twice, and the classifier
# needs more than one class to learn from.
class_counts = model_df['label'].value_counts()
if len(class_counts) < 2 or class_counts.min() < 2:
    raise ValueError(
        "Need at least two labeled rows with abstract text for each class (0 and 1); "
        f"found class counts {class_counts.to_dict()}."
    )

# Hold out 20% of the rows for evaluation, keeping the class ratio the same
# in both parts; the fixed seed makes the split reproducible.
train_df, test_df = train_test_split(
    model_df,
    test_size=0.2,
    random_state=42,
    stratify=model_df['label']
)

# Baseline: unigram + bigram TF-IDF features fed into a logistic regression
# with class weights balanced against the label frequencies.
clf = make_pipeline(
    TfidfVectorizer(max_features=20000, ngram_range=(1, 2)),
    LogisticRegression(max_iter=200, class_weight='balanced')
)

clf.fit(train_df['abstract'], train_df['label'])
pred = clf.predict(test_df['abstract'])
print(classification_report(test_df['label'], pred, digits=3))

In [None]:
# Write the sampled rows (with whatever labels they currently hold) to disk for reuse.
out_dir = labels_path.parent
out_dir.mkdir(exist_ok=True, parents=True)  # create the target folder if it is missing
labeled.to_parquet(labels_path, index=False)
labels_path