# Finding Significance Classification

## Inference using the selected model

In [1]:
from pathlib import Path

In [2]:
#MODEL_DIR = Path("wandb_models")
#MODEL_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
#import wandb

In [4]:
#wandb.login()

In [5]:
#api = wandb.Api()

#runs = api.runs(
#    path="paul_ww/significance_classification",
#    filters={"group": "transformer_finetuned"},
#    order="-summary_metrics.test.macro_avg_f1-score",
#)
#best_run = runs[0]

In [6]:
#model_files = [f for f in best_run.files() if f.name.startswith("model_finetuned")]

In [7]:
#model_files

In [8]:
#for f in model_files:
#    f.download(root=MODEL_DIR, replace=True)

In [9]:
# Root directory for every input/output artefact of this notebook,
# given relative to the notebook's working directory.
base_path = Path("..", "..", "output")

In [10]:
# --- Files (all located under base_path) ---
# Input: parquet file with the abstracts to classify.
PATH_INFERENCE_DATASET = base_path / "ids_to_abstracts_for_inference.parquet"
# Intermediate: JSONL file that predictions are appended to during inference.
PATH_INFERENCE_RESULTS_CACHE = base_path / "prediction_results.jsonl"
# Output: final parquet file with per-document probabilities and labels.
PATH_INFERENCE_RESULTS = base_path / "ids_to_significance_predictions_finetuned.parquet"

# --- Model / runtime ---
# NOTE(review): "signficance" looks like a misspelling of "significance" —
# confirm it matches the directory name on disk before changing it.
MODEL = "../../models/signficance/model_finetuned"
# Hard-coded CUDA device index; adjust to the GPU available on the host.
DEVICE = "cuda:3"
# Number of abstracts per forward pass of the pipeline.
BATCH_SIZE = 8

### Load the data

In [11]:
import pandas as pd

In [12]:
# Load the abstracts to classify from the parquet file configured above.
df_combined = pd.read_parquet(path=PATH_INFERENCE_DATASET)

In [13]:
# Show the loaded inference frame as the cell output.
df_combined

Unnamed: 0,pm_id,abstract
0,12411355,OBJECTIVE\n\n\nTo measure the effect of giving...
1,20386478,The goal of this research project was to inves...
2,20386477,The objectives of the present investigation we...
3,20386476,The present study investigated the influence o...
4,20386475,The purpose of the present study was to examin...
...,...,...
776980,21594665,Trastuzumab (T) is effective in metastatic bre...
776981,19513541,"PI-103, the first synthetic multitargeted comp..."
776982,23094721,Regular use of aspirin after a diagnosis of co...
776983,18772396,Glioblastoma multiforme (GBM) is the most comm...


### Model Setup

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# MODEL starts out as the checkpoint path from the config cell and is rebound to
# the loaded model at the end of this cell. When the cell is re-run, recover the
# path from the model's config so that a second execution does not hand a model
# object to `from_pretrained` where a path is expected.
MODEL_PATH = MODEL if isinstance(MODEL, (str, Path)) else MODEL.config.name_or_path

# Inputs longer than 512 tokens are truncated from the left, i.e. the end of the
# abstract is kept.
# NOTE(review): `truncation` and `padding` look like call-time options rather
# than tokenizer constructor options — confirm they have any effect here.
TOKENIZER = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    truncation=True,
    truncation_side="left",
    model_max_length=512,
    padding="max_length",
)

# Load the fine-tuned classifier and move it to the configured device.
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(DEVICE)

In [15]:
from transformers import pipeline
import torch

# Wrap the already-loaded model and tokenizer in a text-classification pipeline
# running on the configured device.
# NOTE(review): MODEL is an instantiated model at this point, so `torch_dtype`
# may not be applied to its weights — confirm the intended precision.
PIPE = pipeline(
    task="text-classification",
    model=MODEL,
    tokenizer=TOKENIZER,
    torch_dtype=torch.bfloat16,
    device=DEVICE,
)

In [16]:
import jsonlines
from transformers.pipelines import Pipeline
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset


def predict_significance_labels(
    ds: Dataset,
    id_col: str,
    feature_col: str,
    pipe: Pipeline,
    batch_size: int,
    output_file: str,
    output_batch_size: int,
) -> None:
    """Classify every not-yet-processed row of ``ds`` and append results to a JSONL file.

    The function is resumable: ids already present in ``output_file`` are read
    first and the corresponding rows are filtered out of ``ds`` before inference.

    Each written record has the form
    ``{id_col: <id>, "labels": [...], "scores": [...]}``, with one label/score
    entry per class, in the order the pipeline returns them.

    Args:
        ds: Dataset containing at least ``id_col`` and ``feature_col``.
        id_col: Name of the column holding the document id. Ids must be
            hashable (e.g. ints or strings).
        feature_col: Name of the column holding the text to classify.
        pipe: Text-classification pipeline used for prediction.
        batch_size: Number of texts passed to the pipeline per forward pass.
        output_file: Path (string or path-like) of the JSONL file to append to.
        output_batch_size: Number of records buffered in memory before they
            are appended to ``output_file``.

    Returns:
        None. Results are only written to ``output_file``.
    """
    output_file = Path(output_file)

    # Collect the ids that are already cached. A set keeps the membership test
    # in the filter below O(1) per row; a list would make the filter quadratic
    # in the number of cached ids.
    if output_file.exists():
        with jsonlines.open(output_file) as reader:
            processed_ids = {item.get(id_col) for item in reader}
    else:
        processed_ids = set()

    filtered_ds = ds.filter(lambda row: row[id_col] not in processed_ids)

    # Lazily evaluated: the pipeline yields one prediction per input row, in
    # input order, so it can be zipped with the id column below.
    predictions = pipe(
        KeyDataset(filtered_ds, feature_col),
        batch_size=batch_size,
        truncation=True,
        # Keep the legacy flag: the records written here (and any consumer of
        # the cache) rely on the per-class ordering it produces.
        return_all_scores=True,
    )

    pending = []
    for doc_id, pred in tqdm(
        zip(filtered_ds[id_col], predictions),
        desc="Running inference",
        total=len(filtered_ds),
    ):
        pending.append(
            {
                id_col: doc_id,
                # One entry per class instead of hard-coded indices 0 and 1;
                # identical output for a two-class model.
                "labels": [class_pred["label"] for class_pred in pred],
                "scores": [class_pred["score"] for class_pred in pred],
            }
        )

        # Flush periodically so an interrupted run loses at most one buffer.
        if len(pending) >= output_batch_size:
            with jsonlines.open(output_file, mode="a") as writer:
                writer.write_all(pending)
            pending = []

    # Write whatever is left over (also creates the file if it did not exist).
    with jsonlines.open(output_file, mode="a") as writer:
        writer.write_all(pending)

### Inference

In [17]:
# Convert the pandas frame into a Hugging Face Dataset for filtering and batched inference.
ds = Dataset.from_pandas(df=df_combined)

In [18]:
# Classify every abstract that is not yet present in the JSONL cache and
# append the new predictions to it, flushing to disk every 10,000 records.
predict_significance_labels(
    ds=ds,
    pipe=PIPE,
    id_col="pm_id",
    feature_col="abstract",
    batch_size=BATCH_SIZE,
    output_batch_size=10_000,
    output_file=PATH_INFERENCE_RESULTS_CACHE,
)

Filter:   0%|          | 0/776985 [00:00<?, ? examples/s]

Running inference: 100%|███████████████████████████████████████████████████████████████████████████████| 3442/3442 [00:32<00:00, 105.11it/s]


In [19]:
# Order the label names by class id rather than by dict insertion order, so that
# LABELS[i] is the label of class i. The post-processing of the cached "scores"
# lists indexes LABELS by position (assumes the pipeline emits scores in
# class-index order — TODO confirm for the installed transformers version).
LABELS = [label for _, label in sorted(MODEL.config.id2label.items())]

In [20]:
# Show the label names; their order determines the probability column names below.
LABELS

['no significant effect', 'significant effect']

In [21]:
# Build the result frame in one chain from the JSONL cache:
#   * drop the per-row label names (the global LABELS order is used instead),
#   * split the two-element "scores" list into one probability column per label,
#   * derive the arg-max label and a boolean flag at a 0.5 threshold.
df_results = (
    pd.read_json(PATH_INFERENCE_RESULTS_CACHE, lines=True)
    .drop(columns=["labels"])
    .assign(
        **{
            f"prob_{LABELS[0]}": lambda frame: frame["scores"].str[0],
            f"prob_{LABELS[1]}": lambda frame: frame["scores"].str[1],
        }
    )
    .assign(
        predicted_label=lambda frame: frame["scores"].apply(
            lambda scores: LABELS[scores.index(max(scores))]
        ),
        has_significant_effect=lambda frame: frame[f"prob_{LABELS[1]}"] >= 0.5,
    )
)

# Persist everything except the raw score lists; the displayed frame keeps them.
df_results.drop(columns="scores").to_parquet(PATH_INFERENCE_RESULTS, compression="gzip")
df_results

Unnamed: 0,pm_id,scores,prob_no significant effect,prob_significant effect,predicted_label,has_significant_effect
0,2180658,"[0.7253651022911071, 0.27463486790657005]",0.725365,0.274635,no significant effect,False
1,18043056,"[0.349804937839508, 0.650195062160491]",0.349805,0.650195,significant effect,True
2,18042987,"[0.545411705970764, 0.45458829402923506]",0.545412,0.454588,no significant effect,False
3,18042976,"[0.871544778347015, 0.128455191850662]",0.871545,0.128455,no significant effect,False
4,18042975,"[0.27974003553390503, 0.7202599644660951]",0.279740,0.720260,significant effect,True
...,...,...,...,...,...,...
776980,38493933,"[0.8496014475822441, 0.15039856731891602]",0.849601,0.150399,no significant effect,False
776981,38490284,"[0.20785824954509702, 0.792141735553741]",0.207858,0.792142,significant effect,True
776982,38467383,"[0.070790067315101, 0.9292099475860591]",0.070790,0.929210,significant effect,True
776983,38466872,"[0.19682666659355103, 0.8031733036041261]",0.196827,0.803173,significant effect,True
