In [1]:
import re
import unicodedata
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments,Trainer,DataCollatorWithPadding,EarlyStoppingCallback)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

In [2]:
GLOBAL_POS_N = 300000
GLOBAL_NEG_N = 150000

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
CORE_TYPES = {"article", "review", "book-chapter"}
SEC_TYPES  = {"preprint", "letter", "dissertation", "other", "book", "dataset", "report"}
LOW_TYPES  = {"paratext", "editorial", "erratum", "reference-entry", "libguides","peer-review", "retraction", "supplementary-materials"}

In [5]:
def doc_type_to_flags(doc_type: str):
    dt = (doc_type or "").strip().lower()
    core = 1 if dt in CORE_TYPES else 0
    sec  = 1 if dt in SEC_TYPES else 0
    low  = 1 if dt in LOW_TYPES else 0
    if core + sec + low == 0:
        core = sec = low = 0
    return core, sec, low

In [6]:
TOP_INSTITUTIONS = {"harvard university","massachusetts institute of technology","stanford university","national institutes of health",
    "university of wisconsin madison","johns hopkins university","california institute of technology","university of michigan ann arbor",
    "cornell university","ibm united states","centre national de la recherche scientifique","university of california los angeles",
    "microsoft united states","university of washington","university of california berkeley",}

In [7]:
INSTITUTION_ALIASES = {"mit": "massachusetts institute of technology","caltech": "california institute of technology",
    "ucla": "university of california los angeles","uc berkeley": "university of california berkeley",
    "university of california, berkeley": "university of california berkeley","cnrs": "centre national de la recherche scientifique",
    "nih": "national institutes of health",}

In [8]:
def normalize_institution(name: str) -> str:
    if not name:
        return ""
    s = unicodedata.normalize("NFKD", str(name)).encode("ascii", "ignore").decode("ascii")
    s = s.lower().strip()
    s = s.replace("&", " and ")
    s = re.sub(r"[\(\)\[\]\{\},]", " ", s)
    s = re.sub(r"[-–—]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [9]:
def is_top_institution_from_name(institution_name: str) -> int:
    norm = normalize_institution(institution_name)
    if norm in INSTITUTION_ALIASES:
        norm = INSTITUTION_ALIASES[norm]
    return 1 if norm in TOP_INSTITUTIONS else 0

In [10]:
df = pd.read_csv("A:/DLSA/Project_work/Datasets/forecasting_dataset3.csv")
df["publication_year"] = pd.to_numeric(df["publication_year"], errors="coerce")
df = df[(df["publication_year"] >= 1990) & (df["publication_year"] <= 2015)].copy()
df["publication_year"] = df["publication_year"].astype(int)
df["title"] = df["title"].fillna("").astype(str)
df["abstract"] = df["abstract"].fillna("").astype(str)

In [11]:
if GLOBAL_POS_N is not None and GLOBAL_NEG_N is not None:
    df_pos_all = df[df["label"] == 1]
    df_neg_all = df[df["label"] == 0]
    if len(df_pos_all) == 0 or len(df_neg_all) == 0:
        raise ValueError("Dataset has only one class; cannot sample pos/neg.")
    n_pos = min(GLOBAL_POS_N, len(df_pos_all))
    n_neg = min(GLOBAL_NEG_N, len(df_neg_all))
    df_pos = df_pos_all.sample(n=n_pos, random_state=42)
    df_neg = df_neg_all.sample(n=n_neg, random_state=42)
    df = (pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42).reset_index(drop=True))
print("After global sampling:", len(df), " rows","| pos:", int((df["label"] == 1).sum()),"| neg:", int((df["label"] == 0).sum()))

After global sampling: 290073  rows | pos: 140073 | neg: 150000


In [12]:
#df["text"] = df["title"].fillna("").astype(str) + " [SEP] " + df["abstract"].fillna("").astype(str)

In [13]:
metadata_cols = ["publication_year","is_eng","abstract_len","title_len","core_research","sec_research","low_novelty","num_authors","avg_author_citations",
    "avg_author_productivity","is_top_institution",]
for c in metadata_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)

In [14]:
train_df = df[(df["publication_year"] >= 1990) & (df["publication_year"] <= 2010)].copy()
val_df   = df[(df["publication_year"] >= 2011) & (df["publication_year"] <= 2013)].copy()
test_df  = df[(df["publication_year"] >= 2014) & (df["publication_year"] <= 2015)].copy()

In [15]:
print("\nSplit sizes:")
print("Train:", len(train_df), " Val:", len(val_df), " Test:", len(test_df))
print("\nTrain label distribution (NO balancing):")
print(train_df["label"].value_counts())


Split sizes:
Train: 206242  Val: 49344  Test: 34487

Train label distribution (NO balancing):
label
1    104877
0    101365
Name: count, dtype: int64


In [16]:
# train_pos = train_df[train_df["label"] == 1]
# train_neg_pool = train_df[train_df["label"] == 0]

# if len(train_pos) == 0 or len(train_neg_pool) == 0:
#     raise ValueError("Training set has only one class after temporal split.")

# train_neg = train_neg_pool.sample(n=min(len(train_pos), len(train_neg_pool)), random_state=42)
# train_df = (
#     pd.concat([train_pos, train_neg])
#     .sample(frac=1, random_state=42)
#     .reset_index(drop=True)
# )

# print("\nTrain after balancing (1:1):", len(train_df))
# print(train_df["label"].value_counts())
#train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [17]:
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [18]:
def tokenize(batch):
    return tokenizer(batch["title"],batch["abstract"],truncation=True,max_length=512,)

In [19]:
train_ds = Dataset.from_dict({"title": train_df["title"].tolist(),"abstract": train_df["abstract"].tolist(),"label": train_df["label"].astype(int).tolist()})
val_ds = Dataset.from_dict({"title": val_df["title"].tolist(),"abstract": val_df["abstract"].tolist(),"label": val_df["label"].astype(int).tolist()})
test_ds = Dataset.from_dict({"title": test_df["title"].tolist(),"abstract": test_df["abstract"].tolist(),"label": test_df["label"].astype(int).tolist()})

In [20]:
train_ds = train_ds.map(tokenize, batched=True, remove_columns=["title", "abstract"])
val_ds   = val_ds.map(tokenize, batched=True, remove_columns=["title", "abstract"])
test_ds  = test_ds.map(tokenize, batched=True, remove_columns=["title", "abstract"])

Map:   0%|          | 0/206242 [00:00<?, ? examples/s]

Map:   0%|          | 0/49344 [00:00<?, ? examples/s]

Map:   0%|          | 0/34487 [00:00<?, ? examples/s]

In [21]:
data_collator = DataCollatorWithPadding(tokenizer)

In [22]:
print("\nHF sizes:", len(train_ds), len(val_ds), len(test_ds))


HF sizes: 206242 49344 34487


In [23]:
def load_scibert_lora(name, num_labels=2):
    base = AutoModelForSequenceClassification.from_pretrained(
        name,
        num_labels=num_labels,
        id2label={0: "NOT_PATENTABLE", 1: "PATENTABLE"},
        label2id={"NOT_PATENTABLE": 0, "PATENTABLE": 1},
        dtype=torch.float32
    )
    base.gradient_checkpointing_enable()
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["query","key","value"],
        bias="none",
        task_type=TaskType.SEQ_CLS
    )
    model = get_peft_model(base, peft_config)
    return model

In [24]:
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits

        loss = F.cross_entropy(
            logits,
            labels,
            weight=self.class_weights.to(logits.device) if self.class_weights is not None else None
        )
        return (loss, outputs) if return_outputs else loss

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }


In [26]:
y_train_all = train_df["label"].values
n_pos = int((y_train_all == 1).sum())
n_neg = int((y_train_all == 0).sum())
if n_pos == 0 or n_neg == 0:
    raise ValueError("Train split has only one class.")

w1 = (n_neg / n_pos) ** 0.5   # softer than n_neg/n_pos (better precision)
class_weights = torch.tensor([1.0, w1], dtype=torch.float32)

print(f"\nTrain n_pos={n_pos}, n_neg={n_neg}, sqrt_pos_weight={w1:.4f}")


Train n_pos=104877, n_neg=101365, sqrt_pos_weight=0.9831


In [27]:
scibert_model = load_scibert_lora(model_name).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
training_args = TrainingArguments(
    output_dir="./scibert_lora_forecasting",
    overwrite_output_dir=True,
    num_train_epochs=4,
    learning_rate=2e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=100,
    report_to="none",
    remove_unused_columns=False,
)

In [29]:
trainer = WeightedTrainer(
    model=scibert_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

  super().__init__(*args, **kwargs)


In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5646,0.535604,0.730666,0.660936,0.763296,0.708438
2,0.5293,0.52912,0.733382,0.663558,0.766889,0.711491
3,0.5234,0.521227,0.739968,0.673911,0.762256,0.715366
4,0.518,0.538719,0.727181,0.64177,0.822957,0.721157


TrainOutput(global_step=51564, training_loss=0.5457140453736042, metrics={'train_runtime': 166004.661, 'train_samples_per_second': 4.97, 'train_steps_per_second': 0.311, 'total_flos': 1.656863690036217e+17, 'train_loss': 0.5457140453736042, 'epoch': 4.0})

In [31]:
def predict_proba_hf(trainer_obj, ds):
    out = trainer_obj.predict(ds)
    logits = out.predictions
    probs = torch.softmax(torch.tensor(logits), dim=1).cpu().numpy()
    return probs[:, 1], out.label_ids

In [32]:
def find_best_threshold(y_true, probs, thr_min=0.05, thr_max=0.95, step=0.02):
    best = {"f1": -1, "thr": None, "p": None, "r": None}
    for thr in np.arange(thr_min, thr_max + 1e-9, step):
        pred = (probs >= thr).astype(int)
        f1 = f1_score(y_true, pred, zero_division=0)
        if f1 > best["f1"]:
            best = {
                "f1": float(f1),
                "thr": float(thr),
                "p": float(precision_score(y_true, pred, zero_division=0)),
                "r": float(recall_score(y_true, pred, zero_division=0)),
            }
    return best

In [33]:
p_text_val, y_val = predict_proba_hf(trainer, val_ds)
p_text_test, y_test = predict_proba_hf(trainer, test_ds)

best_text = find_best_threshold(y_val, p_text_val)
print("\nBest TEXT threshold on VAL:", best_text)
print("len(y_val):", len(y_val))
print("p_text_val shape:", p_text_val.shape)


Best TEXT threshold on VAL: {'f1': 0.7213987537056084, 'thr': 0.4700000000000001, 'p': 0.6290356615319688, 'r': 0.8455538221528861}
len(y_val): 49344
p_text_val shape: (49344,)


In [34]:
X_train = train_df[metadata_cols].values
y_train = train_df["label"].values
X_val = val_df[metadata_cols].values
y_val = val_df["label"].values
X_test = test_df[metadata_cols].values
y_test = test_df["label"].values

In [35]:
#n_pos = (y_train == 1).sum()
#n_neg = (y_train == 0).sum()
scale_pos_weight = (y_train == 0).sum() / max((y_train == 1).sum(), 1)
#scale_pos_weight = n_neg / n_pos
print(f"\nXGB scale_pos_weight={scale_pos_weight:.3f}")


XGB scale_pos_weight=0.967


In [36]:
xgb = XGBClassifier(
    n_estimators=3000,
    max_depth=4,
    min_child_weight=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)

In [37]:
xgb.fit(X_train, y_train,eval_set=[(X_val, y_val)],verbose=200)

[0]	validation_0-logloss:0.69212
[200]	validation_0-logloss:0.65742
[400]	validation_0-logloss:0.65606
[600]	validation_0-logloss:0.65571
[800]	validation_0-logloss:0.65567
[1000]	validation_0-logloss:0.65580
[1200]	validation_0-logloss:0.65593
[1400]	validation_0-logloss:0.65603
[1600]	validation_0-logloss:0.65618
[1800]	validation_0-logloss:0.65635
[2000]	validation_0-logloss:0.65649
[2200]	validation_0-logloss:0.65666
[2400]	validation_0-logloss:0.65692
[2600]	validation_0-logloss:0.65705
[2800]	validation_0-logloss:0.65725
[2999]	validation_0-logloss:0.65745


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [38]:
#p_meta_train = xgb.predict_proba(X_train)[:, 1]
p_meta_val   = xgb.predict_proba(X_val)[:, 1]
p_meta_test  = xgb.predict_proba(X_test)[:, 1]

In [39]:
alphas = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
thresholds = np.arange(0.05, 0.96, 0.02)

In [40]:
best = {"f1": -1, "alpha": None, "thr": None}

In [41]:
for a in alphas:
    p_val_fused = a * p_text_val + (1 - a) * p_meta_val
    for thr in thresholds:
        pred = (p_val_fused >= thr).astype(int)
        f1 = f1_score(y_val, pred, zero_division=0)
        if f1 > best["f1"]:
            best = {
                "f1": float(f1),
                "alpha": float(a),
                "thr": float(thr),
                "p": float(precision_score(y_val, pred, zero_division=0)),
                "r": float(recall_score(y_val, pred, zero_division=0)),
            }

print("\nBest FUSION on VAL:", best)


Best FUSION on VAL: {'f1': 0.7246184196864293, 'alpha': 0.6, 'thr': 0.4700000000000001, 'p': 0.6461373231612473, 'r': 0.8248002647378623}


In [43]:
a = best["alpha"]
thr = best["thr"]
p_test_fused = a * p_text_test + (1 - a) * p_meta_test
pred_test = (p_test_fused >= thr).astype(int)

In [44]:
print("accuracy :", round(accuracy_score(y_test, pred_test), 4))
print("precision:", round(precision_score(y_test, pred_test, zero_division=0), 4))
print("recall   :", round(recall_score(y_test, pred_test, zero_division=0), 4))
print("f1       :", round(f1_score(y_test, pred_test, zero_division=0), 4))

accuracy : 0.7248
precision: 0.6228
recall   : 0.8216
f1       : 0.7085


In [45]:
def predict_from_user_input_v2(
    title: str,
    abstract: str,
    publication_year: int,
    is_eng: int,
    doc_type_str: str,              # e.g., "article", "review", "preprint"
    institution_name: str,          # e.g., "MIT", "Harvard University"
    num_authors: float,
    avg_author_citations: float,
    avg_author_productivity: float,
    tokenizer_obj,
    scibert_model_obj,
    xgb_model_obj,
    alpha: float,
    threshold: float,
    max_length: int = 512,
    device: str = None,
):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    title = "" if title is None else str(title)
    abstract = "" if abstract is None else str(abstract)

    # ---- Text prob
    inputs = tokenizer_obj(
        title, abstract,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    scibert_model_obj.eval()
    with torch.no_grad():
        out = scibert_model_obj(**inputs)
        p_text = torch.softmax(out.logits, dim=1)[0, 1].item()

    # ---- Metadata
    abstract_len = len(abstract.split())
    title_len = len(title.split())

    core_research, sec_research, low_novelty = doc_type_to_flags(doc_type_str)
    is_top_institution = is_top_institution_from_name(institution_name)

    meta_row = np.array([[
        int(publication_year),
        int(is_eng),
        int(abstract_len),
        int(title_len),
        int(core_research),
        int(sec_research),
        int(low_novelty),
        float(num_authors),
        float(avg_author_citations),
        float(avg_author_productivity),
        int(is_top_institution),
    ]], dtype=np.float32)

    p_meta = float(xgb_model_obj.predict_proba(meta_row)[0, 1])

    # ---- Fuse
    p_fused = float(alpha * p_text + (1 - alpha) * p_meta)
    pred = 1 if p_fused >= threshold else 0

    return {
        "p_text_scibert": round(p_text, 4),
        "p_meta_xgboost": round(p_meta, 4),
        "p_fused": round(p_fused, 4),
        "doc_type_flags": {
            "core_research": int(core_research),
            "sec_research": int(sec_research),
            "low_novelty": int(low_novelty),
        },
        "is_top_institution": int(is_top_institution),
        "prediction": "PATENTABLE" if pred == 1 else "NOT_PATENTABLE",
    }

In [46]:
example = predict_from_user_input_v2(
    title="A corona discharge initiated electrochemical electrospray ionization technique",
    abstract="We report here the development of a corona discharge   initiated electrochemical   electrospray ionization   technique using a standard electrospray ion source. This is a new ionization technique ",
    publication_year=2009,
    is_eng=1,
    doc_type_str="article",           
    institution_name="National Institute of Diabetes and Digestive and Kidney Diseases",            
    num_authors=0,
    avg_author_citations=0,
    avg_author_productivity=0,
    tokenizer_obj=tokenizer,
    scibert_model_obj=scibert_model,
    xgb_model_obj=xgb,
    alpha=best["alpha"],
    threshold=best["thr"],
    max_length=512,
    device=device,
)

print("\nUser-input prediction example:\n", example)


User-input prediction example:
 {'p_text_scibert': 0.9287, 'p_meta_xgboost': 0.5002, 'p_fused': 0.7573, 'doc_type_flags': {'core_research': 1, 'sec_research': 0, 'low_novelty': 0}, 'is_top_institution': 0, 'prediction': 'PATENTABLE'}
