In [None]:
# Library imports (standard library first, then third-party)
import glob
import os
import re
import zipfile

import pandas as pd
import torch
from bs4 import BeautifulSoup
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification

In [None]:
# Folder that holds one XML file per training article.
xml_folder = "train/XML"

# List every XML path, then derive each article id by removing the ".xml"
# text from the file's base name.
all_files = glob.glob(os.path.join(xml_folder, "*.xml"))
article_ids = [
    file_name.replace(".xml", "")
    for file_name in map(os.path.basename, all_files)
]
print(f"Jumlah total file (article_id): {len(article_ids)}")


Jumlah total file (article_id): 400


In [None]:
import pandas as pd

# Read the provided label table and preview its column index and first rows.
df_labels = pd.read_csv("train_labels.csv")
print(df_labels.columns, df_labels.head(), sep="\n")


Index(['article_id', 'dataset_id', 'type'], dtype='object')
               article_id                      dataset_id     type
0    10.1002_2017jc013030  https://doi.org/10.17882/49388  Primary
1  10.1002_anie.201916483                         Missing  Missing
2  10.1002_anie.202005531                         Missing  Missing
3  10.1002_anie.202007717                         Missing  Missing
4  10.1002_chem.201902131                         Missing  Missing


## Extraction: dataset IDs and context snippets

In [None]:
import os
import re
import glob
import pandas as pd
from bs4 import BeautifulSoup

# Folder that holds one XML file per training article.
xml_folder = "train/XML"

# Phrases taken as a sign that the authors produced the data themselves.
# detect_type() checks this list first.
primer_keywords = [
    "we collected",
    "interview",
    "measured",
    "generated",
    "field study",
    "questionnaire",
    "conducted our study",
    "samples were collected",
    "data was obtained in this study",
    "experimentally measured",
    "we conducted",
    "data collected in this study",
    "our dataset",
    "we gathered",
    "collected for this research",
    "generated during this study",
    "in-house dataset",
    "experiment produced",
    "created by authors",
    "experimentally collected",
    "primary data",
    "study participants",
    "sampled from",
    "manually collected",
    "we performed an experiment",
    "collected during our study",
    "observed during fieldwork",
    "experimental data",
    "we carried out",
    "measured in the lab",
]

# Phrases taken as a sign that the data was reused from an existing source.
sekunder_keywords = [
    "obtained from",
    "taken from",
    "downloaded",
    "publicly available",
    "secondary data",
    "retrieved from",
    "sourced from",
    "previous study",
    "archival",
    "published dataset",
    "borrowed from",
    "re-used",
    "reused",
    "existing dataset",
    "external data",
    "data were accessed",
    "gathered from",
    "data from previous studies",
    "from online database",
    "data reuse",
    "secondary analysis",
    "available at",
    "extracted from",
    "according to previous data",
    "cited from",
    "from repository",
    "already published",
    "sourced externally",
    "data citation",
    "we used dataset from",
    "acquired from repository",
    "existing public dataset",
    "open data",
    "public dataset",
    "dataset from literature",
    "data collected by others",
    "used existing data",
    "external database",
    "freely available",
    "pre-existing data",
    "already collected",
    "downloadable dataset",
    # NOTE(review): this entry contains capitals; a matcher that lowercases
    # only the searched text will never hit it.
    "EMPIAR-",
]

def _trim_doi_url(url):
    """Remove sentence punctuation that the DOI regex swallowed at the end of a URL."""
    url = url.rstrip(".,;:")
    # Drop a closing bracket only when it has no opening partner inside the URL,
    # e.g. "(see https://doi.org/10.1234/abc)".
    for opener, closer in (("(", ")"), ("[", "]")):
        while url.endswith(closer) and url.count(closer) > url.count(opener):
            url = url[:-1].rstrip(".,;:")
    return url


def extract_dataset_ids(soup):
    """Collect candidate dataset identifiers from a parsed article.

    Sources, in order:
      1. the text of every ``<dataset_id>`` tag;
      2. the text of the ``<id>`` child of every ``<data-set>`` tag;
      3. every ``<ext-link ext-link-type="dataset">`` tag (its text, or its
         ``xlink:href`` attribute when the text is empty);
      4. ``http(s)://doi.org/10.…`` URLs found anywhere in the document text,
         with trailing sentence punctuation removed;
      5. accession-style identifiers (CHEMBL, GSE, SRP, EMPIAR-, ENSBTAG000, IPR).

    Args:
        soup: Parsed document exposing ``find_all`` and ``get_text``
            (a BeautifulSoup object in this notebook).

    Returns:
        list[str]: Unique identifiers, sorted so the result is reproducible.
    """
    dataset_ids = set()

    for tag in soup.find_all("dataset_id"):
        value = tag.text.strip()
        if value:
            dataset_ids.add(value)

    for data_set in soup.find_all("data-set"):
        id_tag = data_set.find("id")
        if id_tag is not None:
            value = id_tag.text.strip()
            if value:
                dataset_ids.add(value)

    for ext_link in soup.find_all("ext-link", {"ext-link-type": "dataset"}):
        value = ext_link.text.strip()
        if value:
            dataset_ids.add(value)
        else:
            href = ext_link.get("xlink:href")
            if href:
                dataset_ids.add(href.strip())

    doi_pattern = r"https?://doi\.org/10\.\d{4,9}/[^\s\"<>]+"
    semua_teks = soup.get_text(" ")
    for raw_url in re.findall(doi_pattern, semua_teks):
        # The greedy tail of the pattern also captures a sentence-final "." or
        # ")" which would make the id differ from the real DOI.
        url = _trim_doi_url(raw_url)
        if not url.endswith("/"):  # nothing left after the DOI prefix
            dataset_ids.add(url)

    # "IPR\d+" already covers the former "IPR00\d+" alternative.
    special_pattern = r"\b(?:CHEMBL\d+|GSE\d+|SRP\d+|EMPIAR-\d+|ENSBTAG000\d+|IPR\d+)\b"
    dataset_ids.update(re.findall(special_pattern, semua_teks))

    return sorted(dataset_ids)

def extract_teks_dataset_id(soup, ds_id):
    """Return a text snippet that surrounds ``ds_id`` in the document.

    First choice: the text of the parent of the first ``<ext-link>`` or
    ``<dataset_id>`` tag whose markup contains ``ds_id`` (case-insensitive)
    and that has a parent. Fallback: a window of up to 150 characters on each
    side of the first case-insensitive occurrence in the full document text.
    Returns "" when the id is found nowhere.
    """
    needle = ds_id.lower()

    for candidate in soup.find_all(["ext-link", "dataset_id"]):
        if needle not in str(candidate).lower():
            continue
        enclosing = candidate.find_parent()
        if enclosing:
            return enclosing.get_text(separator=" ").strip()

    document_text = soup.get_text(separator=" ")
    position = document_text.lower().find(needle)
    if position == -1:
        return ""

    window_start = max(0, position - 150)
    window_end = min(len(document_text), position + len(ds_id) + 150)
    return document_text[window_start:window_end].strip()

def detect_type(teks):
    """Classify a text as "Primary", "Secondary" or "Missing" by keyword lookup.

    The text is lowercased and searched for any entry of ``primer_keywords``
    first, then for any entry of ``sekunder_keywords``; the first list with a
    hit decides the label.

    Args:
        teks (str): Text to inspect.

    Returns:
        str: "Primary", "Secondary", or "Missing" when no keyword occurs.
    """
    teks = teks.lower()
    # The keywords are lowercased as well: the text has just been lowercased,
    # so an entry written with capitals (such as "EMPIAR-") could otherwise
    # never match.
    if any(kw.lower() in teks for kw in primer_keywords):
        return "Primary"
    if any(kw.lower() in teks for kw in sekunder_keywords):
        return "Secondary"
    return "Missing"

# Build the heuristic training table: one row per (article, dataset id) pair.
hasil = []
# Sorted because glob's order depends on the filesystem, and the row order of
# this table feeds the seeded train/eval split later in the notebook.
all_files = sorted(glob.glob(os.path.join(xml_folder, "*.xml")))
for filepath in all_files:
    article_id = os.path.basename(filepath).replace(".xml", "")
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")
    teks_full = soup.get_text(separator=" ").strip()
    dataset_ids = extract_dataset_ids(soup)
    if not dataset_ids:
        continue
    # The label is derived from the whole article text, so it is the same for
    # every dataset id of this file: compute it once rather than once per id.
    # NOTE(review): because of this, all datasets of an article share a label
    # and the snippet is not used for labelling; the true labels loaded from
    # train_labels.csv are not used either. Confirm this is intended.
    tipe = detect_type(teks_full)
    for ds_id in dataset_ids:
        snippet = extract_teks_dataset_id(soup, ds_id)
        hasil.append({
            "article_id": article_id,
            "dataset_id": ds_id,
            "teks_dataset_id": snippet,
            "type": tipe
        })

# Explicit columns keep the schema (and the 'article_id' lookup below) valid
# even when no dataset id was extracted from any file.
df = pd.DataFrame(
    hasil, columns=["article_id", "dataset_id", "teks_dataset_id", "type"]
).drop_duplicates()

# --- Add the files that yielded no dataset id (Missing) ---
all_article_ids = {os.path.basename(f).replace(".xml", "") for f in all_files}
extracted_article_ids = set(df['article_id'].unique())
missing_article_ids = all_article_ids - extracted_article_ids
missing_rows = [
    {'article_id': aid, 'dataset_id': 'Missing', 'type': 'Missing'}
    for aid in sorted(missing_article_ids)
]
if missing_rows:
    df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True)
else:
    df = df.reset_index(drop=True)

df.to_csv("train.csv", index=False)


In [None]:
# Reload the extracted table and keep only rows with a usable label
# (this drops the "Missing" rows).
df = (
    pd.read_csv("train.csv")
    .loc[lambda frame: frame['type'].isin(['Primary', 'Secondary'])]
    .copy()
)

# Add the numeric label column: Primary -> 0, Secondary -> 1.
df['label_num'] = df['type'].map({'Primary': 0, 'Secondary': 1})


## Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "allenai/scibert_scivocab_uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Clean the text
def clean_teks_dataset_id(teks):
    """Trim ``teks`` and collapse each run of whitespace into one space.

    Anything that is not a ``str`` (for example a NaN read back from the CSV)
    yields the empty string.
    """
    if not isinstance(teks, str):
        return ""
    trimmed = teks.strip()
    return re.sub(r"\s+", " ", trimmed)


# Whitespace-normalised snippet text; this is the column SciBERTDataset reads.
df['cleaned_teks'] = df['teks_dataset_id'].apply(clean_teks_dataset_id)
# Numeric label: Primary -> 0, Secondary -> 1 (same mapping as in the cell that reloads train.csv).
df['label_num'] = df['type'].map({'Primary': 0, 'Secondary': 1})

# Tokenization
def tokenize_text(text, max_length=512):
    """Encode ``text`` with the module-level ``tokenizer``.

    The call asks for padding to ``max_length``, truncation, and PyTorch
    tensors ("pt") as the return format.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

# Encode every cleaned snippet with the default max_length.
# NOTE(review): no later cell visible here reads 'tokenized'; SciBERTDataset
# tokenizes 'cleaned_teks' itself. Confirm the column is still needed.
df['tokenized'] = df['cleaned_teks'].apply(tokenize_text)


## Model: dataset wrapper

In [None]:
class SciBERTDataset(Dataset):
    """Dataset that tokenizes one text per item, on access.

    Args:
        df: DataFrame with a 'cleaned_teks' column and, unless ``is_test`` is
            true, a 'label_num' column.
        tokenizer: Callable accepting ``(text, padding=, truncation=,
            max_length=, return_tensors=)`` and returning a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length (int): Length passed to the tokenizer for padding/truncation.
        is_test (bool): When true, no labels are read or returned.
    """

    def __init__(self, df, tokenizer, max_length=512, is_test=False):
        self.texts = df['cleaned_teks'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        if not is_test:
            self.labels = df['label_num'].tolist()

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at ``idx`` (plus 'labels' unless ``is_test``)."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading axis (presumably the batch axis
        # of size 1 added for a single text). A bare squeeze() would also drop
        # the sequence axis whenever max_length == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }
        if not self.is_test:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [None]:
test_folder = "test/XML"

# Same extraction as for the training articles, without the heuristic label:
# one row per (article, dataset id) pair with its context snippet.
hasil_test = []
# Sorted so the row order of df_test does not depend on the filesystem.
for filepath in sorted(glob.glob(os.path.join(test_folder, "*.xml"))):
    article_id = os.path.basename(filepath).replace(".xml", "")
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")
    # The full article text is not needed here (no labelling), so it is no
    # longer extracted.
    dataset_ids = extract_dataset_ids(soup)
    for ds_id in dataset_ids:
        snippet = extract_teks_dataset_id(soup, ds_id)
        hasil_test.append({
            "article_id": article_id,
            "dataset_id": ds_id,
            "teks_dataset_id": snippet
        })

# Explicit columns keep the expected schema even when nothing was extracted,
# so later column lookups on df_test do not raise KeyError.
df_test = pd.DataFrame(
    hasil_test, columns=["article_id", "dataset_id", "teks_dataset_id"]
).drop_duplicates()


In [None]:
# Apply the same whitespace clean-up used for the training snippets.
df_test['cleaned_teks'] = df_test['teks_dataset_id'].apply(clean_teks_dataset_id)

In [None]:
# Show the columns of the test frame.
# NOTE(review): the saved output lists 'predicted_type', which is only added by
# the prediction cell further down, so this cell was apparently run out of order.
print(df_test.columns)

Index(['article_id', 'dataset_id', 'teks_dataset_id', 'cleaned_teks',
       'predicted_type'],
      dtype='object')


## Fine-tuning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import TrainingArguments, Trainer
import torch
import numpy as np

# === 0. Load the classifier to fine-tune ===
# No earlier visible cell defines `model`, so it is created here to keep the
# notebook runnable from a fresh kernel. The checkpoint matches the tokenizer;
# two output classes match label_num (0 = Primary, 1 = Secondary).
# NOTE(review): if a model was configured differently elsewhere, align this call.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased", num_labels=2
)

# === 1. Split into training and evaluation data (stratified on the label) ===
train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label_num'], random_state=42)

# === 2. Build a dataset for each split ===
train_dataset = SciBERTDataset(train_df, tokenizer, max_length=300)
eval_dataset = SciBERTDataset(eval_df, tokenizer, max_length=300)

# === 3. Compute class weights from the training labels ===
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_df['label_num']),
                                     y=train_df['label_num'])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(model.device)

# === 4. Evaluation metrics ===
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    The predicted class is the argmax of the logits over the last axis.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average='macro')
    return {'accuracy': accuracy, 'f1': macro_f1}

# === 5. TrainingArguments ===
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation settings.
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate and save once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# === 6. Custom Trainer dengan class weights ===
from torch.nn import CrossEntropyLoss
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Mapping of batch tensors, including "labels".
            return_outputs (bool): When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Run the forward pass without the labels: when they are passed, the
        # model also computes its own unweighted loss, which is discarded here.
        # A filtered copy is used so the caller's `inputs` is not mutated.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # The weights were placed on a device when they were created; move them
        # to the logits' device so the loss works wherever the model now runs.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# === 7. Initialise the trainer and run training ===
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Last expression of the cell, so the value returned by train() is displayed.
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0104,0.833421,0.977778,0.69434
2,0.0001,0.292388,0.987654,0.888128
3,0.0001,0.407008,0.990123,0.897468
4,0.0,0.417219,0.990123,0.897468


TrainOutput(global_step=812, training_loss=0.07981926375231269, metrics={'train_runtime': 189.3461, 'train_samples_per_second': 34.16, 'train_steps_per_second': 4.288, 'total_flos': 997149788712000.0, 'train_loss': 0.07981926375231269, 'epoch': 4.0})

## Evaluation

In [None]:
# Evaluate on eval_dataset with the metrics from compute_metrics.
# NOTE(review): the saved eval_loss equals the epoch-2 validation loss in the
# training log, presumably because load_best_model_at_end restored that checkpoint.
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.292387992143631, 'eval_accuracy': 0.9876543209876543, 'eval_f1': 0.888127727749848, 'eval_runtime': 3.1915, 'eval_samples_per_second': 126.899, 'eval_steps_per_second': 15.98, 'epoch': 4.0}


## Prediction

In [None]:
# Wrap the test snippets; is_test=True means no labels are read or returned.
# max_length matches the value used for the train/eval datasets.
test_dataset = SciBERTDataset(df_test, tokenizer, max_length=300, is_test=True)

In [None]:
# === 8. Predict on the test data ===
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)

# === 9. Save the predictions to a CSV file ===
# Class 0 is written as 'Primary'; any other class as 'Secondary'.
predicted_names = []
for label in pred_labels:
    if label == 0:
        predicted_names.append('Primary')
    else:
        predicted_names.append('Secondary')
df_test['predicted_type'] = predicted_names

submission_columns = ['article_id', 'dataset_id', 'predicted_type']
df_test[submission_columns].to_csv("submission.csv", index=False)

In [None]:
# Preview the first test rows together with their predicted type.
print(df_test.head())

               article_id                                   dataset_id  \
0    10.1002_2017jc013030        https://doi.org/10.5194/essd-2017-58.   
1  10.1002_cssc.202201821       https://doi.org/10.5281/zenodo.7074790   
2       10.1002_ece3.3985    https://doi.org/10.1163/1937240X-00002254   
3       10.1002_ece3.3985    https://doi.org/10.1017/S0006323199005423   
4       10.1002_ece3.3985  https://doi.org/10.1534/genetics.108.100214   

                                     teks_dataset_id  \
0  wo databases derived from BGC-Argo float measu...   
1  A previous version of this manuscript has been...   
2  Bailie ,  D. A. \n ,  \n Fitzpatrick ,  S. \n ...   
3  Jennions ,  M. D. \n , &  \n Petrie ,  M. \n  ...   
4  Wang ,  J. \n , &  \n Santure ,  A. W. \n  ( 2...   

                                        cleaned_teks predicted_type  
0  wo databases derived from BGC-Argo float measu...        Primary  
1  A previous version of this manuscript has been...        Primary  
2  Baili

In [None]:
# Count the predictions per class.
# NOTE(review): the saved output shows all 193 rows predicted as Primary, so the
# classifier never predicts Secondary on this test set; worth investigating.
print(df_test['predicted_type'].value_counts())

predicted_type
Primary    193
Name: count, dtype: int64
