### EXTRACT FILE TRAINING

In [1]:
# Upload file
from google.colab import files
uploaded = files.upload()

# Library parsing
from bs4 import BeautifulSoup
import zipfile
import os
import re
import glob
import pandas as pd

zip_file = list(uploaded.keys())[0]  # ambil nama file zip
extract_folder = "xml_folder"

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

Saving Data UAS simdat XML.zip to Data UAS simdat XML.zip


In [2]:
# === Folder XML ===
xml_folder = "/content/xml_folder/Data UAS simdat XML"

# Ambil semua path XML
all_files = glob.glob(os.path.join(xml_folder, "*.xml"))

# Ambil nama file tanpa ekstensi
article_ids = [os.path.basename(f).replace(".xml", "") for f in all_files]

# Hitung
print(f"Jumlah total file (article_id): {len(article_ids)}")

Jumlah total file (article_id): 400


In [3]:
import os
import re
import glob
import pandas as pd
from bs4 import BeautifulSoup

# === Keyword Deteksi ===
primer_keywords = [
    "we collected", "interview", "measured", "generated", "field study",
    "questionnaire", "conducted our study", "samples were collected",
    "data was obtained in this study", "experimentally measured",
    "we conducted", "data collected in this study", "our dataset",
    "we gathered", "collected for this research", "generated during this study", "in-house dataset",
    "experiment produced", "created by authors", "experimentally collected", "primary data", "study participants", "sampled from", "manually collected",
    "we performed an experiment", "collected during our study", "observed during fieldwork",
    "experimental data", "we carried out", "measured in the lab"
]

sekunder_keywords = [
    "obtained from", "taken from", "downloaded", "publicly available",
    "secondary data", "retrieved from", "sourced from", "previous study",
    "archival", "published dataset", "borrowed from", "re-used", "reused",
    "existing dataset", "external data", "data were accessed",
    "gathered from", "data from previous studies",
    "from online database", "data reuse", "secondary analysis", "available at", "extracted from", "according to previous data",
    "cited from", "from repository", "already published", "sourced externally", "data citation",
    "we used dataset from", "acquired from repository", "existing public dataset", "open data", "public dataset", "dataset from literature",
    "data collected by others", "used existing data", "external database", "freely available",
    "pre-existing data", "already collected", "downloadable dataset", "EMPIAR-"
]

# === Helper Function: ekstrak dataset_id dari berbagai pola ===
def extract_dataset_ids(soup):
    dataset_ids = set()

    # 1. <dataset_id>
    for tag in soup.find_all("dataset_id"):
        if tag.text.strip():
            dataset_ids.add(tag.text.strip())

    # 2. <data-set><id>
    for data_set in soup.find_all("data-set"):
        id_tag = data_set.find("id")
        if id_tag and id_tag.text.strip():
            dataset_ids.add(id_tag.text.strip())

    # 3. <ext-link ext-link-type="dataset">
    for ext_link in soup.find_all("ext-link", {"ext-link-type": "dataset"}):
        if ext_link.text.strip():
            dataset_ids.add(ext_link.text.strip())
        elif ext_link.get("xlink:href"):
            dataset_ids.add(ext_link["xlink:href"].strip())

    # 4. DOI: pakai regex dari seluruh teks
    doi_pattern = r"https?://doi\.org/10\.\d{4,9}/[^\s\"<>]+"
    semua_teks = soup.get_text(" ")
    matches = re.findall(doi_pattern, semua_teks)
    dataset_ids.update(matches)

    # 5. Pola khusus
    special_pattern = r"\b(?:CHEMBL\d+|IPR00\d+|GSE\d+|SRP\d+|EMPIAR-\d+|ENSBTAG000\d+|IPR\d+)\b"
    matches = re.findall(special_pattern, soup.get_text())
    dataset_ids.update(matches)

    return list(dataset_ids)

# === Helper Function: deteksi type ===
def detect_type(teks):
    teks = teks.lower()
    for kw in primer_keywords:
        if kw in teks:
            return "Primary"
    for kw in sekunder_keywords:
        if kw in teks:
            return "Secondary"
    return "Missing"

def extract_teks_dataset_id(soup, ds_id):
    # 1. Coba cari di tag-tag dataset
    for tag in soup.find_all(["ext-link", "dataset_id"]):
        if ds_id.lower() in str(tag).lower():
            parent = tag.find_parent()
            if parent:
                return parent.get_text(separator=" ").strip()

    # 2. Coba cari via raw string
    full_text = soup.get_text(separator=" ")
    idx = full_text.lower().find(ds_id.lower())
    if idx != -1:
        start = max(0, idx - 150)
        end = min(len(full_text), idx + len(ds_id) + 150)
        return full_text[start:end].strip()

    # 3. Fallback
    return ""

# === Proses Semua File XML ===
hasil = []

for filepath in glob.glob(os.path.join(xml_folder, "*.xml")):
    article_id = os.path.basename(filepath).replace(".xml", "")
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")
        teks_full = soup.get_text(separator=" ").strip()

        dataset_ids = extract_dataset_ids(soup)
        for ds_id in dataset_ids:
            snippet = extract_teks_dataset_id(soup, ds_id)
            tipe = detect_type(teks_full)
            hasil.append({
                "article_id": article_id,
                "dataset_id": ds_id,
                "teks_dataset_id": snippet,
                "type": tipe
            })

# === Simpan ke CSV ===
df = pd.DataFrame(hasil).drop_duplicates()
# df.to_csv("hasil_dataset_extraction.csv", index=False)
# print("Selesai! Data disimpan ke hasil_dataset_extraction.csv")

In [4]:
df.sample(40)

Unnamed: 0,article_id,dataset_id,teks_dataset_id,type
849,10.1111_ddi.13153,https://doi.org/10.1650/0010‐5422(2001)103[059...,"‐z Peterson , A. T. ( 2001 ). Predicting sp...",Primary
1742,10.3897_neobiota.82.87455,https://doi.org/10.1111/j.1365-2028.2007.00892.x,008) Invasive Australian acacias on western In...,Primary
1766,10.1029_2023ea002840,https://doi.org/10.1038/nature06164,"., & Fernandez, D. P. (2007). Millennial-scale...",Primary
1184,10.1029_2021gl096173,https://doi.org/10.1002/2016JD025112,Reassessing properties and radiative forcing o...,Missing
2034,10.1186_s12859-016-0922-z,GSE70285,Twenty RNA-seq data generated from metastatic ...,Primary
1733,10.3897_neobiota.82.87455,https://doi.org/10.1111/j.1472-4642.2011.00782.x,"2011 \n \n \n Richardson DM, Rejmánek M (2011)...",Primary
1603,10.3897_zoologia.36.e32053,https://doi.org/10.1016/j.ympev.2018.10.039,Espeland M Breinholt JW Barbosa EP Casagrande ...,Secondary
757,10.1101_2022.02.10.480011,https://doi.org/10.1093/gigascience/giaa153,"Sims, Y., Torrance, J., Tracey, A., & Wood, J....",Primary
483,10.1029_2023wr035126,https://doi.org/10.1111/j.2517-6161.1974.tb009...,"ne, M. (1974). Cross-validatory choice and ass...",Primary
378,10.1111_1365-2435.13569,https://doi.org/10.1002/ece3.3422,queezing out the last egg—Annual fish increase...,Primary


In [5]:
jumlah_article_id = df['article_id'].nunique()
print(f"Jumlah artikel unik: {jumlah_article_id}")

Jumlah artikel unik: 151


In [6]:
# --- Ambil semua article_id dari file XML ---
all_files = glob.glob(os.path.join(xml_folder, "*.xml"))
all_article_ids = {os.path.basename(f).replace(".xml", "") for f in all_files}

# --- Ambil article_id yang berhasil diekstrak ---
extracted_article_ids = set(df['article_id'].unique())

# --- Cari yang tidak ada dataset_id-nya ---
missing_article_ids = all_article_ids - extracted_article_ids

# --- Tambahkan ke DataFrame dengan nilai Missing ---
missing_rows = [{'article_id': aid, 'dataset_id': 'Missing', 'type': 'Missing'} for aid in missing_article_ids]

# --- Gabungkan ---
df = pd.concat([df, pd.DataFrame(missing_rows)], ignore_index=True)

# --- Simpan ke CSV ---
df.to_csv("hasil_dataset_extraction_dengan_missing.csv", index=False)
print(f"Final total artikel: {df['article_id'].nunique()} dari {len(all_files)} file")

Final total artikel: 400 dari 400 file


### EXPLORATORY DATA ANALYSIS

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2286 entries, 0 to 2285
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   article_id       2286 non-null   object
 1   dataset_id       2286 non-null   object
 2   teks_dataset_id  2037 non-null   object
 3   type             2286 non-null   object
dtypes: object(4)
memory usage: 71.6+ KB


In [8]:
# Tetap simpan full dataframe (termasuk 'Missing')
df['label_num'] = df['type'].map({'Primary': 0, 'Secondary': 1, 'Missing':2})

In [9]:
df_train = df[df['label_num'].notna()].copy()

### CLEANSING

In [10]:
import re

def clean_teks_dataset_id(teks, max_len=300):
    """
    Membersihkan teks_dataset_id agar siap dipakai untuk model ML.

    Args:
        teks (str): Teks asli yang ingin dibersihkan.
        max_len (int): Panjang maksimum teks setelah dibersihkan.

    Returns:
        str: Teks yang sudah dibersihkan.
    """
    if not isinstance(teks, str) or teks.strip() == "":
        return ""

    # Lowercase semua
    teks = teks.lower()

    # Hapus URL (kecuali kamu mau simpan dataset_id URL)
    teks = re.sub(r'https?://\S+', ' ', teks)

    # Hapus DOI format (kadang berupa doi:10.xxxx)
    teks = re.sub(r'doi:\s*\S+', ' ', teks)

    # Hapus referensi angka dalam tanda kurung [42], (2020), dll
    teks = re.sub(r'\[[^\]]*\]', ' ', teks)
    teks = re.sub(r'\([^\)]*\d{4}[^\)]*\)', ' ', teks)  # tahun seperti (2020)

    # Hapus karakter non-alfabet (selain titik, koma, angka, dan spasi)
    teks = re.sub(r'[^a-z0-9.,\s]', ' ', teks)

    # Ubah banyak spasi jadi satu
    teks = re.sub(r'\s+', ' ', teks).strip()

    # Potong kalau terlalu panjang (misal > 300 karakter)
    if len(teks) > max_len:
        teks = teks[:max_len].rsplit(' ', 1)[0]  # biar gak motong di tengah kata

    return teks

In [11]:
df['cleaned_teks'] = df['teks_dataset_id'].apply(clean_teks_dataset_id)

In [12]:
df['label_num'] = df['type'].map({'Primary': 0, 'Secondary': 1, 'Missing': 2})

In [13]:
df.head(10)

Unnamed: 0,article_id,dataset_id,teks_dataset_id,type,label_num,cleaned_teks
0,10.1016_j.molcel.2018.11.006,GSE69140,GEO: GSE69140,Primary,0,geo gse69140
1,10.1016_j.molcel.2018.11.006,GSE44672,GEO: GSE44672,Primary,0,geo gse44672
2,10.1016_j.molcel.2018.11.006,https://doi.org/10.17632/jb4jjxsbb7.1,https://doi.org/10.17632/jb4jjxsbb7.1,Primary,0,
3,10.1016_j.molcel.2018.11.006,https://doi.org/10.17632/xtb4mkvf8f.1,https://doi.org/10.17632/xtb4mkvf8f.1,Primary,0,
4,10.1016_j.molcel.2018.11.006,GSE79360,GEO: GSE79360,Primary,0,geo gse79360
5,10.1016_j.molcel.2018.11.006,https://doi.org/10.1016/j.molcel.2018.11.006,Supplemental Information includes seven figure...,Primary,0,supplemental information includes seven figure...
6,10.1016_j.molcel.2018.11.006,GSE89420,GEO: GSE89420,Primary,0,geo gse89420
7,10.1016_j.molcel.2018.11.006,GSE61188,GEO: GSE61188,Primary,0,geo gse61188
8,10.1016_j.molcel.2018.11.006,GSE52279,GEO: GSE52279,Primary,0,geo gse52279
9,10.1186_s13059-020-02048-6,GSE141115,"Denisenko E, Guo B, Jones M, Hou R, de Kock L,...",Primary,0,"denisenko e, guo b, jones m, hou r, de kock l,..."


### TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['cleaned_teks'])
y = df['label_num']

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### RANDOM FOREST

In [16]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["Primary", "Secondary","Missing"]))

[[565   0   7]
 [ 16   1   1]
 [  9   0  87]]
              precision    recall  f1-score   support

     Primary       0.96      0.99      0.97       572
   Secondary       1.00      0.06      0.11        18
     Missing       0.92      0.91      0.91        96

    accuracy                           0.95       686
   macro avg       0.96      0.65      0.66       686
weighted avg       0.95      0.95      0.94       686



In [18]:
df['contains_collected'] = df['cleaned_teks'].str.contains("collect|gather|measure", regex=True).astype(int)
df['contains_downloaded'] = df['cleaned_teks'].str.contains("download|obtain|reuse|source", regex=True).astype(int)

In [19]:
import scipy

X_additional = df[['contains_collected', 'contains_downloaded']].values
X_combined = scipy.sparse.hstack([X, X_additional])

In [20]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["Primary", "Secondary","Missing"]))

Akurasi: 0.9518950437317785
              precision    recall  f1-score   support

     Primary       0.96      0.99      0.97       572
   Secondary       1.00      0.06      0.11        18
     Missing       0.92      0.91      0.91        96

    accuracy                           0.95       686
   macro avg       0.96      0.65      0.66       686
weighted avg       0.95      0.95      0.94       686



### simpan model

In [21]:
import joblib

# Simpan vectorizer & model
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(model, "model.pkl")

['model.pkl']

# TESTING

In [22]:
# =============================
# 1. IMPORT & EKSTRAK FILE ZIP
# =============================
from google.colab import files
import zipfile, os, re, glob
import pandas as pd
from bs4 import BeautifulSoup

# Upload file zip dari data test XML
uploaded = files.upload()
zip_file = list(uploaded.keys())[0]
test_folder = "/content/datasetTest/XML"

# Ekstrak ke folder
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(test_folder)

Saving DataTestUASSIMDATXML.zip to DataTestUASSIMDATXML.zip


In [26]:
# =============================
# 2. DEFINISI FUNGSI EKSTRAKSI
# =============================

def extract_dataset_ids(soup):
    dataset_ids = set()

    for tag in soup.find_all("dataset_id"):
        if tag.text.strip():
            dataset_ids.add(tag.text.strip())

    for data_set in soup.find_all("data-set"):
        id_tag = data_set.find("id")
        if id_tag and id_tag.text.strip():
            dataset_ids.add(id_tag.text.strip())

    for ext_link in soup.find_all("ext-link", {"ext-link-type": "dataset"}):
        if ext_link.text.strip():
            dataset_ids.add(ext_link.text.strip())
        elif ext_link.get("xlink:href"):
            dataset_ids.add(ext_link["xlink:href"].strip())

    doi_pattern = r"https?://doi\.org/10\.\d{4,9}/[^\s\"<>]+"
    semua_teks = soup.get_text(" ")
    matches = re.findall(doi_pattern, semua_teks)
    dataset_ids.update(matches)

    special_pattern = r"\b(?:CHEMBL\d+|IPR00\d+|GSE\d+|SRP\d+|EMPIAR-\d+|ENSBTAG000\d+|IPR\d+)\b"
    matches = re.findall(special_pattern, soup.get_text())
    dataset_ids.update(matches)

    return list(dataset_ids)

def extract_text_around_dataset(teks, dataset_id, window=150):
    teks = teks.replace('\n', ' ')
    index = teks.find(dataset_id)
    if index == -1:
        return ""
    start = max(index - window, 0)
    end = index + len(dataset_id) + window
    return teks[start:end]

# =============================
# 3. EKSTRAK DATA TEST
# =============================

hasil_test = []

for filepath in glob.glob(os.path.join(test_folder, "**/*.xml"), recursive=True):
    article_id = os.path.basename(filepath).replace(".xml", "")
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "xml")
        full_teks = soup.get_text(separator=" ")

        dataset_ids = extract_dataset_ids(soup)
        for ds_id in dataset_ids:
            teks_ds = extract_text_around_dataset(full_teks, ds_id)
            hasil_test.append({
                "article_id": article_id,
                "dataset_id": ds_id,
                "teks_dataset_id": teks_ds
            })

df_test = pd.DataFrame(hasil_test)

# =============================
# 4. CLEANING TEKS
# =============================

def clean_teks_dataset_id(teks, max_len=300):
    if not isinstance(teks, str) or teks.strip() == "":
        return ""
    teks = teks.lower()
    teks = re.sub(r'https?://\S+', ' ', teks)
    teks = re.sub(r'doi:\s*\S+', ' ', teks)
    teks = re.sub(r'\[[^\]]*\]', ' ', teks)
    teks = re.sub(r'\([^\)]*\d{4}[^\)]*\)', ' ', teks)
    teks = re.sub(r'[^a-z0-9.,\s]', ' ', teks)
    teks = re.sub(r'\s+', ' ', teks).strip()
    if len(teks) > max_len:
        teks = teks[:max_len].rsplit(' ', 1)[0]
    return teks

df_test['cleaned_teks'] = df_test['teks_dataset_id'].apply(clean_teks_dataset_id)

# =============================
# 5. LOAD MODEL & VECTORIZER
# =============================
import joblib

vectorizer = joblib.load("vectorizer.pkl")   # hasil training
model = joblib.load("model.pkl")             # hasil training

# =============================
# 6. PREDIKSI TYPE
# =============================

X_test = vectorizer.transform(df_test['cleaned_teks'])
y_pred = model.predict(X_test)

# Mapping kembali ke label
label_map_reverse = {0: 'Primary', 1: 'Secondary', 2:'Missing'}
df_test['type'] = [label_map_reverse[i] for i in y_pred]

# =============================
# 7. SIMPAN KE FILE CSV
# =============================

df_test[['article_id', 'dataset_id', 'type']].to_csv("hasil_prediksi_type.csv", index=False)
print("✅ Hasil prediksi disimpan ke 'hasil_prediksi_type.csv'")

✅ Hasil prediksi disimpan ke 'hasil_prediksi_type.csv'


In [27]:
df_test.head()

Unnamed: 0,article_id,dataset_id,teks_dataset_id,cleaned_teks,type
0,10.1002_ece3.5395,https://doi.org/10.5441/001/1.71r7pp6q,".5441/001/1.v1cs4nn0 , https://doi.org/10.544...",".5441 001 1.v1cs4nn0 , , , , . 1 introduction ...",Primary
1,10.1002_ece3.5395,https://doi.org/10.5441/001/1.4192t2j4,the Movebank Data Repository ( https://www.mo...,"the movebank data repository with , , , , . 1 ...",Primary
2,10.1002_ece3.5395,https://doi.org/10.5441/001/1.v1cs4nn0,Data Availability Statement: The data use...,data availability statement the data used for ...,Primary
3,10.1002_ece3.5395,https://doi.org/10.5441/001/1.ck04mn78,w.movebank.org ): with https://doi.org/10.544...,"w.movebank.org with , , , , . 1 introduction m...",Primary
4,10.1002_ece3.5395,https://doi.org/10.5441/001/1.c42j3js7,used for this study are available through the...,used for this study are available through the ...,Primary


In [28]:
df_test[['article_id', 'dataset_id', 'type']].head(50)

Unnamed: 0,article_id,dataset_id,type
0,10.1002_ece3.5395,https://doi.org/10.5441/001/1.71r7pp6q,Primary
1,10.1002_ece3.5395,https://doi.org/10.5441/001/1.4192t2j4,Primary
2,10.1002_ece3.5395,https://doi.org/10.5441/001/1.v1cs4nn0,Primary
3,10.1002_ece3.5395,https://doi.org/10.5441/001/1.ck04mn78,Primary
4,10.1002_ece3.5395,https://doi.org/10.5441/001/1.c42j3js7,Primary
5,10.1002_mp.14424,https://doi.org/10.7937/tcia.2020.6c7y‐gq39,Primary
6,10.1002_ece3.6144,https://doi.org/10.5061/dryad.zw3r22854,Primary
7,10.1002_ece3.6303,https://doi.org/10.5061/dryad.37pvmcvgb,Primary
8,10.1002_esp.5090,https://doi.org/10.1016/j.quageo.2007.12.001,Primary
9,10.1002_esp.5090,https://doi.org/10.1002/2016GL070815,Primary
