### EKSTRAK DATA

EKSTRAK FILE SUPAYA DATA TERSTRUKTUR

In [1]:
# Upload file
# Runs Colab's upload helper and keeps its result in `uploaded`; a later cell
# reads `uploaded.keys()` to obtain the name of the uploaded zip archive.
# NOTE(review): presumably the uploaded file is also written to the working
# directory (the cell output says "Saving ... to ...") — confirm.
from google.colab import files
uploaded = files.upload()

Saving Data UAS simdat XML.zip to Data UAS simdat XML.zip


In [2]:
# Library parsing
from bs4 import BeautifulSoup
import zipfile
import os
import re
import glob
import pandas as pd

# Name of the first uploaded file (expected to be the zip archive).
zip_file = list(uploaded)[0]
# Directory that receives the unpacked XML files.
extract_folder = "xml_folder"

# Unpack the whole archive into `extract_folder`.
with zipfile.ZipFile(zip_file) as archive:
    archive.extractall(path=extract_folder)

In [5]:
# EXTRACTION FUNCTIONS

# Lower-cased section headings that identify a methodology section.
_METHOD_KEYWORDS = (
    "method", "methods", "materials and methods", "data and methods",
    "methodology", "experimental section",
)
_SECTION_TAGS = ["sec", "div", "section"]
_HEADING_TAGS = ["title", "head"]
# Containers whose text is scanned for sentences mentioning "data".
_TEXT_TAGS = ["abstract", "body", "sec", "p"]
# Maximum number of "data" sentences kept in the output string.
_MAX_DATA_SENTENCES = 10

_YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")
_WHEN_YEAR_RE = re.compile(r"\d{4}")
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
_DOI_URL_RE = re.compile(r"https://doi\.org/[^\s\"'<>]+")
_ACCESSION_RE = re.compile(
    r"\b(?:CHEMBL\d+|IPR00\d+|GSE\d+|SRP\d+|EMPIAR-\d+|ENSBTAG000\d+|IPR\d+)\b"
)


def _extract_title(soup):
    """Return the stripped article title, or None when no title tag exists."""
    lookups = (
        ("article-title", {}),
        ("title", {"type": "main"}),
        ("title", {}),
    )
    for name, attrs in lookups:
        tag = soup.find(name, attrs)
        if tag is not None:
            return tag.text.strip()
    return None


def _join_name(first_tag, last_tag):
    """Join the text of two optional name tags into 'First Last'."""
    parts = [
        tag.text.strip() if tag is not None else ""
        for tag in (first_tag, last_tag)
    ]
    return " ".join(parts).strip()


def _extract_authors(soup):
    """Return author names found in JATS <contrib> and TEI <author> markup."""
    authors = []

    # 1. NLM/JATS: <contrib contrib-type="author"><name>...
    for contrib in soup.find_all("contrib", {"contrib-type": "author"}):
        name_tag = contrib.find("name")
        if name_tag is None:
            continue
        full_name = _join_name(
            name_tag.find("given-names"), name_tag.find("surname")
        )
        if full_name:
            authors.append(full_name)

    # 2. TEI-like: <author><persName><forename>, <surname>
    for author in soup.find_all("author"):
        # Authors inside a bibliography list belong to cited works, not to
        # this article.
        if author.find_parent(["listBibl", "listbibl"]) is not None:
            continue
        # The "xml" parser is case-sensitive, so accept both the standard TEI
        # spelling and the lower-cased one the original code looked for.
        pers_name = author.find(["persName", "persname"])
        if pers_name is None:
            continue
        full_name = _join_name(
            pers_name.find("forename"), pers_name.find("surname")
        )
        if full_name and full_name not in authors:
            authors.append(full_name)

    return authors


def _extract_year(soup):
    """Return the publication year as a string, or None if none is found."""
    # 1. Common NLM form: <pub-date><year>
    pub_date = soup.find("pub-date")
    if pub_date is not None:
        year = pub_date.find("year")
        if year is not None:
            return year.text.strip()

    # 2. <date><year> (only the first <date> is inspected)
    first_date = soup.find("date")
    if first_date is not None:
        year = first_date.find("year")
        if year is not None:
            return year.text.strip()

    # 3. <date when="YYYY-MM-DD">: accept only a value that really starts
    #    with four digits; otherwise fall through to the text fallback.
    dated = soup.find("date", {"when": True})
    if dated is not None:
        match = _WHEN_YEAR_RE.match(dated["when"].strip())
        if match:
            return match.group(0)

    # 4. Fallback: a 19xx/20xx year anywhere in the first <date>'s text
    if first_date is not None:
        match = _YEAR_RE.search(first_date.text)
        if match:
            return match.group(0)

    return None


def _section_body(section):
    """Return a section's paragraphs joined by blank lines, else its raw text."""
    paragraphs = section.find_all("p")
    if paragraphs:
        return "\n\n".join(p.text.strip() for p in paragraphs)
    return section.get_text(separator="\n").strip()


def _extract_methodology(soup):
    """Return the text of the first section that looks like a methods section.

    Tries, in order: a heading equal to a keyword, a heading containing a
    keyword, and finally any section whose full text contains a keyword.
    """
    sections = soup.find_all(_SECTION_TAGS)
    headed = []
    for section in sections:
        heading = section.find(_HEADING_TAGS)
        if heading is not None:
            headed.append((section, heading.text.strip().lower()))

    methodology = None

    # 1. Heading is exactly one of the keywords
    for section, heading in headed:
        if heading in _METHOD_KEYWORDS:
            methodology = _section_body(section)
            break

    # 2. Heading contains one of the keywords
    if not methodology:
        for section, heading in headed:
            if any(keyword in heading for keyword in _METHOD_KEYWORDS):
                methodology = _section_body(section)
                break

    # 3. Section text contains one of the keywords
    if not methodology:
        for section in sections:
            section_text = section.get_text(separator="\n")
            lowered = section_text.lower()
            if any(keyword in lowered for keyword in _METHOD_KEYWORDS):
                methodology = section_text.strip()
                break

    return methodology


def _extract_data_sentences(soup):
    """Return up to _MAX_DATA_SENTENCES distinct sentences containing 'data'."""
    # Only outermost containers are read: a <p> inside a <sec> inside <body>
    # would otherwise contribute the same text once per ancestor.
    chunks = [
        tag.get_text(separator=" ").strip()
        for tag in soup.find_all(_TEXT_TAGS)
        if tag.find_parent(_TEXT_TAGS) is None
    ]
    sentences = _SENTENCE_SPLIT_RE.split(" ".join(chunks))
    matching = [s for s in sentences if "data" in s.lower()]
    # dict.fromkeys removes repeats while keeping document order.
    return list(dict.fromkeys(matching))[:_MAX_DATA_SENTENCES]


def _clean_doi_url(url):
    """Strip sentence punctuation that trails a DOI URL found in running text."""
    url = url.rstrip(".,;:")
    # Drop a closing parenthesis only when it has no opener inside the URL.
    if url.endswith(")") and url.count("(") < url.count(")"):
        url = url[:-1].rstrip(".,;:")
    return url


def _iter_searchable_strings(soup):
    """Yield every text node and every string attribute value in the document."""
    # Text nodes are visited directly because `tag.string` is None for any
    # tag with more than one child, which hides IDs in mixed content.
    yield from soup.find_all(string=True)
    for tag in soup.find_all(True):
        for value in tag.attrs.values():
            if isinstance(value, str):
                yield value


def _extract_dataset_ids(soup):
    """Return the sorted, de-duplicated dataset identifiers in the document."""
    dataset_ids = []

    # 1. Explicit <dataset_id> tags
    for tag in soup.find_all("dataset_id"):
        text = tag.text.strip()
        if text:
            dataset_ids.append(text)

    # 2. <id> inside <data-set>
    for data_set in soup.find_all("data-set"):
        id_tag = data_set.find("id")
        if id_tag is not None and id_tag.text.strip():
            dataset_ids.append(id_tag.text.strip())

    # 3. <ext-link ext-link-type="dataset">: link text, else its href
    for ext_link in soup.find_all("ext-link", {"ext-link-type": "dataset"}):
        text = ext_link.text.strip()
        if text:
            dataset_ids.append(text)
        elif ext_link.get("xlink:href"):
            dataset_ids.append(ext_link["xlink:href"].strip())

    # 4 + 5. DOI URLs and accession numbers (CHEMBL, IPR, GSE, SRP, EMPIAR-,
    #        ENSBTAG000) anywhere in text or attributes. Only the matched
    #        identifier is stored, never the surrounding text.
    for text in _iter_searchable_strings(soup):
        dataset_ids.extend(
            _clean_doi_url(url) for url in _DOI_URL_RE.findall(text)
        )
        dataset_ids.extend(_ACCESSION_RE.findall(text))

    return sorted(set(dataset_ids))


def extract_info_from_xml(filename):
    """Extract bibliographic and dataset information from one XML article.

    The file is parsed with BeautifulSoup's "xml" parser. It is opened in
    binary mode so the parser can honour the encoding named in the XML
    declaration instead of failing on files that are not UTF-8.

    Args:
        filename: Path of the XML file to read.

    Returns:
        A dict with these keys:
            filename: base name of the file.
            judul: article title, or None.
            penulis: author names joined by ", " (empty string if none).
            tahun_terbit: publication year as a string, or None.
            metodologi: text of the methodology section, or None.
            kalimat_dengan_data: up to 10 distinct sentences containing
                "data", joined by " || ".
            dataset_id: sorted unique dataset identifiers joined by ", ".

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(filename, "rb") as file:
        soup = BeautifulSoup(file, "xml")

    return {
        "filename": os.path.basename(filename),
        "judul": _extract_title(soup),
        "penulis": ", ".join(_extract_authors(soup)),
        "tahun_terbit": _extract_year(soup),
        "metodologi": _extract_methodology(soup),
        "kalimat_dengan_data": " || ".join(_extract_data_sentences(soup)),
        "dataset_id": ", ".join(_extract_dataset_ids(soup)),
    }