Pengeksportan Data Dari File Database ke dalam String Besar

In [None]:
from pathlib import Path
import re

# ====== KONFIGURASI ======
SQL_PATHS = [
    r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Preprocessing\proposals.sql",
    "/mnt/data/proposals.sql",
]

START_MARKER = """INSERT INTO `proposals` (`id`, `nim`, `periode_id`, `judul`, `latar_belakang`, `rumusan`, `batasan`, `tujuan`, `referensi`, `status`, `user_id`, `modified`, `filename`, `filedir`, `mime_type`, `manfaat`, `revisi_dari`, `dosen_id`, `upload_revisi`) VALUES"""
END_MARKER = """
--
-- Indexes for dumped tables
--
""".strip("\n")  # biar lebih fleksibel di awal/akhir baris

# ====== UTIL ======
def read_all(paths):
    for p in map(Path, paths):
        if p.exists():
            for enc in ("utf-8", "utf-8-sig", "latin-1"):
                try:
                    return p.read_text(encoding=enc)
                except UnicodeDecodeError:
                    continue
            return p.read_bytes().decode("latin-1", errors="replace")
    raise FileNotFoundError("File SQL tidak ditemukan di path yang disediakan.")

def normalize_newlines(s: str) -> str:
    return s.replace("\r\n", "\n").replace("\r", "\n")

def whitespace_tolerant_regex(s: str) -> re.Pattern:
    """
    Buat regex yang menganggap semua whitespace di 's' bisa jadi
    spasi/enter/tab berapa pun. Karakter lain di-escape supaya literal.
    """
    # Normalize whitespace di marker dulu
    s = normalize_newlines(s)
    # Pisah berdasar whitespace lalu sambung dengan \s+
    parts = re.split(r"\s+", s.strip())
    pattern = r"\s*".join(map(re.escape, parts))
    return re.compile(pattern, flags=re.IGNORECASE | re.DOTALL)

def find_first(haystack: str, needle: str) -> int:
    """Cari needle persis; kalau gagal, pakai regex toleran whitespace."""
    idx = haystack.find(needle)
    if idx >= 0:
        return idx
    # fallback regex
    m = whitespace_tolerant_regex(needle).search(haystack)
    return m.start() if m else -1

def find_between_markers(text: str, start_marker: str, end_marker: str) -> str:
    t = normalize_newlines(text)
    sm = normalize_newlines(start_marker).strip()
    em = normalize_newlines(end_marker).strip()

    i = find_first(t, sm)
    if i < 0:
     
        return t

    sm_re = whitespace_tolerant_regex(sm)
    m = sm_re.search(t, i)
    start_after = m.end() if m else i + len(sm)

    j = find_first(t[start_after:], em)
    if j < 0:
        # Tidak ada penutup -> ambil dari setelah pembuka sampai akhir
        return t[start_after:]
    end_index = start_after + j
    return t[start_after:end_index]

# ====== MAIN ======
if __name__ == "__main__":
    whole = read_all(SQL_PATHS)
    data = find_between_markers(whole, START_MARKER, END_MARKER)
    out = Path("between_insert_and_indexes.sql")
    out.write_text(data, encoding="utf-8")

    # Bungkus ke variabel Python 'data' kalau mau dipakai di script lain
    py_out = Path("./Data.py")
    py_out.write_text('data = """\n' + data.replace('"""', r'\"\"\"') + '\n"""', encoding="utf-8")

    print(f"OK. Panjang hasil: {len(data):,} chars")
    print(f"Simpan: {out.resolve()}")
    print(f"Juga file Python: {py_out.resolve()}")


OK. Panjang hasil: 8,819,044 chars
Simpan: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Preprocessing\between_insert_and_indexes.sql
Juga file Python: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Preprocessing\Data.py


Parsing String Ke List Proposal

In [2]:
import re
import html
import unicodedata as ud
from Data import data

def is_alpha(ch):
    return bool(ch) and (ch.isalpha() or ud.category(ch).startswith('L'))

def is_numeric(ch): 
    return bool(ch) and (ch.isdigit() or ud.category(ch).startswith('N'))

def is_space_or_punct_or_symbol(ch: str) -> bool:
    cat = ud.category(ch)     # contoh: 'Ll','Lu','Nd','Ps','Pe','Zs','Po','Sm','So',...
    return cat[0] in ('Z', 'P', 'S')

def removeHTMLString(data):
    
    #Komleks Taq HTML
    data = data.replace('<p class=\"MsoNormal\" style=\"margin-left:.25in;text-align:justify;text-indent:\r\n.25in;line-height:150%\"><span style=\"font-family:times new roman,serif; font-size:12.0pt; line-height:150%\">',"")
    data = data.replace("<o:p></o:p>", "")
    data = data.replace("<p class=\"MsoNormal\" style=\"margin-left:.25in;text-align:justify;text-indent:\r\n.25in;line-height:150%\">", "")
    data = data.replace("</p>\r\n\r\n<p><!--[if supportFields]><b style=\'mso-bidi-font-weight:normal\'><span lang=ZH-CN\r\nstyle=\'font-size:12.0pt;line-height:107%;font-family:\"Times New Roman\",serif;\r\nmso-fareast-font-family:Cambria;mso-ansi-language:ZH-CN;mso-fareast-language:\r\nZH-CN;mso-bidi-language:AR-SA\'><span style=\'mso-element:field-end\'></span></span></b><![endif]--></p>\r\n","")
    data = data.replace("<p><!--[if supportFields]><b style=\'mso-bidi-font-weight:\r\nnormal\'><span lang=ZH-CN style=\'font-size:12.0pt;line-height:107%;font-family:\r\n\"Times New Roman\",serif;mso-fareast-font-family:Cambria\'><span\r\nstyle=\'mso-element:field-begin\'></span></span></b><b style=\'mso-bidi-font-weight:\r\nnormal\'><span style=\'font-size:12.0pt;line-height:107%;font-family:\"Times New Roman\",serif;\r\nmso-fareast-font-family:Cambria;mso-ansi-language:EN-US\'><span\r\nstyle=\'mso-spacerun:yes\'> </span>BIBLIOGRAPHY<span style=\'mso-spacerun:yes\'> \r\n</span>\\l 1033 </span></b><b style=\'mso-bidi-font-weight:normal\'><span\r\nlang=ZH-CN style=\'font-size:12.0pt;line-height:107%;font-family:\"Times New Roman\",serif;\r\nmso-fareast-font-family:Cambria\'><span style=\'mso-element:field-separator\'></span></span></b><![endif]-->","")
    data = data.replace("<!--[if supportFields]><b style=\'mso-bidi-font-weight:\r\nnormal\'><span style=\'font-size:12.0pt;line-height:150%;font-family:\"Times New Roman\",serif;\r\nmso-fareast-font-family:\"Times New Roman\";color:black\'><span style=\'mso-element:\r\nfield-begin;mso-field-lock:yes\'></span>","")
    data = data.replace("<span style=\'mso-element:field-separator\'></span></span></b><![endif]-->","")
    data = data.replace("</p>\r\n\r\n<p><!--[if supportFields]><b\r\nstyle=\'mso-bidi-font-weight:normal\'><span style=\'font-size:12.0pt;line-height:\r\n150%;font-family:\"Times New Roman\",serif;mso-fareast-font-family:\"Times New Roman\";\r\ncolor:black\'><span style=\'mso-element:field-end\'></span></span></b><![endif]--><strong>&nbsp;</strong></p>\r\n", "")
    data = data.replace("INSERT INTO `proposals` (`id`, `nim`, `periode_id`, `judul`, `latar_belakang`, `rumusan`, `batasan`, `tujuan`, `referensi`, `status`, `user_id`, `modified`, `filename`, `filedir`, `mime_type`, `manfaat`, `revisi_dari`, `dosen_id`, `upload_revisi`) VALUES", "")
    data = data.replace(":</p>\r\n\r\n<p>", ": ")
    data = data.replace(")</p>\r\n\r\n<p>", "). ")
    data = data.replace(".</p>\r\n\r\n<p>", ". ")
    data = data.replace("</p>\r\n\r\n<p>", " ")
    data = data.replace(":\r\na", ": ")

    #Vairasi Taq P
    data = data.replace("<p>?</p>","")
    data = data.replace("1.</p>","")
    data = data.replace(".?</p>",".")
    data = data.replace(".</p>",".")
    data = data.replace(")</p>",").")
    data = data.replace('</p>',"")
    data = data.replace("</p>","")
    data = data.replace("<p>","")
    data = data.replace("./p>",".")
    
    #Variasi Entitas
    data = data.replace("&#39;","`")
    data = data.replace("&hellip;..","")
    data = data.replace("&nbsp;"," ")
    data = data.replace("&iuml;", "ï")
    data = data.replace("&igrave;;","ì")
    data = data.replace("&uuml;","ü")
    data = data.replace("&ouml;","ö")
    data = data.replace("&iacute;","í")
    data = data.replace("&egrave;","è")
    data = data.replace("&eacute;","é")
    data = data.replace("&agrave;","à")
    data = data.replace("v&gt;","")
    
    
    #Variasi Gagal Encoding
    data = data.replace("p?ny?n", "pinyin")
    data = data.replace("h?nz?", "hanzi")
    
    #Variasi Strong
    data = data.replace("<strong>", "")
    data = data.replace("/strong","")
    
    #Variasi ol li
    data = data.replace("<ol><li><em>","")
    data = data.replace("<li><em>",", ")
    data = data.replace(".</li>\r\n",", ")
    data = data.replace("</li>","")
    data = data.replace("<li>"," ")
    data = data.replace("<ul>", "")
    data = data.replace("</ul>", "")
    data = data.replace("<ul", "")
    data = data.replace("</ul", "")
    data = data.replace("<ol>","")
    data = data.replace("</ol>","")
    data = data.replace("<>","")

    #Variasi T
    data = data.replace("\t1", "")
    data = data.replace("\t2", "")
    data = data.replace("\t", "")

    #Variasi Lainnya    
    data = data.replace("\r","")
    data = data.replace("</span>","")
    data = data.replace("<span>","")
    data = data.replace("<br />","")
    data = data.replace("<br/>","")
    data = data.replace("</em>", "")
    data = data.replace("<em>", "")
    data = data.replace("\n","")
    data = data.replace("/b","")
    
    data = _fix_em_dot_rule(data)

    # ---- Hilangkan penanda [number] (mis. [1], [23]) ----
    data = re.sub(r'\[\s*\d+\s*\]\s*', '', data)
    data = re.sub(r'\.\s*([A-Za-z])\s*\.', r'. \1.', data)
    # data = data.replace("</em>.", ",")

    data = re.sub(r'([.!?])(?:\s*[.!?])+(?=\s|$)', r'\1', data)
    data = re.sub(r'<[^>]+>', '', data)
    data = html.unescape(data)
    
    return data

import re

def _fix_em_dot_rule(text: str) -> str:
    """
    Ubah pola:  <word></em>. <word>
    sesuai aturan kapitalisasi yang diminta.
    """
    # word = huruf awal (Unicode letter) lalu susulan \w/- (biar aman utk bahasa Indo/Inggris)
    # gunakan UNICODE agar .isupper()/islower() relevan
    pat = re.compile(r'(?P<left>\b[^\W\d_][\w\-]*)\s*</em>\.\s+(?P<right>[^\W\d_][\w\-]*)', re.UNICODE)

    out = []
    last = 0
    for m in pat.finditer(text):
        lw = m.group('left')
        rw = m.group('right')

        l_upper = lw[0].isupper()
        l_lower = lw[0].islower()
        r_upper = rw[0].isupper()
        r_lower = rw[0].islower()

        # default: boundary kalimat
        sep = ". "
        new_lw = lw

        if l_lower and r_lower:
            sep = ", "
            new_lw = lw
        elif l_upper and r_upper:
            sep = ". "
            new_lw = lw
        elif l_upper and r_lower:
            sep = ", "
            # decapitalize kata sebelum
            new_lw = lw[0].lower() + lw[1:]
        elif l_lower and r_upper:
            sep = ". "
            new_lw = lw

        # tambahkan bagian sebelum match
        out.append(text[last:m.start('left')])
        # susun pengganti
        out.append(new_lw + sep + rw)
        last = m.end('right')

    out.append(text[last:])
    return "".join(out)




def splitField(rawData): #Data String Besar
    # Memecah string besar menjadi potongan-potongan data per-insert.
    # Setiap elemen dalam list ini berisi 9 data proposal.
    dataPerInsert = []

    # Memecah 9 data proposal dari setiap elemen dataPerInsert
    # menjadi list yang setiap elemennya adalah 1 data proposal.
    proposalList = []

    # Mengelompokkan data proposal yang telah dipisahkan
    # berdasarkan ID-nya.
    groupedProposalsById = []

    # Memisahkan setiap data proposal menjadi bagian-bagian yang lebih kecil
    # sesuai struktur field. Namun, beberapa field yang seharusnya digabung
    # ikut terpisah.
    rawSplitFields = []

    # Menggabungkan kembali bagian-bagian field yang terpisah dari
    # rawSplitFields.
    recombinedFields = []

    dataFinal = [] #Data proposal yang sudah sesuai struktur proposal
    kalimat = "" #Variabel untuk menampung kalimat yang terpisah untuk digabung kembali
    kalimatSebelumnya = "" #Variabel untuk menampung kalimat sebelumnya yang ketika sudah lengkap akan dimasukkan ke dalam variable kalimat sebelum di insert ke data proposal
    StringBuka = False #Variable untuk mendeteksi apakah ada tanda ' yang belum tertutup dalam sebuah string

    totalRemoveProposal = 0 #Variabel untuk menampung jumlah proposal yang dihapus karena judul kosong
    dataPerInsert = rawData.split("');")
    for i in dataPerInsert :
        proposalList = i.split("'),")
        for j in proposalList :
            groupedProposalsById.append(j)

    for i in groupedProposalsById :
        if len(recombinedFields) == 19 :
            # if recombinedFields[0] == '1169':
            #     print("debug")
            # if recombinedFields[0] == '(380' :
            #     print("debug")
            if recombinedFields[3] == "" or recombinedFields[3] == "-" or recombinedFields[3] == " '-'" or recombinedFields[3] == "''" or recombinedFields[3] == " ' '" or recombinedFields[3] == " " or recombinedFields[3] == "  " or recombinedFields[3] == " ''":
                totalRemoveProposal = totalRemoveProposal + 1
                print(recombinedFields[0])
                recombinedFields = []
            elif len(recombinedFields[4].split()) < 2 and len(recombinedFields[5].split()) < 2 and len(recombinedFields[6].split()) < 2 and len(recombinedFields[7].split()) < 2 and len(recombinedFields[8].split()) < 2 :
                totalRemoveProposal = totalRemoveProposal + 1
                print(recombinedFields[0])
                recombinedFields = []
            else :
                index = 0
                for field in recombinedFields :
                    listChar = list(field)
                    for indexChar in range(len(listChar)) :
                        if is_alpha(listChar[indexChar]) or is_numeric(listChar[indexChar]):
                            recombinedFields[index] = field
                            index = index + 1
                            break
                        if  (is_space_or_punct_or_symbol(listChar[indexChar]) or listChar[indexChar] in ("'", '"', ".", ",", ":", ";", "!", "?", "(", ")", "[", "]", "{", "}", "<", ">", "-", "_", "/", "\\", "&", "%", "$", "#", "@", "*", "+", "=", "~", "`", "^")):
                            EXCLUDE = ("'", '"')  # yang TIDAK dihapus
                            if listChar[indexChar] not in EXCLUDE:
                                listChar[indexChar] = ""  # cek bukan kutip
                                field = "".join(listChar)
                            else :
                                if is_alpha(listChar[indexChar+1]) or is_numeric(listChar[indexChar+1]):
                                    recombinedFields[index] = field
                                    index = index + 1
                                    break
                                    # print(listChar[indexChar+1])
                                else :
                                    listChar[indexChar+1] = ""  # cek bukan kutip
                                    field = "".join(listChar)
                                    recombinedFields[index] = field
                                    index = index + 1
                                    break
                dataFinal.append(recombinedFields)

                recombinedFields = []
        debug  = "sabar"
        debugIndex = 0
        rawSplitFields = i.split(",")
        for j in rawSplitFields  :
            if (rawSplitFields[len(rawSplitFields)- 1] == j):
                recombinedFields.append(j)
            else :

                if j == '(3328' :
                    debug = "gas"
                    print("debug")
                if debug == "gas" :
                    debugIndex = debugIndex + 1
                    if debugIndex == 13 :
                        print("debug")
                if len(j) != 0:
                    if j.count("'") == 1 and StringBuka == False: #Mendeteksi apakah Kutip String belun dibuka dan hanya terdapat 1 kutip untuk menandakan bahwa kalimat itu memiliki kelanjutan sehingga hasil akhir kalimat tidak akan langsung diinputkan ke dalam data proposal
                        kalimat = kalimat + ", " + j #kalimat pertama masuk ke sini
                        StringBuka = True
                    elif j.count("'") == 0 and StringBuka == True: #Mendeteksi apakah kutip string sudah dibuka dan tidak ada kutip pada kata tersebut untuk menandakan kalimat akan terus berlanjut atau memilikii kelanjutan
                        listCharKalimat = list(j) #Pengecekan apakah ini merupakan kalimat yang akhir atau hanya kalimat yang terpisah hasil split tanda (,)
                        for indexChara in range(len(listCharKalimat)) :
                            if is_alpha(listCharKalimat[indexChara]) or is_numeric(listCharKalimat[indexChara]):
                                kalimat = kalimat + ", " + j
                                break
                            if  (is_space_or_punct_or_symbol(listCharKalimat[indexChara]) or listCharKalimat[indexChara] in ("'", '"', ".", ",", ":", ";", "!", "?", "(", ")", "[", "]", "{", "}", "<", ">", "-", "_", "/", "\\", "&", "%", "$", "#", "@", "*", "+", "=", "~", "`", "^")):
                                EXCLUDE = ("'", '"','.')  # yang TIDAK dihapus
                                if listCharKalimat[indexChara] not in EXCLUDE:
                                    listCharKalimat[indexChara] = ""
                                else:
                                    aha = "".join(listCharKalimat)
                                    kalimat = kalimat + aha
                                    break
                    elif j.count("'") >= 1 and StringBuka == True and (re.findall(r"\w+'\w+|\w+'\s\w+|'\w+|\b\w+'\w*\b|'\w[\w\s]*?'|'\w+",j)) : #Mendeteksi apakah kutip string sudah dibuka dan dalam kalimat terdeteksi lebih atau setidaknya terdapat 1 kutip dalam kalimat, kemudian mendeteksi apakah kutip itu merepakan kutip penutup atau kutip dari nama orang, jalan, tempat atau suatu kata sehingga menyatakan kalimat belum berakhir)
                        if  (j[-1]=="'" and j[-2]==".") : #Mendeteksi apakah kutip itu merupakan kutip penutup kalimat dengan melihat adanya titik sebelum kutip penutup
                            kalimatSebelumnya = "" + kalimat
                            kalimat = kalimat + ", " + j
                            recombinedFields.append(kalimat)
                            # writer.writerow([recombinedFields[0], kalimat])
                            kalimat = ""
                            StringBuka = False

                        else  :
                            kalimat = kalimat + ", " + j
                    elif j.count("'") == 1 and StringBuka == True and ( not re.findall(r"\w+'\w+|\w+'\s\w+",j) or not re.findall(r"\b\w+'\w*\b|'\w[\w\s]*?'",j)) : #Mendeteksi apakah hanya terdapat 1 kutip dan kutip itu berada dipaling belakang sebagai penutup kalimat
                        kalimatLast ="" #Untuk mengatasi kelebihan tanda baca , pada akhir kalimat
                        if len(j) < 4:
                            for k in j :
                                if k != " ":
                                    kalimatLast = kalimatLast + k
                            recombinedFields.append(kalimat + kalimatLast)
                            kalimat = ""
                            StringBuka = False
                            continue
                        recombinedFields.append(kalimat + ", " + j)
                        # writer.writerow([recombinedFields[0], kalimat])
                        kalimat = ""
                        StringBuka = False
                    elif j.count("'") == 2 : #Medeteksi apa bila terdeteksi 2 kutip dalam 1 kalimat sebagai pembuka dan penutup, bila terdapat 2  dan bukan penutup atau pembuka dia akan masuk pada elif kedua terlebih dahulu, biasa kalimat pendek masuk ke dalam sini
                        recombinedFields.append(j) #judul masuk disini
                    elif j.count("'") == 0 and j[len(j)-1].isdigit() and j[0].isdigit() : #Mendeteksi apabila value dalam string hanya berupa angka dan bukan string alfabet
                        recombinedFields.append(j)
                    else : #Diluar dari semua itu akan dianggap sebagai sebuah field yang tidak terdeteksi sama sekali karena alasan tertentu
                        recombinedFields.append(j)
    print("Total Remove Proposal: ", totalRemoveProposal)
    return dataFinal
daftarProposal = []
data = "INSERT INTO `proposals` (`id`, `nim`, `periode_id`, `judul`, `latar_belakang`, `rumusan`, `batasan`, `tujuan`, `referensi`, `status`, `user_id`, `modified`, `filename`, `filedir`, `mime_type`, `manfaat`, `revisi_dari`, `dosen_id`, `upload_revisi`) VALUES" + data
data = removeHTMLString(data) #Penghapusan HTML tag pada data
daftarProposal = splitField(data)
print("Jumlah data setelah diproses: ", len(daftarProposal))


(914
debug
debug
Total Remove Proposal:  1
Jumlah data setelah diproses:  1279


In [3]:
daftarParagraf = []
for proposal in daftarProposal :
    daftarParagraf.append([proposal[0],"Judul",proposal[3]])
    daftarParagraf.append([proposal[0],"LatarBelakang",proposal[4]])
    daftarParagraf.append([proposal[0],"Rumusan",proposal[5]])
    daftarParagraf.append([proposal[0],"Tujuan",proposal[7]])

Jumlah Paragraf Latar Belakang:  1279
Jumlah Paragraf Judul:  1279
Jumlah Paragraf Rumusan:  1279
Jumlah Paragraf Tujuan:  1279


In [None]:
print("debug")

Code untuk memecah proposal pada bagian latar belakang dan tujuan menjadi kalimat

In [5]:
import re

# ======================= UTIL: Masking & Unmasking URL =======================
# Kita anggap data paragraf aslinya aman (tidak terpecah), jadi regex URL yang "normal" cukup
_URL_PATTERN = re.compile(
    r'(?i)\b(?:https?://|www\.)[^\s<>()]+'  # http(s)://... atau www....
)

# ====== PRE-HEAL ringan untuk domain terpisah (ClickTale. Com, dll.) ======
def pre_heal_domains(text: str) -> str:
    if not text:
        return text
    # “example. com” / “ClickTale. Com” → “example.com” / “ClickTale.Com”
    TLD = r'(?:com|net|org|edu|gov|mil|id|co\.id|ac\.id|go\.id|sch\.id|my\.id|io|ai)'
    text = re.sub(rf'\b([A-Za-z0-9-]{{2,}})\.\s*({TLD})\b', r'\1.\2', text, flags=re.I)
    # “www . example . com” → “www.example.com” (jaga-jaga)
    text = re.sub(r'(?i)\bwww\s*\.\s*', 'www.', text)
    text = re.sub(r'(?i)\b([A-Za-z0-9-]{2,})\s*\.\s*', r'\1.', text)  # hanya saat terlihat rantai domain
    return text


def mask_links(text: str, daftarLink: list) -> str:
    """
    Deteksi URL, simpan ke daftarLink, lalu ganti di teks menjadi token $daftarLink{idx}.
    Trailing punctuation seperti '.', ',', ')', ';', ':' akan dipertahankan di luar token.
    """
    if not text:
        return text

    def _repl(m: re.Match) -> str:
        raw = m.group(0)
        # Pisahkan trailing punctuations yang tak termasuk URL
        trail = ''
        while raw and raw[-1] in '.,;:)]':
            trail = raw[-1] + trail
            raw = raw[:-1]
        idx = len(daftarLink)
        daftarLink.append(raw)
        return f"$daftarLink{idx}{trail}"

    return _URL_PATTERN.sub(_repl, text)

def unmask_links(text: str, daftarLink: list) -> str:
    """Kembalikan token $daftarLink{idx} menjadi URL aslinya."""
    if not text:
        return text
    return re.sub(
        r'\$daftarLink(\d+)',
        lambda m: daftarLink[int(m.group(1))] if int(m.group(1)) < len(daftarLink) else m.group(0),
        text
    )

# ======================= Normalisasi whitespace ringan =======================
def normalize_ws(s: str) -> str:
    if not s:
        return s
    ELLIPS = "<ELLIPS_TOKEN>"
    s = s.replace("...", "…").replace("…", ELLIPS)

    # kompres pengulangan tanda baca umum
    s = re.sub(r'(?<=\d)\.\s+(?=\d)', '.', s)
    s = re.sub(r'\.(?:\s*\.)+', '.', s)
    s = re.sub(r'([!?])\1+', r'\1', s)
    s = re.sub(r'([,;:])\s*\1+', r'\1', s)

    # whitespace
    s = s.replace("\u00A0", " ")
    s = re.sub(r"[ \t\r\f\v]+", " ", s)

    # spasi sebelum penutup
    s = re.sub(r"\s+([,.;?!)\]\}\"'])", r"\1", s)
    # spasi setelah tanda baca umum (kecuali diikuti penutup/petik)
    s = re.sub(r"(?<!\d)([,;?!\.])(?!\d|[)\]]|['\"]) (?=\S)", r"\1 ", s)

    # titik ribuan/desimal yang sering kepisah
    s = re.sub(r"(?<=\d)\s*\.\s*(?=\d{3}\b)", ".", s)
    s = re.sub(r"(?<=\d)\s*\.\s*(?=\d)", ".", s)

    s = re.sub(r"\s{2,}", " ", s)
    s = s.replace(ELLIPS, "…")
    return s.strip()

# ===== Fallback clean_html =====
def clean_html(text, keep_linebreaks=True):
    if text is None:
        return ""
    t = re.sub(r"(?is)<br\s*/?>", "\n", text) if keep_linebreaks else text
    t = re.sub(r"(?is)<[^>]+>", " ", t)
    return re.sub(r"\s{2,}", " ", t).strip()

# -------------------- Proteksi titik yang bukan akhir kalimat -----------------
PROTECTED  = "<DOT>"
PCOLON     = "<COLON>"
_END_PRIORITY = {"?": 3, "!": 2, ".": 1, "…": 1}

def protect_dynamic(text: str) -> str:
    s = text

    # Gelar medis Sp.X
    s = re.sub(r'\bSp\s*\.\s*([A-Z]{1,3})\b', lambda m: 'Sp' + PROTECTED + m.group(1), s)

    # Gelar di depan nama (dr., drs., ir.)
    s = re.sub(r'\b(dr|drs|ir)\s*\.\s*(?=[A-Z])', lambda m: m.group(1) + PROTECTED + ' ', s, flags=re.I)

    # enumerasi "1." / "a."
    s = re.sub(r"(?:(?<=^)|(?<=[\s\"'(\[\{<:;?!\.]))(\d{1,3})\.(?=\s*\S)", r"\1" + PROTECTED, s)
    s = re.sub(r"(?:(?<=^)|(?<=[\s\"'(\[\{<:;?!\.]))([A-Za-z])\.(?=\s*\S)", r"\1" + PROTECTED, s)
    # juga varian ")"
    s = re.sub(r"(?:(?<=^)|(?<=[\s\"'(\[\{<:;?!\.]))(\d{1,3})\)(?=\s*\S)", r"\1)", s)
    s = re.sub(r"(?:(?<=^)|(?<=[\s\"'(\[\{<:;?!\.]))([A-Za-z])\)(?=\s*\S)", r"\1)", s)

    # ellipsis → lindungi titiknya
    s = re.sub(r"\.\.\.", lambda m: m.group(0).replace(".", PROTECTED), s)

    # Prof. dr.
    s = re.sub(r"\bProf\.\s*dr\.", lambda m: m.group(0).replace(".", PROTECTED), s)

    # Multi-dot abbr (S.Kom., Ph.D., d.s.b.)
    s = re.sub(r"\b(?:[A-Za-z]{1,4}\.){2,}(?=[\s,;:)]|$)", lambda m: m.group(0).replace(".", PROTECTED), s)

    # whitelist lowercase abbr
    LOWER_ABBR_EXT = ("dll", "dsb", "dst", "dkk", "al", "ibid", "op", "cit", "vs", "hlm", "hal")
    s = re.sub(rf"\b(?:{'|'.join(LOWER_ABBR_EXT)})\.(?=\s)", lambda m: m.group(0).replace(".", PROTECTED), s, flags=re.I)

    # TitleCase short abbr (No., Jl., Prof., St.)
    s = re.sub(r"\b([A-Z][a-z]{0,3})\.(?=\s*[a-z0-9(])", r"\1" + PROTECTED, s)
    s = re.sub(r"\b([A-Z][a-z]{0,3})\.(?=\s*[A-Z][a-z])", r"\1" + PROTECTED, s)
    s = re.sub(r'\b(St)\.(?=\s)', r'\1' + PROTECTED, s)

    # inisial berurutan (R. A.)
    s = re.sub(r"\b([A-Z])\.(?=\s*[A-Z])", r"\1" + PROTECTED, s)

    # Dotted ALLCAP suffix: Node.JS, Socket.IO
    s = re.sub(r"(?<=\b[A-Za-z])\.(?=[A-Z]{2,5}\b)", PROTECTED, s)

    # Kode regulasi/nomor: KA.401 dst.
    s = re.sub(r'\b([A-Z]{1,4})\.(\d+)\b', lambda m: m.group(0).replace('.', PROTECTED), s)

    # rantai kode bertitik (AHU-0016081.AH.01.01)
    s = re.sub(r"\b([A-Z0-9-]+(?:\.[A-Z0-9-]+){1,})\b",
               lambda m: m.group(1).replace(".", PROTECTED), s)

    # desimal 1.23
    s = re.sub(r"(?<=\d)\.(?=\d)", PROTECTED, s)

    # titik sebelum tahun di dalam kurung (…., 2014)
    s = re.sub(r"\(([^)]*)\.(?=\s*\d{4}\))", lambda m: m.group(0).replace(".", PROTECTED), s)

    # pola (*.JPG)
    s = re.sub(r"\(\*\.[A-Za-z0-9]{1,5}\)", lambda m: m.group(0).replace(".", PROTECTED), s)

    # 'et al.' dan 'dkk.'
    s = re.sub(r'\bet\.\s*al\.', lambda m: m.group(0).replace('.', PROTECTED), s, flags=re.I)
    s = re.sub(r'\bdkk\.', lambda m: m.group(0).replace('.', PROTECTED), s, flags=re.I)

    # Titik antara kapital dan digit (KA.401)
    s = re.sub(r'(?<=\b[A-Z])\.(?=\d)', PROTECTED, s)

    # *** Tambahan kecil untuk "24 / 7" → "24/7" ***
    s = re.sub(r'(\d+)\s*/\s*(\d+)', r'\1/\2', s)

    # 1) Lindungi titik DI DALAM kurung pada pola "(Nama, 2016)." tapi biarkan titik sesudah kurung
    s = re.sub(
        r'\(([^()]*?,\s*\d{4}[a-z]?[^()]*)\)\.',
        lambda m: '(' + m.group(1).replace('.', PROTECTED) + ').',
        s
    )

    # Derajat/gelar populer
    s = re.sub(r'\bPh\s*\.\s*D\b', lambda m: m.group(0).replace('.', PROTECTED), s)
    s = re.sub(r'\bM\s*\.\s*[Ss]c\b\.?', lambda m: m.group(0).replace('.', PROTECTED), s)
    s = re.sub(r'\bS\s*\.\s*Kom\b\.?', lambda m: m.group(0).replace('.', PROTECTED), s)
    s = re.sub(r'\bD\s*\.\s*I\s*\.\s*C\b\.?', lambda m: m.group(0).replace('.', PROTECTED), s)

    # Ir.Nizam → Ir.<DOT> Nizam
    s = re.sub(r'\b(Ir)\s*\.(?=[A-Z])', r'\1' + PROTECTED + ' ', s)

    # 2) Varian titik setelah kurung di akhir string
    s = re.sub(
        r'\(([^()]*?,\s*\d{4}[a-z]?[^()]*)\)\.\s*$',
        lambda m: '(' + m.group(1).replace('.', PROTECTED) + ').',
        s
    )

    # (*.EXT) varian ber-spasi
    s = re.sub(r"\(\s*\*\s*\.\s*[A-Za-z0-9]{1,5}\s*\)", lambda m: m.group(0).replace(".", PROTECTED), s)

    # Nama.Inisial → lindungi
    s = re.sub(r'\b([A-Z][a-z]+)\.(?=[A-Z]\b)', lambda m: m.group(1) + PROTECTED, s)

    # KAPITAL.DIGIT varian spasi
    s = re.sub(r'\b([A-Z]{1,4})\s*\.\s*(\d+)\b', lambda m: m.group(1) + PROTECTED + m.group(2), s)

    # RANTAI KODE DENGAN SPASI
    s = re.sub(
        r'\b([A-Z]{1,5})\s*\.\s*(\d+)/([A-Z]{1,5})\s*\.\s*(\d+)/([A-Z]{2,6})/(\d{4})\b',
        lambda m: (m.group(1) + PROTECTED + m.group(2) + '/' +
                   m.group(3) + PROTECTED + m.group(4) + '/' +
                   m.group(5) + '/' + m.group(6)),
        s
    )

    return s

def unprotect(text: str) -> str:
    return text.replace(PROTECTED, ".").replace(PCOLON, ":")

# ====== Anti-glue post-pass (tanpa logic URL) ======
def anti_glue_overmerged(seg: str):
    if not seg or seg.rstrip().endswith(":"):
        return [seg]

    t = seg.strip()
    if t.count('(') != t.count(')'):
        return [seg]
    if re.search(r'\.\s+\((?=[^)]*\d{4})', t):
        return [seg]
    if re.match(r'^\(\s*[^()]*\d{4}[^()]*\)\.?$', t):
        return [seg]

    prot = protect_dynamic(t)
    # FORCE_END setelah "). " bila diikuti kata sitasi umum
    prot = re.sub(r'\)\.\s+(?=(?:diakses|diunduh|accessed|retrieved)\b)', '). <FORCE_END> ', prot, flags=re.I)
    split_re = re.compile(r'(?:(?<=[\.!?…])|\<FORCE_END\>)\s+(?=(?:[A-Z0-9""])|(?!\s)\()')

    parts = split_re.split(prot)
    parts = [unprotect(p).strip() for p in parts if p and p.strip()]
    parts = [p.replace('<FORCE_END>', '').strip() for p in parts if p and p.strip()]

    if len(parts) >= 2 and any(" " in p for p in parts):
        return parts
    return [seg]

# -------------------- Split per kalimat (robust, tanpa URL-rules) --------------------
def _choose_ender(cluster: str) -> str:
    best = "."
    best_score = 0
    for ch in cluster:
        score = _END_PRIORITY.get(ch, 0)
        if score > best_score:
            best, best_score = ch, score
    return best

def preclean_minimal(s: str) -> str:
    if not s:
        return s
    ELLIPS = "<ELLIPS_TOKEN>"
    s = s.replace("...", "…").replace("…", ELLIPS)
    s = re.sub(r'(?<=\d)\.\s+(?=\d)', '.', s)
    s = re.sub(r'\.(?:\s*\.)+', '.', s)
    s = re.sub(r'([!?])\1+', r'\1', s)
    s = re.sub(r'([,;:])\s*\1+', r'\1', s)
    # buang penutup kurung yatim
    s = re.sub(r'([\.!?])\s*[\)\]\}]+(\s+)(?=[A-Z(\"\'"])', r'\1\2', s)
    s = re.sub(r'([\.!?])\s*[\)\]\}]+\s*$', r'\1', s)
    s = s.replace(ELLIPS, "…")
    return s

def split_sentences(text: str):
    cleaned = clean_html(text, keep_linebreaks=True)
    cleaned = preclean_minimal(cleaned)
    s = protect_dynamic(cleaned).strip()

    END   = r"[\.?!;:…]"
    CLOSE = r"[\"')\]\}]"
    OPEN  = r"[\"'(\[\{<]"
    BUL   = r"[-–—•·*]+"
    sep = re.compile(rf"({END}+)(?:{CLOSE}+)?(?:\s*{BUL})?(?=(?:\s+|$|{OPEN}|[A-Z0-9]|{BUL}))")

    # pola sitasi
    citation_paren_re = re.compile(r"""^\(\s*[^()]*\d{4}[a-z]?[^()]*\)\.?$""", re.I | re.VERBOSE)
    citation_author_year_re = re.compile(r"""^[A-Z][A-Za-z .'\-]+?,\s*(?:[A-Z]\.\s*)+
                                             (?:,(?:\s*&|\s*and|\s*)[A-Z][A-Za-z .'\-]+?,\s*(?:[A-Z]\.\s*)+)*
                                             (?:,\s*et\s+al\.)?\s*\(\d{4}[a-z]?\)\.?$""",
                                         re.I | re.VERBOSE)
    surname_only = re.compile(r'^[A-Z][A-Za-z.\- ]+\.\s*$', re.I)
    surname_with_year_paren = re.compile(r"""^[A-Z][A-Za-z .'\-]+?\.\s*\(\d{4}[a-z]?[^)]*\)\.?$""",
                                         re.I | re.VERBOSE)

    # split awal
    parts, last = [], 0
    for m in sep.finditer(s):
        seg = s[last:m.end()].strip()
        if seg:
            parts.append(seg)
        last = m.end()
    tail = s[last:].strip()
    if tail:
        parts.append(tail)

    # tidy + unprotect
    parts = [re.sub(rf"^\s*{BUL}\s*", "", p) for p in parts]
    parts = [re.sub(r'^[\'\"]+', "", p) for p in parts]
    parts = [re.sub(r'[\'\"]+$', "", p) for p in parts]
    parts = [unprotect(p).strip() for p in parts if p.strip()]

    # GLUE: token enumerasi yang yatim
    glued, i = [], 0
    enum_token_re = re.compile(r'^[\s"\'(\[]([0-9]{1,3}|[A-Za-z])\.\s*$')
    while i < len(parts):
        cur = parts[i]
        if enum_token_re.match(cur) and i + 1 < len(parts):
            token = enum_token_re.match(cur).group(1) + "."
            glued.append(f"{token} {parts[i+1].lstrip()}")
            i += 2
        else:
            glued.append(cur)
            i += 1

    # GLUE: segmen berakhir ":" → satukan hanya dgn segmen berikutnya
    merged, buf = [], ""
    for seg in glued:
        if buf:
            merged.append((buf + " " + seg).strip()); buf = ""
        elif seg.rstrip().endswith(":"):
            buf = seg
        else:
            merged.append(seg)
    if buf:
        merged.append(buf.strip())
    final = merged

    # GLUE: sitasi multi‐segmen sampai ')' muncul
    combined, buf = [], None
    for seg in final:
        t = seg.strip()
        if buf is not None:
            buf = f"{buf} {t}"
            if re.search(r'\)\.?$', t):
                combined.append(buf.strip()); buf = None
            continue
        if re.match(r'^\(', t) and not re.search(r'\)\.?$', t):
            buf = t
        elif re.search(r'\(\s*$', t) or re.search(r'\bhlm\.\s*$', t, re.I):
            buf = t
        else:
            combined.append(seg)
    if buf:
        combined.append(buf.strip())
    final = combined

    # GLUE: "Ocepek." + "(2013: 1-5)."
    glued_name_year = []
    for seg in final:
        if glued_name_year and surname_only.match(glued_name_year[-1]) and citation_paren_re.match(seg.strip()):
            glued_name_year[-1] = f"{glued_name_year[-1].rstrip()} {seg.strip()}"
        else:
            glued_name_year.append(seg)
    final = glued_name_year

    # GLUE: sitasi utuh tempel ke sebelumnya
    glued_citations = []
    for seg in final:
        _t = seg.strip()
        if glued_citations and (
            citation_paren_re.match(_t) or
            citation_author_year_re.match(_t) or
            surname_with_year_paren.match(_t)
        ):
            glued_citations[-1] = f"{glued_citations[-1].rstrip()} {seg}".strip()
        else:
            glued_citations.append(seg)
    final = glued_citations

    # orphan close seg
    orphan_close_seg = re.compile(r'^[\)\]\}]+\.?$')
    final2 = []
    for seg in final:
        if final2 and orphan_close_seg.match(seg):
            continue
        final2.append(seg)
    final = final2

    # "15)." tempel ke sebelumnya
    glued4 = []
    for seg in final:
        if glued4 and re.match(r"^\d+\)\.?$", seg):
            glued4[-1] = glued4[-1].rstrip() + " " + seg
        else:
            glued4.append(seg)
    final = glued4

    # angka desimal/enumerasi yatim "0." "9)." "57)."
    orphan_num_dot = re.compile(r'^\d+[\.\)]\.?\s*$')
    glued_num = []
    for seg in final:
        t = seg.strip()
        if glued_num and orphan_num_dot.match(t):
            prev = glued_num[-1].rstrip()
            if not prev or prev[-1] not in '.!?':
                glued_num[-1] = (prev + " " + t).strip()
            else:
                glued_num.append(seg)
        else:
            glued_num.append(seg)
    final = glued_num

    # POST-PASS: pecah segmen yang masih memuat >1 kalimat
    post = []
    for seg in final:
        post.extend(anti_glue_overmerged(seg))
    final = [normalize_ws(x) for x in post]
    return final

# ============================= Pipeline utama ================================
# Asumsikan kamu sudah membangun daftarParagraf seperti:
# daftarParagraf = [
#   [idProposal, "Judul", "Isi"],
#   [idProposal, "LatarBelakang", "Isi"],
#   [idProposal, "Rumusan", "Isi"],
#   [idProposal, "Tujuan", "Isi"],
#   ...
# ]

print("Jumlah Paragraf:", len(daftarParagraf))

daftarLink = []  # bank URL global
daftarKalimatProposal = []

def _tag_lower(t: str) -> str:
    t = (t or "").strip()
    mapping = {
        "Judul": "judul",
        "LatarBelakang": "latar_belakang",
        "Rumusan": "rumusan",
        "Tujuan": "tujuan"
    }
    return mapping.get(t, t.lower())

for pid, tag, isi in daftarParagraf:
    tag_norm = _tag_lower(tag)

    if tag_norm == "judul":
        # Judul langsung 1 baris (tetap mask/unmask untuk konsistensi)
        healed = pre_heal_domains(isi or "")
        masked = mask_links(healed, daftarLink)
        kal = normalize_ws(masked)
        kal = unmask_links(kal, daftarLink)
        daftarKalimatProposal.append([pid, kal, "judul"])
        continue

    # Untuk paragraf lain: mask → split → unmask per kalimat
    healed = pre_heal_domains(isi or "")
    masked = mask_links(healed, daftarLink)
    for kal in split_sentences(masked):
        kal = normalize_ws(kal)
        kal = unmask_links(kal, daftarLink)
        if kal:
            daftarKalimatProposal.append([pid, kal, tag_norm])

# ============ POST-FLATTEN: pecah lagi item over-merged =============
daftarKalimatProposal_fix = []
for pid, kal, tag in daftarKalimatProposal:
    parts = anti_glue_overmerged(kal)
    if len(parts) > 1 and any(" " in p for p in parts):
        for p in parts:
            p = normalize_ws(p)
            if p:
                daftarKalimatProposal_fix.append([pid, p, tag])
    else:
        daftarKalimatProposal_fix.append([pid, normalize_ws(kal), tag])
daftarKalimatProposal = daftarKalimatProposal_fix

# (Opsional) Recovery kata nempel ekstrem (bukan URL terkait)
def _recover_overmerged_words(t: str) -> str:
    if re.search(r'[A-Za-z]{30,}', t):
        t = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', t)
        t = re.sub(r'([a-z]{5,})([a-z]{5,})', r'\1 \2', t)
    return t

# =================== POST-PASS LINTAS ITEM: re-glue cerdas ===================
def cross_glue_items(items):
    """
    Lintas-item (pid+tag sama) penggabungan:
    - URL-only & (URL) & angka+URL → tempel ke sebelumnya.
    - Ekor sitasi umum ( … (2016). ), sumber angka (24/7 … (2010).), paren tanpa tahun → tempel ke sebelumnya.
    - “Risti?” + “(2014, …) mengatakan …” (segmen berikut diawali kurung+year lalu lanjut teks) → tempel.
    - Brand. + Com/Net/Org/Id … → jadikan Brand.Com/Net/… (tanpa spasi nyasar).
    - Frasa judul pendek (≤3 kata) / typo ':.', ';' → tempel ke berikutnya.
    - Tail “diakses/accessed/retrieved …” → tempel ke sebelumnya.
    - Singkatan badan/instansi & alamat: UD., CV., PT., TBK., RS., PD., MT., KH. → zip kiri/kanan.
    - Node. JS / Socket. IO → tempel.
    - Rangkaian setelah ':' (daftar/pertanyaan beruntun) → gulung.
    """
    if not items:
        return items

    # --- Pola umum ---
    url_only        = re.compile(r'^\s*\(?\s*(?:https?://|www\.)\S+\s*\)?\.?\s*$', re.I)
    num_then_url    = re.compile(r'^\s*\d+\s+(?:https?://|www\.)\S+\s*$', re.I)
    paren_url_only  = re.compile(r'^\s*\(\s*(?:https?://|www\.)\S+\s*\)\.?\s*$', re.I)

    # sitasi klasik "(... 2016 ...)." satu baris penuh
    tail_citation_paren = re.compile(r'^\s*\([^()]*\d{4}[a-z]?\s*[^()]*\)\.?\s*$', re.I)
    # "Author ... (2016)." (awalan huruf)
    tail_author_year    = re.compile(r'^[A-Z][^()]{0,200}\(\s*\d{4}[a-z]?\s*\)\.?\s*$', re.I)
    # *** BARU: "24/7 Wall St. (2010)." (awalan angka/karakter non-huruf) ***
    tail_source_year_num= re.compile(r'^[0-9][^()]{0,200}\(\s*\d{4}[a-z]?\s*\)\.?\s*$', re.I)
    # *** BARU: paren TANPA tahun (anggap sitasi) ***
    tail_paren_no_year  = re.compile(r'^\s*\([^()]{1,120}\)\.?\s*$', re.I)
    # *** BARU: baris MULAI dengan (tahun …) LALU lanjut teks (untuk kasus "Risti? (2014, hlm. 4) mengatakan …") ***
    starts_paren_year_then_text = re.compile(r'^\s*\(\s*\d{4}[^\)]*\)\s+\S', re.I)

    # prev akhiri "et al." / "dkk."
    prev_etal_end = re.compile(r'(?:et\s+al|dkk)\.\s*$', re.I)

    # singkatan korporat/alamat
    corp_abbr_only   = re.compile(r'^\s*(?:UD|CV|PT|TBK|RS|PD|MT|KH)\.\s*$', re.I)
    ends_with_corp   = re.compile(r'(?:UD|CV|PT|TBK|RS|PD|MT|KH)\.\s*$', re.I)
    short_name_line  = re.compile(r'^[A-Z][\w&.\'\- ]{1,40}\.\s*$', re.U)
    starts_with_name = re.compile(r'^\s*[\'"“”‘’(]*[A-Z][A-Za-z0-9&.\'\-]*(?:\s+[A-Z][A-Za-z0-9&.\'\-]*)*', re.U)

    # domain/TLD
    tld = r'(?:com|net|org|io|ai|id|co\.id|ac\.id|go\.id|sch\.id|my\.id)'
    domainish_end    = re.compile(rf'\b[A-Za-z0-9-]{{2,}}\.{tld}\.\s*$', re.I)
    tld_only_start   = re.compile(rf'^{tld}\b', re.I)
    # *** BARU: prev "Brand." + next "Com/Net/..." ***
    prev_word_dot    = re.compile(r'\b([A-Za-z][A-Za-z0-9-]{1,})\.\s*$')

    # daftar/pertanyaan pendek
    short_q = re.compile(r'^(?:per\s+\w+|\w.{0,80}\?)$', re.I)

    # (IPK)., (CPU), (UX).
    lone_paren_abbr  = re.compile(r'^\s*\([A-Za-z]{2,8}\)\.?\s*$')

    # Node. JS / Socket. IO
    word_dot_prev    = re.compile(r'.*\b[A-Za-z]{2,}\.\s*$')
    next_allcaps     = re.compile(r'^[A-Z]{1,4}\b')

    # *** BARU: tail "diakses/accessed/retrieved" ***
    tail_access_note = re.compile(r'\b(diakses|accessed|retrieved)\b', re.I)

    def _unbalanced_paren_or_quote(s: str) -> bool:
        s = s.strip()
        if s.count('(') != s.count(')'):
            return True
        for left, right in [("'", "'"), ('"', '"'), ('“','”'), ('‘','’')]:
            if s.count(left) != s.count(right):
                return True
        return False

    def _is_very_short_opening(s: str) -> bool:
        w = s.strip().split()
        return len(w) <= 3 and s.endswith('.')

    # *** BARU: frasa pendek capitalized tanpa tanda akhir (≤3 kata) → kandidat tempel ***
    def _is_short_titleish(s: str) -> bool:
        s = s.strip()
        if re.search(r'[.!?]$', s):
            return False
        w = s.split()
        return 1 <= len(w) <= 3 and w[0][:1].isupper()

    out = []
    i = 0
    n = len(items)

    while i < n:
        pid, txt, tag = items[i]
        t = normalize_ws(txt)

        has_prev = bool(out) and out[-1][0] == pid and out[-1][2] == tag
        prev_txt = out[-1][1] if has_prev else None

        has_next = (i + 1 < n) and (items[i+1][0] == pid) and (items[i+1][2] == tag)
        next_txt = normalize_ws(items[i+1][1]) if has_next else None

        # 0) URL-only / (URL) / angka+URL
        if (url_only.match(t) or paren_url_only.match(t) or num_then_url.match(t)) and has_prev:
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 1) Pembuka sangat pendek ATAU typo ':.', ';' → gabung ke berikutnya
        if has_next and (_is_very_short_opening(t) or t.rstrip().endswith(':.') or t.rstrip().endswith(';')):
            out.append([pid, normalize_ws(t.rstrip() + " " + next_txt.lstrip()), tag])
            i += 2
            continue

        # 1b) *** BARU: frasa pendek 'title-ish' (mis. "Kepuasan (satisfaction)") → tempel ke sebelumnya
        if has_prev and _is_short_titleish(t):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 2) "Risti?" + "(2014, hlm. 4) mengatakan …" → jika next mulai "(YYYY ... ) <teks>"
        if has_next and t.rstrip().endswith('?') and starts_paren_year_then_text.match(next_txt):
            out.append([pid, normalize_ws(t + " " + next_txt), tag])
            i += 2
            continue
        
        # 2b) Prev berakhir "et al." / "dkk." + next mulai "(YYYY) ...": gabung
        if has_prev and prev_etal_end.search(prev_txt) and starts_paren_year_then_text.match(t):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue


        # 3) Ekor sitasi luas (huruf / angka / paren tanpa tahun) → tempel ke sebelumnya
        if has_prev and (tail_citation_paren.match(t) or tail_author_year.match(t) or tail_source_year_num.match(t) or tail_paren_no_year.match(t)):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 3b) *** BARU: baris mengandung 'diakses/accessed/retrieved' → tempel ke sebelumnya
        if has_prev and tail_access_note.search(t):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 4) UD./CV./PT./TBK./RS./PD./MT./KH. berdiri sendiri → zip kiri+kanan
        if corp_abbr_only.match(t) and has_prev:
            merged = prev_txt.rstrip() + " " + t
            consumed = 1
            if has_next:
                merged += " " + next_txt
                consumed = 2
            out[-1][1] = normalize_ws(merged)
            i += consumed
            continue

        # 5) Prev berakhir singkatan → tempel lanjutan nama/alamat/abbr dalam paren
        if has_prev and ends_with_corp.search(prev_txt) and (starts_with_name.match(t) or short_name_line.match(t) or lone_paren_abbr.match(t)):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 6) Node. JS / Socket. IO
        if has_prev and word_dot_prev.match(prev_txt) and next_allcaps.match(t):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 7) Prev "Brand." + next "Com/Net/Org/Id ..." → jadikan "Brand.Com ..."
        if has_prev and prev_word_dot.search(prev_txt) and tld_only_start.match(t):
            out[-1][1] = normalize_ws(re.sub(r'\.\s+$', '.', prev_txt) + t)
            i += 1
            continue

        # 8) Lone paren abbreviation (IPK). → tempel ke sebelumnya
        if has_prev and lone_paren_abbr.match(t):
            out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
            i += 1
            continue

        # 9) Rangkaian setelah ':' → gulung (daftar/pertanyaan/“Diakses : …”, “Rayon 2 : …” dst.)
        if t.rstrip().endswith(':') and has_next:
            buf = t
            j = i + 1
            while j < n and items[j][0] == pid and items[j][2] == tag:
                nxt = normalize_ws(items[j][1])
                if (short_q.match(nxt)
                    or nxt[0:1].islower()
                    or nxt.rstrip().endswith(':')
                    or re.search(r'^(Diakses\s*:|InfoKomputer\s*:|Rayon\s*\d+\s*:|\d+\s*\w*)', nxt, re.I)):
                    buf = normalize_ws(buf + " " + nxt); j += 1; continue
                if not re.search(r'[.!?]$', buf):
                    buf = normalize_ws(buf + " " + nxt); j += 1; continue
                break
            out.append([pid, buf, tag])
            i = j
            continue

        # 10) Teks tak seimbang kurung/petik → rekatkan agar seimbang
        if _unbalanced_paren_or_quote(t):
            if has_prev:
                out[-1][1] = normalize_ws(prev_txt.rstrip() + " " + t)
                i += 1
                continue
            elif has_next and not _unbalanced_paren_or_quote(next_txt):
                out.append([pid, normalize_ws(t + " " + next_txt), tag])
                i += 2
                continue

        # default
        out.append([pid, t, tag])
        i += 1

    return out


daftarKalimatProposal = [
    [pid, _recover_overmerged_words(kal), tag]
    for pid, kal, tag in daftarKalimatProposal
]

daftarKalimatProposal = cross_glue_items(daftarKalimatProposal)

# Cari kalimat pendek
listKalimatPendek = [k for k in daftarKalimatProposal if len(k[1].split()) <= 5]

print("Jumlah Kalimat:", len(daftarKalimatProposal))
print("Jumlah Link yang disimpan:", len(daftarLink))
# daftarLink sekarang berisi semua URL unik yang dimasking selama proses


Jumlah Paragraf: 5116
Jumlah Kalimat: 24509
Jumlah Link yang disimpan: 124


In [6]:
print("Debug")

Debug


In [8]:
JumlahLatarBelakang = 0
JumlahJudul = 0
JumalahRumusan = 0
JumlahTujuan = 0

for daftar in daftarKalimatProposal :
    if daftar[2] == 'latar_belakang' :
        JumlahLatarBelakang = JumlahLatarBelakang + 1
    elif daftar[2] == 'judul' :
        JumlahJudul = JumlahJudul + 1
    elif daftar[2] == 'rumusan' :
        JumalahRumusan = JumalahRumusan + 1
    elif daftar[2] == 'tujuan' :
        JumlahTujuan = JumlahTujuan + 1
        
print("Jumlah Paragraf Latar Belakang: ", JumlahLatarBelakang)
print("Jumlah Paragraf Judul: ", JumlahJudul)
print("Jumlah Paragraf Rumusan: ", JumalahRumusan)
print("Jumlah Paragraf Tujuan: ", JumlahTujuan)


Jumlah Paragraf Latar Belakang:  19113
Jumlah Paragraf Judul:  1279
Jumlah Paragraf Rumusan:  2407
Jumlah Paragraf Tujuan:  1710


Debugging Isi

In [4]:
# === BUILD dataTuning DARI HASIL AKHIR PIPELINE ===
import re

def _normalize_ws_final(s: str) -> str:
    # ringan saja di tahap akhir
    s = re.sub(r"\s+", " ", str(s)).strip()
    return s

# (opsional) perbaiki typo "Bove? e" → "Bove e"
def _fix_name_typos_final(s: str) -> str:
    return re.sub(r'([A-Za-z])\?\s*([A-Za-z])', r'\1\2', s)

dataTuning = []
for idx, (pid, txt, asp) in enumerate(daftarKalimatProposal, start=1):
    txt = _fix_name_typos_final(_normalize_ws_final(txt))
    dataTuning.append([idx, str(pid), txt.lower(), str(asp).lower()])

print("Jumlah dataTuning:", len(dataTuning))
print("Contoh:", dataTuning[0] if dataTuning else "(kosong)")


Jumlah dataTuning: 24509
Contoh: [1, '26', "'sistem penyensoran gambar dewasa secara otomatis dengan menggunakan normalized cut, fuzzy clustering dan backpropagation'", 'judul']


In [8]:
# === SIMPAN dataTuning -> dataTuning.csv (aman Excel) ===
# Harap: dataTuning = [[idKalimat, idProposal, Text, Aspek], ...]
import re
import pandas as pd

# --- mapping aspek ---
_ASPECT_ALIASES = {
    "judul": "judul", "title": "judul",
    "latar": "latar_belakang", "latar belakang": "latar_belakang", "latar_belakang": "latar_belakang",
    "rumusan": "rumusan_masalah", "rumusan masalah": "rumusan_masalah",
    "rumusan_masalah": "rumusan_masalah", "perumusan masalah": "rumusan_masalah",
    "tujuan": "tujuan", "objective": "tujuan", "objectives": "tujuan",
}
ALLOWED_ASPECTS = {"judul", "latar_belakang", "rumusan_masalah", "tujuan"}

def normalize_aspect(a: str) -> str:
    if a is None: return ""
    key = re.sub(r"\s+", " ", str(a).strip().lower().replace("-", " ").replace("/", " "))
    return _ASPECT_ALIASES.get(key, key.replace(" ", "_"))

# Pilihan: set True jika memang ingin lowercase konten teks di CSV
LOWER_TEXT = False

rows = []
dropped = {"short":0, "empty":0, "aspect":0}

for row in dataTuning:
    if not isinstance(row, (list, tuple)) or len(row) < 4:
        dropped["short"] += 1
        continue
    _sid, _pid, _txt, _asp = row[0], row[1], row[2], row[3]

    txt = "" if _txt is None else str(_txt).strip()
    if not txt:
        dropped["empty"] += 1
        continue

    asp = normalize_aspect(_asp)
    if asp not in ALLOWED_ASPECTS:
        dropped["aspect"] += 1
        continue

    # jaga karakter unicode (—, –, “ ”, dll) apa adanya
    if LOWER_TEXT:
        txt = txt.lower()

    rows.append({"proposal_id": str(_pid), "aspect": asp, "text": txt})

df = pd.DataFrame(rows)
df.insert(0, "sentence_id", range(1, len(df)+1))

# Penting: utf-8-sig supaya Excel tidak mojibake (â€“, â€œ, dll)
df.to_csv("dataTuning.csv", index=False, encoding="utf-8-sig", lineterminator="\n")


print(f"Tersimpan {len(df)} baris. Buang -> short:{dropped['short']} empty:{dropped['empty']} aspect:{dropped['aspect']}")


Tersimpan 24509 baris. Buang -> short:0 empty:0 aspect:0
