<a href="https://colab.research.google.com/github/lisaadnr/normalization-ayat/blob/main/Normalisasi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install python-docx



In [15]:
from docx import Document
import os
import re

def extract_ayat_text(doc_path):
    doc = Document(doc_path)
    ayat_texts = []

    # Kata-kata yang harus dihapus jika muncul di awal kalimat (huruf kapital)
    ignored_keywords = [
        "AL", "AS", "AT", "YAS", "Madaniyyah", "Makkiyyah", "AN", "'ABASA",
        "NUH", "QAF", "AS", "AD", "ASY", "FUSSILAT", "SAD", "GAFIR", "AZ",
        "MUHAMMAD", "ALI", "MARYAM", "TAHA", "AR", "QURAISY", "IBRAHIM",
        "YASIN", "YUNUS", "HUD", "YUSUF", "FATIR", "SABA", "LUQMAN"
    ]

    ignored_phrase = "Dengan nama Allah Yang Maha Pengasih lagi Maha Penyayang"

    # Periksa apakah ini Surah Al-Fatihah berdasarkan nama file
    is_alfatihah = "alfatihah" in os.path.basename(doc_path).lower()

    for para in doc.paragraphs:
        text = para.text.strip()

        # Skip paragraf kosong
        if not text:
            continue

        # Skip teks dengan kata di awal yang termasuk dalam ignored_keywords
        if re.match(rf"^({'|'.join(ignored_keywords)})\b", text):
            continue

        # Hapus frasa "Dengan nama Allah..." jika bukan Surah Al-Fatihah
        if ignored_phrase in text and not is_alfatihah:
            continue

        # Jika paragraf tidak diawali dengan angka dan teksnya bold, lewati
        if not re.match(r'^\d+\.', text) and any(run.bold for run in para.runs):
            continue

        # Skip teks yang diawali dengan "Surah"
        if text.startswith("Surah"):
            continue

        # Hapus tanda kurung kapital (termasuk Unicode) di awal paragraf
        text = re.sub(r'^\([A-ZĀĒĪŌŪṢḌṬẒʿ\s\-]+\)', '', text).strip()

        # Skip seluruh kalimat jika mengandung kata dalam kurung huruf kapital
        if re.search(r'\([A-Z\s]+\)', text):
            continue

        # Hapus catatan kaki (misalnya 1), 2), dst.)
        text = re.sub(r'\d+\)', '', text)

        # Hapus tanda kurung tutup di akhir kalimat
        text = re.sub(r'\s*\)\s*$', '', text)  # Hapus jika `)` di akhir kalimat
        text = re.sub(r'\s*\)\b', '', text)  # Hapus jika `)` di akhir kata

        # Tambahkan teks yang telah dibersihkan ke list ayat_texts
        ayat_texts.append(text.strip())

    # Gabungkan semua ayat menjadi satu teks dan hilangkan spasi/enter di awal dan akhir
    return "\n".join(ayat_texts).strip()

# Path ke folder yang berisi file DOCX
folder_path = '/content/sample_data/ayat'

# Folder output untuk menyimpan file hasil normalisasi
output_folder = '/content/sample_data/mpat'
os.makedirs(output_folder, exist_ok=True)

# Loop melalui setiap file DOCX di folder
for filename in os.listdir(folder_path):
    if filename.endswith('.docx'):
        file_path = os.path.join(folder_path, filename)
        ayat_text = extract_ayat_text(file_path)

        # Simpan hasilnya ke file TXT di folder output
        output_file_path = os.path.join(output_folder, f"{filename.replace('.docx', '')}_normalized.txt")
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(ayat_text)
        print(f"Processed: {filename}")


Processed: 111. al-Lahab.docx
Processed: 34. Saba'.docx
Processed: 82. al-Infitar.docx
Processed: 27. an-Naml.docx
Processed: 49. al-Hujurat.docx
Processed: 90. al-Balad.docx
Processed: 96. al-'Alaq.docx
Processed: 83. al-Mutaffifin.docx
Processed: 17. al-Isra'.docx
Processed: 59. al-Hasyr.docx
Processed: 53. an-Najm.docx
Processed: 32. as-Sajdah.docx
Processed: 3. Ali 'Imran.docx
Processed: 30. ar-Rum.docx
Processed: 109. al-Kafirun.docx
Processed: 55. ar-Rahman.docx
Processed: 63. al-Munafiqun.docx
Processed: 76. al-Insan.docx
Processed: 23. al-Mu'minun.docx
Processed: 4. an-Nisa'.docx
Processed: 100. al-'Adiyat.docx
Processed: 113. al-Falaq.docx
Processed: 25. al-Furqan.docx
Processed: 39. az-Zumar.docx
Processed: 60. al-Mumtahanah.docx
Processed: 20. Taha.docx
Processed: 86. at-Tariq.docx
Processed: 8. al-Anfal.docx
Processed: 58. al-Mujadalah.docx
Processed: 73. al-Muzzammil.docx
Processed: 46. al-Ahqaf.docx
Processed: 101. al-Qari'ah.docx
Processed: 18. al-Kahf.docx
Processed: 38

In [16]:
import os
import zipfile
from google.colab import files

# Path ke folder output
output_folder = '/content/sample_data/mpat'

# Nama file ZIP yang akan dibuat
zip_file_path = '/content/kesepuluh.zip'

# Kompres folder ke dalam ZIP
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    for root, dirs, files_in_dir in os.walk(output_folder):
        for file in files_in_dir:
            # Tambahkan file ke dalam ZIP dengan struktur folder
            zipf.write(os.path.join(root, file),
                       arcname=os.path.relpath(os.path.join(root, file), output_folder))

# Download file ZIP
files.download(zip_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>