# Disini sebagai pemotongan dataset Arabic, Transliteration, dan Validation

In [2]:
import pandas as pd

# **1️⃣ Input and output paths**
quran_path = "pre_quran.csv"
output_path = "quran_len_100.csv"

# **2️⃣ Read the CSV (every column is loaded as text)**
quran_df = pd.read_csv(quran_path, delimiter=";", dtype=str)

# **3️⃣ Split the rows by the character length of the 'kemenag' column**
# Rows whose 'kemenag' is missing have a NaN length and land in neither subset.
kemenag_len = quran_df["kemenag"].str.len()
df_less_equal_100 = quran_df[kemenag_len <= 100]  # length ≤ 100
df_more_than_100 = quran_df[kemenag_len > 100]    # length > 100 (not saved)

# **4️⃣ Save the short rows to a new CSV**
df_less_equal_100.to_csv(output_path, index=False, sep=";", encoding="utf-8-sig")

# **5️⃣ Report the file that was actually written**
print(f"✅ Data dengan panjang 'kemenag' ≤ 100 disimpan di: {output_path}")


✅ Jumlah nilai di kolom 'kemenag' dengan panjang ≤ 100: 3314


In [1]:
import pandas as pd

# Load the per-ayah table and the transcript table
data_df = pd.read_csv("quran_ayas.csv", sep=';')
transcript_df = pd.read_csv("transcript.csv", sep=';')

# Keep only the columns needed from the ayah table
data_selected = data_df.loc[:, ['sura', 'aya', 'kemenag', 'transliteration_id']]

# Left-join the transcript onto the ayah rows, matching on ('sura', 'aya')
merged_df = pd.merge(
    data_selected,
    transcript_df.loc[:, ['sura', 'aya', 'transcript']],
    on=['sura', 'aya'],
    how='left',
)

# Write the combined table to pre_quran.csv
merged_df.to_csv("pre_quran.csv", index=False, sep=';')

print("File berhasil disimpan: pre_quran.csv")


File berhasil disimpan: pre_quran.csv


### Pemotongan sesuai panjang letter (100)

In [3]:
import pandas as pd

def split_arabic_text(text):
    """Split a text into up to four word-aligned parts of roughly equal length.

    Length is measured in characters with spaces removed. Texts of at most
    100 characters stay whole; up to 160 are cut in two, up to 230 in three,
    and anything longer in four. Cuts only happen between words.

    Args:
        text: The text to split. NaN/None is treated as missing.

    Returns:
        A non-empty list of strings. Missing or empty input yields ``[""]``.
        Fewer parts than planned are returned when the words run out early,
        so no part other than that ``[""]`` placeholder is ever empty.
    """
    if pd.isna(text):  # Missing value: single empty part
        return [""]

    text = str(text)  # Make sure we work on a string
    total_length = len(text.replace(" ", ""))  # Length without spaces

    # Number of parts is driven by the total length
    if total_length <= 100:
        num_parts = 1
    elif total_length <= 160:
        num_parts = 2
    elif total_length <= 230:
        num_parts = 3
    else:
        num_parts = 4

    # Target size of each part (spaces not counted)
    segment_length = total_length // num_parts
    segments = []
    current_segment = []
    current_length = 0

    for word in text.split():
        current_length += len(word)
        current_segment.append(word)

        # Close the part once it is long enough, but always leave the last
        # part to be closed after the loop.
        if current_length >= segment_length and len(segments) < num_parts - 1:
            segments.append(" ".join(current_segment))
            current_segment = []
            current_length = 0

    # Remaining words form the last part. When earlier parts already consumed
    # every word there is nothing left, so do not emit an empty part.
    if current_segment or not segments:
        segments.append(" ".join(current_segment))

    return segments

# Load pre_quran.csv
file_path = "pre_quran.csv"
df = pd.read_csv(file_path, sep=';')

# The 'kemenag' column is required
if "kemenag" not in df.columns:
    raise ValueError("Kolom 'kemenag' tidak ditemukan dalam file CSV.")

# One output row per (input row, part): the input row is copied and extended
# with the part text and its 1-based part number.
new_data = []
for _, row in df.iterrows():
    for part_index, part in enumerate(split_arabic_text(row["kemenag"]), start=1):
        new_row = row.copy()
        new_row["kemenag_baru"] = part
        new_row["Bagian"] = part_index
        new_data.append(new_row)

# Assemble the expanded rows into a frame and write it out
new_df = pd.DataFrame(new_data)
output_file = "quran_v1.csv"
new_df.to_csv(output_file, index=False, sep=';')

print(f"File berhasil disimpan: {output_file}")


File berhasil disimpan: quran_v1.csv


In [1]:
pip install arab-transliterator

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting arab-transliterator
  Downloading arab_transliterator-1.0.3-py3-none-any.whl.metadata (5.7 kB)
Downloading arab_transliterator-1.0.3-py3-none-any.whl (8.0 kB)
Installing collected packages: arab-transliterator
Successfully installed arab-transliterator-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [55]:
####fix

import csv
import re
from arab_transliterator.transliterator import ArabTransliterator

# Initialise the transliterator
Trans = ArabTransliterator()

# Input and output file names
input_file = "quran_v1.csv"  # Change to match your file
output_csv = "quran_transliteration.csv"
# Literal substitutions applied to the raw transliteration, in this order
translit_map = {
    "dh": "ż",
    "sh": "sy",
    "ʿ": "'",
    "gh": "g",
    "th": "ṡ",
    #"'":"`",
}
def adjust_transliteration(text):
    """Apply the substitution map and the spacing clean-ups to a transliteration.

    After each entry of ``translit_map`` is substituted, the whole list of
    clean-up rules below is applied once, in order.
    """
    # (pattern, replacement) pairs, in application order
    cleanup_rules = (
        (r" l-", "l"),
        (r" s-", "s-"),
        (r" w-", "w-"),
        (r" d-", "d-"),
        (r" ṣ-", "ṣ-"),
        (r" z-", "z-"),
        (r" n-", "n-"),
        (r" m-", "m-"),
        (r" sy-", "sy-"),
        (r" j-", "j-"),
        (r" g-", "g-"),
        (r" f-", "f-"),
        (r" b-", "b-"),
        (r" ẓ-", "ẓ-"),
        (r" t-", "t-"),
        (r" r-", "r-"),
        (r" '-", "'-"),
        (r" ṡ-", "ṡ-"),
        (r" g-", "g-"),
        (r" ż-", "ż-"),
        (r" h-", "h-"),
        (r" ḍ-", "ḍ-"),
        (r" y-", "y-"),
        (r" ṭ-", "ṭ-"),
        (r" ḥ-", "ḥ-"),
        (r" q-", "q-"),
        (r"  ", " "),
        ("āa", "ā a"),
    )
    for source, target in translit_map.items():
        text = text.replace(source, target)
        for pattern, replacement in cleanup_rules:
            text = re.sub(pattern, replacement, text)
    return text
# Accumulator for the transliteration result rows

quran_data = []

# Regex for the pause (waqf) annotation marks that are stripped from the text
wakaf_pattern = re.compile(r'[\u06DA-\u06DC\u06DF-\u06E8\u06EA-\u06ED]')

# Regexes detecting the diacritic at the very end of the text.
# Each pattern is anchored with `$`; an unanchored alternation here would
# also strip tanwin from the middle of the text.
tanwin_kasrah = re.compile(r'ٍ$')
tanwin_fathah = re.compile(r'ً$')
tanwin_dammah = re.compile(r'ٌ$')
kasrah_akhir = re.compile(r'ِ$')
fathah_akhir = re.compile(r'َ$')
dammah_akhir = re.compile(r'ُ$')

def waqafkan_akhir(ayah):
    """Rewrite the final diacritic of ``ayah`` for reading with a pause.

    Steps, in order:
      1. remove every character matched by ``wakaf_pattern``;
      2. a final tanwin fathah becomes an alif, a final tanwin kasrah or
         tanwin dammah is dropped;
      3. a final kasrah or dammah is dropped and a final fathah becomes an alif.

    Only the last character of the text is affected by steps 2 and 3.

    Args:
        ayah: Arabic text (str).

    Returns:
        The adjusted text.
    """
    ayah = wakaf_pattern.sub('', ayah)  # Strip pause marks

    # Final tanwin
    if tanwin_fathah.search(ayah):
        ayah = tanwin_fathah.sub('ا', ayah)
    else:
        ayah = tanwin_kasrah.sub('', ayah)
        ayah = tanwin_dammah.sub('', ayah)

    # Final short vowel
    ayah = kasrah_akhir.sub('', ayah)
    ayah = dammah_akhir.sub('', ayah)
    # NOTE(review): a final fathah is lengthened to an alif rather than
    # dropped like kasrah/dammah; kept as-is, confirm this is intended.
    ayah = fathah_akhir.sub('ا', ayah)

    return ayah

# Read the input CSV
with open(input_file, "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=";")
    header = next(reader)  # Header row; data rows are read by position below

    # Keep every row in memory so each row can look at the one after it
    quran_raw_data = list(reader)

# Walk the rows; 'Bagian' of the current and the next row tells whether the
# ayah was split into several parts.
for i, row in enumerate(quran_raw_data):
    if len(row) < 6:  # Too short to be a data row
        continue
    # Index 6 ('Bagian') is read below, so it must exist and be numeric
    if len(row) < 7 or not row[6].isdigit():
        print(f"Baris tidak valid: {row}")
        continue

    sura = int(row[0])
    aya = int(row[1])
    arabic_text = row[5]  # Text of this part (6th column)
    trans_id = row[3]     # Reference transliteration of the whole ayah (4th column)

    bagian = int(row[6])
    if i + 1 >= len(quran_raw_data):
        # Last row of the file: nothing follows, so a part 1 here is an
        # unsplit ayah, exactly as if a new ayah started next.
        next_bagian = 1
    else:
        next_row = quran_raw_data[i + 1]
        next_bagian = int(next_row[6]) if len(next_row) > 6 and next_row[6].isdigit() else 0

    if bagian == 1 and next_bagian == 1:
        # Unsplit ayah: the reference transliteration applies as-is
        transliteration = trans_id
    else:
        # Part of a split ayah: adjust the ending for a pause, then transliterate
        arabic_text_waqaf = waqafkan_akhir(arabic_text)

        try:
            arabic_text_waqaf = arabic_text_waqaf.replace("ٰ","َا").replace("ٰ","َا")
            transliteration = Trans.translate(arabic_text_waqaf)
            transliteration = transliteration.replace("'","`")
            transliteration = adjust_transliteration(transliteration)
        except Exception as e:
            # Best effort: report the failing ayah and store an empty result
            print(f"Kesalahan transliterasi pada Surah {sura} Ayat {aya}: {e}")
            transliteration = ""

    # Collect the output row
    quran_data.append([sura, aya,bagian, arabic_text, transliteration])

# Write the result CSV
with open(output_csv, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["Sura", "Aya", "Bagian" , "Arabic", "Transliteration"])
    writer.writerows(quran_data)

print(f"Data transliterasi telah disimpan di '{output_csv}'.")


Kesalahan transliterasi pada Surah 2 Ayat 26: list index out of range
Kesalahan transliterasi pada Surah 6 Ayat 102: list index out of range
Kesalahan transliterasi pada Surah 6 Ayat 154: list index out of range
Kesalahan transliterasi pada Surah 9 Ayat 103: list index out of range
Kesalahan transliterasi pada Surah 14 Ayat 4: list index out of range
Kesalahan transliterasi pada Surah 14 Ayat 17: list index out of range
Kesalahan transliterasi pada Surah 23 Ayat 27: list index out of range
Kesalahan transliterasi pada Surah 39 Ayat 5: list index out of range
Kesalahan transliterasi pada Surah 41 Ayat 12: list index out of range
Kesalahan transliterasi pada Surah 65 Ayat 12: list index out of range
Data transliterasi telah disimpan di 'quran_transliteration.csv'.


In [65]:
from arab_transliterator.transliterator import ArabTransliterator

def transliterate_with_diacritics(text):
    """Transliterate ``text`` and replace any leftover Arabic diacritics.

    The text is passed to ``ArabTransliterator.translate``; every diacritic
    listed in ``diacritic_map`` that still appears in the result is then
    replaced by its Latin counterpart. Errors raised by the transliterator
    propagate to the caller.
    """
    trans = ArabTransliterator()
    transliterated_text = trans.translate(text)

    # Replacements run in dictionary order, so every value must already be
    # final: a value that is itself a key handled earlier would be left behind.
    diacritic_map = {
        "ً": "an", "ٌ": "un", "ٍ": "in",
        "َ": "a", "ُ": "u", "ِ": "i",
        "ّ": "",  # Shadda is simply dropped here
        "ْ": "'",
        "ٰ": "a"  # Dagger alif: written directly as "a" instead of an Arabic fatha
    }

    # Substitute each leftover diacritic
    for arabic_char, translit in diacritic_map.items():
        transliterated_text = transliterated_text.replace(arabic_char, translit)

    return transliterated_text

arabic_text = "الَّذِيْنَ كَفَرُوْا فَيَقُوْلُوْنَ مَاذَآ اَرَادَ اللَّاهُ بِهَاذَا مَثَلا ۘ يُضِلّ"
print(transliterate_with_diacritics(arabic_text))


IndexError: list index out of range

In [64]:
# Sample text used to probe the replacement applied before transliteration
arabic_text = "الَّذِيْنَ كَفَرُوْا فَيَقُوْلُوْنَ مَاذَآ اَرَادَ اللَّاهُ بِهَاذَا مَثَلا ۘ يُضِلّ"
# Replace each occurrence of the first mark with the two-character sequence
test = arabic_text.replace("ٰ","َا")
print(test)

الَّذِيْنَ كَفَرُوْا فَيَقُوْلُوْنَ مَاذَآ اَرَادَ اللَّاهُ بِهَاذَا مَثَلا ۘ يُضِلّ


In [73]:
from arab_transliterator.transliterator import ArabTransliterator

def transliterate_with_diacritics(text):
    """Transliterate ``text`` and replace any leftover Arabic diacritics.

    Returns an empty string, after printing the offending text, when the
    transliterator raises ``IndexError``.
    """
    trans = ArabTransliterator()

    try:
        transliterated_text = trans.translate(text)
    except IndexError as e:
        print(f"Error transliterating: {text} -> {e}")
        return ""

    # Replacements run in dictionary order, so every value must already be
    # final: a value that is itself a key handled earlier would be left behind.
    diacritic_map = {
        "ً": "an", "ٌ": "un", "ٍ": "in",
        "َ": "a", "ُ": "u", "ِ": "i",
        "ّ": "",  # Shadda is simply dropped here
        "ْ": "'",
        "ٰ": "a"  # Dagger alif: written directly as "a" instead of an Arabic fatha
    }

    for arabic_char, translit in diacritic_map.items():
        transliterated_text = transliterated_text.replace(arabic_char, translit)

    return transliterated_text

arabic_text = "الَّذِيْنَ كَفَرُوْا فَيَقُوْلُوْنَ مَاذَآ اَرَادَ اللَّاهُ بِهَاذَا مَثَلا ۘ يُضِلّ"
print(transliterate_with_diacritics(arabic_text))
# Dump every code point of the sample to inspect the characters involved
for i, char in enumerate(arabic_text):
    print(f"Index {i}: {char} (Unicode: {ord(char)})")

Error transliterating: الَّذِيْنَ كَفَرُوْا فَيَقُوْلُوْنَ مَاذَآ اَرَادَ اللَّاهُ بِهَاذَا مَثَلا ۘ يُضِلّ -> list index out of range

Index 0: ا (Unicode: 1575)
Index 1: ل (Unicode: 1604)
Index 2: ّ (Unicode: 1617)
Index 3: َ (Unicode: 1614)
Index 4: ذ (Unicode: 1584)
Index 5: ِ (Unicode: 1616)
Index 6: ي (Unicode: 1610)
Index 7: ْ (Unicode: 1618)
Index 8: ن (Unicode: 1606)
Index 9: َ (Unicode: 1614)
Index 10:   (Unicode: 32)
Index 11: ك (Unicode: 1603)
Index 12: َ (Unicode: 1614)
Index 13: ف (Unicode: 1601)
Index 14: َ (Unicode: 1614)
Index 15: ر (Unicode: 1585)
Index 16: ُ (Unicode: 1615)
Index 17: و (Unicode: 1608)
Index 18: ْ (Unicode: 1618)
Index 19: ا (Unicode: 1575)
Index 20:   (Unicode: 32)
Index 21: ف (Unicode: 1601)
Index 22: َ (Unicode: 1614)
Index 23: ي (Unicode: 1610)
Index 24: َ (Unicode: 1614)
Index 25: ق (Unicode: 1602)
Index 26: ُ (Unicode: 1615)
Index 27: و (Unicode: 1608)
Index 28: ْ (Unicode: 1618)
Index 29: ل (Unicode: 1604)
Index 30: ُ (Unicode: 1615)
Index 31:

In [5]:
import re
from arab_transliterator.transliterator import ArabTransliterator

def remove_diacritics(text):
    """Return ``text`` unchanged; both stripping steps below are commented out."""
    # Remove harakat (Arabic diacritic marks)
    #text = re.sub(r"[\u0617-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]", "", text)
    # Remove uncommon characters such as ۘ
    #text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  
    return text
def normalize_shaddah(text):
    """Replace every word character directly followed by a shadda with that character doubled."""
    # e.g. "لّ" becomes "لل"
    return re.sub(r"(\w)ّ", lambda hit: hit.group(1) * 2, text)
arabic_text = "الَّذِيْنَ كَفَرُوْا فَيَقُوْلُوْنَ مَاذَآ اَرَادَ اللَّاهُ بِهَاذَا مَثَلا ۘ يُضِلّ"
# Optional normalisation steps, currently disabled:
#arabic_text = normalize_shaddah(arabic_text)
#arabic_text = remove_diacritics(arabic_text)

# Transliterate the sample; an IndexError is reported instead of raised
trans = ArabTransliterator()
try:
    transliterated_text = trans.translate(arabic_text)
except IndexError as e:
    print(f"Error: {e}")
else:
    print(transliterated_text)

Error: list index out of range


In [6]:
####fix

import csv
import re
from arab_transliterator.transliterator import ArabTransliterator
import unicodedata

# Initialise the transliterator
Trans = ArabTransliterator()

# Input and output file names
input_file = "quran_v1.csv"  # Change to match your file
output_csv = "quran_transliteration.csv"
# Literal substitutions applied to the raw transliteration, in this order
translit_map = {
    "dh": "ż",
    "sh": "sy",
    "ʿ": "'",
    "gh": "g",
    "th": "ṡ",
    #"'":"`",
}
# Disabled helper kept for reference:
#def normalize_shaddah(text):
#    text = re.sub(r"(\w)ّ", r"\1\1", text)  # turn "لّ" into "لل"
#    return text
def adjust_transliteration(text):
    """Apply the substitution map and the spacing clean-ups to a transliteration.

    After each entry of ``translit_map`` is substituted, the whole list of
    clean-up rules below is applied once, in order.
    """
    # (pattern, replacement) pairs, in application order
    cleanup_rules = (
        (r" l-", "l"),
        (r" s-", "s-"),
        (r" w-", "w-"),
        (r" d-", "d-"),
        (r" ṣ-", "ṣ-"),
        (r" z-", "z-"),
        (r" n-", "n-"),
        (r" m-", "m-"),
        (r" sy-", "sy-"),
        (r" j-", "j-"),
        (r" g-", "g-"),
        (r" f-", "f-"),
        (r" b-", "b-"),
        (r" ẓ-", "ẓ-"),
        (r" t-", "t-"),
        (r" r-", "r-"),
        (r" '-", "'-"),
        (r" ṡ-", "ṡ-"),
        (r" g-", "g-"),
        (r" ż-", "ż-"),
        (r" h-", "h-"),
        (r" ḍ-", "ḍ-"),
        (r" y-", "y-"),
        (r" ṭ-", "ṭ-"),
        (r" ḥ-", "ḥ-"),
        (r" q-", "q-"),
        (r"  ", " "),
        ("āa", "ā a"),
    )
    for source, target in translit_map.items():
        text = text.replace(source, target)
        for pattern, replacement in cleanup_rules:
            text = re.sub(pattern, replacement, text)
    return text
# Accumulator for the transliteration result rows

quran_data = []

# Regex for the pause (waqf) annotation marks that are stripped from the text
wakaf_pattern = re.compile(r'[\u06DA-\u06DC\u06DF-\u06E8\u06EA-\u06ED]')

# Regexes detecting the diacritic at the very end of the text.
# Each pattern is anchored with `$`; an unanchored alternation here would
# also strip tanwin from the middle of the text.
tanwin_kasrah = re.compile(r'ٍ$')
tanwin_fathah = re.compile(r'ً$')
tanwin_dammah = re.compile(r'ٌ$')
kasrah_akhir = re.compile(r'ِ$')
fathah_akhir = re.compile(r'َ$')
dammah_akhir = re.compile(r'ُ$')

def waqafkan_akhir(ayah):
    """Rewrite the final diacritic of ``ayah`` for reading with a pause.

    Steps, in order:
      1. remove every character matched by ``wakaf_pattern``;
      2. a final tanwin fathah becomes an alif, a final tanwin kasrah or
         tanwin dammah is dropped;
      3. a final kasrah or dammah is dropped and a final fathah becomes an alif.

    Only the last character of the text is affected by steps 2 and 3.

    Args:
        ayah: Arabic text (str).

    Returns:
        The adjusted text.
    """
    ayah = wakaf_pattern.sub('', ayah)  # Strip pause marks

    # Final tanwin
    if tanwin_fathah.search(ayah):
        ayah = tanwin_fathah.sub('ا', ayah)
    else:
        ayah = tanwin_kasrah.sub('', ayah)
        ayah = tanwin_dammah.sub('', ayah)

    # Final short vowel
    ayah = kasrah_akhir.sub('', ayah)
    ayah = dammah_akhir.sub('', ayah)
    # NOTE(review): a final fathah is lengthened to an alif rather than
    # dropped like kasrah/dammah; kept as-is, confirm this is intended.
    ayah = fathah_akhir.sub('ا', ayah)

    return ayah

# Read the input CSV
with open(input_file, "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=";")
    header = next(reader)  # Header row; data rows are read by position below

    # Keep every row in memory so each row can look at the one after it
    quran_raw_data = list(reader)

# Walk the rows; 'Bagian' of the current and the next row tells whether the
# ayah was split into several parts.
for i, row in enumerate(quran_raw_data):
    if len(row) < 6:  # Too short to be a data row
        continue

    sura = int(row[0])
    aya = int(row[1])
    arabic_text = row[5]  # Text of this part (6th column)
    trans_id = row[3]     # Reference transliteration of the whole ayah (4th column)

    # 'Bagian' (7th column) must exist and be numeric
    if len(row) < 7 or not row[6].isdigit():
        print(f"Baris tidak valid di Surah {sura} Ayat {aya}: {row}")
        continue

    bagian = int(row[6])
    if i + 1 >= len(quran_raw_data):
        # Last row of the file: nothing follows, so a part 1 here is an
        # unsplit ayah, exactly as if a new ayah started next.
        next_bagian = 1
    elif len(quran_raw_data[i+1]) > 6 and quran_raw_data[i+1][6].isdigit():
        next_bagian = int(quran_raw_data[i+1][6])
    else:
        next_bagian = 0

    if bagian == 1 and next_bagian == 1:
        # Unsplit ayah: the reference transliteration applies as-is
        transliteration = trans_id
    else:
        # Part of a split ayah: transliterate the part text itself.
        # No pause (waqf) adjustment is applied in this version.
        arabic_text_waqaf = arabic_text

        try:
            arabic_text_waqaf = arabic_text_waqaf.replace("ٰ","َا").replace("ٰ","َا")
            #arabic_text_waqaf = re.sub(r"[^\u0600-\u06FF\s]", "", arabic_text_waqaf)
            #arabic_text_waqaf = normalize_shaddah(arabic_text_waqaf)
            #arabic_text_waqaf = unicodedata.normalize("NFKD", arabic_text_waqaf)
            transliteration = Trans.translate(arabic_text_waqaf)
            transliteration = transliteration.replace("'","`")
            transliteration = adjust_transliteration(transliteration)
        except Exception as e:
            # Best effort: report the failing ayah with its input text and
            # store an empty result. The output variable is not printed: at
            # this point it is either unset or left over from an earlier row.
            print(f"Kesalahan transliterasi pada Surah {sura} Ayat {aya}: {e}")
            print(arabic_text_waqaf)
            transliteration = ""

    # Collect the output row
    quran_data.append([sura, aya,bagian, arabic_text, transliteration])

# Write the result CSV
with open(output_csv, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["Sura", "Aya", "Bagian" , "Arabic", "Transliteration"])
    writer.writerows(quran_data)

print(f"Data transliterasi telah disimpan di '{output_csv}'.")


Data transliterasi telah disimpan di 'quran_transliteration.csv'.


In [36]:
### transliteration updated

import pandas as pd
from difflib import SequenceMatcher

# **1️⃣ File locations**
transliteration_path = "quran_transliteration.csv"
quran_v1_path = "quran_v1.csv"

# **2️⃣ Load both tables**
transliteration_df = pd.read_csv(transliteration_path, sep=";")
quran_v1_df = pd.read_csv(quran_v1_path, sep=";")

# **3️⃣ Force the expected column names**
transliteration_df.columns = ["Sura", "Aya", "Bagian", "Arabic", "Transliteration"]
quran_v1_df.columns = ["sura", "aya", "kemenag", "transliteration_id", "transcript", "kemenag_baru", "Bagian"]

# **4️⃣ Key columns as strings so both tables compare equal**
for frame, key_columns in ((transliteration_df, ["Sura", "Aya"]), (quran_v1_df, ["sura", "aya"])):
    for column in key_columns:
        frame[column] = frame[column].astype(str)

# **5️⃣ Map (sura, aya) to transliteration_id; later rows overwrite earlier ones**
transliteration_dict = dict(
    zip(
        zip(quran_v1_df["sura"], quran_v1_df["aya"]),
        quran_v1_df["transliteration_id"],
    )
)

# **6️⃣ Function to find the most similar transliteration substring**
def find_most_similar(original_text, full_text):
    """Return the substring of ``full_text`` most similar to ``original_text``.

    Every window of ``len(original_text)`` characters in ``full_text`` is
    scored with ``SequenceMatcher.ratio``; the first window with the highest
    score wins.

    Args:
        original_text: Text to locate. Non-string values (e.g. the NaN pandas
            produces for an empty CSV cell) are treated as "nothing to match".
        full_text: Text to search in. Non-string values are treated the same way.

    Returns:
        The best-matching window, or ``""`` when either argument is not a
        string, when ``original_text`` is longer than ``full_text``, or when
        no window shares anything with ``original_text``.
    """
    if not isinstance(original_text, str) or not isinstance(full_text, str):
        return ""

    # An exact occurrence is the first best-scoring window: skip the scan.
    if original_text in full_text:
        return original_text

    window = len(original_text)
    best_match = ""
    best_ratio = 0.0

    for start in range(len(full_text) - window + 1):
        candidate = full_text[start : start + window]
        ratio = SequenceMatcher(None, original_text, candidate).ratio()
        if ratio > best_ratio:  # strict: ties keep the earliest window
            best_ratio = ratio
            best_match = candidate

    return best_match

# **7️⃣ Replace each transliteration with its closest slice of the reference**
for idx, row in transliteration_df.iterrows():
    key = (row["Sura"], row["Aya"])
    if key not in transliteration_dict:
        continue

    # Closest same-length window of the reference transliteration
    transliteration_df.at[idx, "Transliteration"] = find_most_similar(
        row["Transliteration"], transliteration_dict[key]
    )

# **8️⃣ Save the updated table**
output_path = "quran_transliteration_updated.csv"
transliteration_df.to_csv(output_path, index=False, sep=";", encoding="utf-8-sig")

print(f"✅ Transliterasi diperbarui dan disimpan ke {output_path}")


✅ Transliterasi diperbarui dan disimpan ke quran_transliteration_updated.csv


In [12]:
import pandas as pd
from difflib import SequenceMatcher
import re

# **1️⃣ File locations**
transliteration_path = "quran_transliteration_updated.csv"
quran_v1_path = "quran_v1.csv"

# **2️⃣ Load both tables**
transliteration_df = pd.read_csv(transliteration_path, sep=";")
quran_v1_df = pd.read_csv(quran_v1_path, sep=";")

# **3️⃣ Force the expected column names**
transliteration_df.columns = ["Sura", "Aya", "Bagian", "Arabic", "Transliteration"]
quran_v1_df.columns = ["sura", "aya", "kemenag", "transliteration_id", "transcript", "kemenag_baru", "Bagian"]

# **4️⃣ Key columns as strings so both tables compare equal**
for frame, key_columns in ((transliteration_df, ["Sura", "Aya"]), (quran_v1_df, ["sura", "aya"])):
    for column in key_columns:
        frame[column] = frame[column].astype(str)

# **5️⃣ Map (sura, aya) to the transcript with hyphens and whitespace removed**
transcript_dict = {
    (sura, aya): re.sub(r"[-\s]", "", transcript)
    for sura, aya, transcript in zip(
        quran_v1_df["sura"], quran_v1_df["aya"], quran_v1_df["transcript"]
    )
}

# **6️⃣ Function to find the most similar transcript substring**
def find_most_similar(original_text, full_text):
    """Return the substring of ``full_text`` most similar to ``original_text``.

    Hyphens and whitespace are removed from ``original_text`` first. Every
    window of that cleaned length in ``full_text`` is scored with
    ``SequenceMatcher.ratio``; the first window with the highest score wins.

    Args:
        original_text: Text to locate. Non-string values (e.g. the NaN pandas
            produces for an empty CSV cell) are treated as "nothing to match".
        full_text: Text to search in; used as given. Non-string values are
            treated the same way.

    Returns:
        The best-matching window, or ``""`` when either argument is not a
        string, when the cleaned text is longer than ``full_text``, or when
        no window shares anything with it.
    """
    if not isinstance(original_text, str) or not isinstance(full_text, str):
        return ""

    # Drop "-" and whitespace from the text being located
    cleaned_original = re.sub(r"[-\s]", "", original_text)

    # An exact occurrence is the first best-scoring window: skip the scan.
    if cleaned_original in full_text:
        return cleaned_original

    window = len(cleaned_original)
    best_match = ""
    best_ratio = 0.0

    for start in range(len(full_text) - window + 1):
        candidate = full_text[start : start + window]
        ratio = SequenceMatcher(None, cleaned_original, candidate).ratio()
        if ratio > best_ratio:  # strict: ties keep the earliest window
            best_ratio = ratio
            best_match = candidate

    return best_match

# **7️⃣ Add the Transcript column**
transliteration_df["Transcript"] = ""

for idx, row in transliteration_df.iterrows():
    key = (row["Sura"], row["Aya"])
    if key not in transcript_dict:
        continue

    # Closest window of the cleaned transcript for this row's transliteration
    transliteration_df.at[idx, "Transcript"] = find_most_similar(
        row["Transliteration"], transcript_dict[key]
    )

# **8️⃣ Save the updated table**
output_path = "quran_transliteration_with_transcript.csv"
transliteration_df.to_csv(output_path, index=False, sep=";", encoding="utf-8-sig")

print(f"✅ Kolom 'transcript' ditambahkan dan disimpan ke {output_path}")

✅ Kolom 'transcript' ditambahkan dan disimpan ke quran_transliteration_with_transcript.csv


In [52]:
import pandas as pd
import re
from difflib import SequenceMatcher

# **1️⃣ File locations**
transliteration_path = "quran_transliteration_updated.csv"
quran_v1_path = "quran_v1.csv"

# **2️⃣ Load both tables**
transliteration_df = pd.read_csv(transliteration_path, sep=";")
quran_v1_df = pd.read_csv(quran_v1_path, sep=";")

# **3️⃣ Force the expected column names**
transliteration_df.columns = ["Sura", "Aya", "Bagian", "Arabic", "Transliteration"]
quran_v1_df.columns = ["sura", "aya", "kemenag", "transliteration_id", "transcript", "kemenag_baru", "Bagian"]

# **4️⃣ Key columns as strings so both tables compare equal**
for frame, key_columns in ((transliteration_df, ["Sura", "Aya"]), (quran_v1_df, ["sura", "aya"])):
    for column in key_columns:
        frame[column] = frame[column].astype(str)

# **5️⃣ Map (sura, aya) to the raw transcript; later rows overwrite earlier ones**
transcript_dict = dict(
    zip(
        zip(quran_v1_df["sura"], quran_v1_df["aya"]),
        quran_v1_df["transcript"],
    )
)

# **6️⃣ Function to Find Best Matching Substring in Transcript**
def find_best_match(transliteration, full_transcript):
    """Return the slice of ``full_transcript`` around its longest match with ``transliteration``.

    Both texts are compared with hyphens and whitespace removed. The longest
    common block is located with ``SequenceMatcher.find_longest_match`` and
    widened by up to five cleaned characters on each side. The widened range
    is then mapped back to positions in the original ``full_transcript``, so
    the returned slice keeps the transcript's own spacing.

    Args:
        transliteration: Text to locate.
        full_transcript: Text to search in.

    Returns:
        The matching slice of ``full_transcript``, or ``""`` when either
        argument is not a string or the texts share no character.
    """
    if not isinstance(full_transcript, str) or not isinstance(transliteration, str):
        return ""

    separator = re.compile(r"[-\s]")
    translit_clean = separator.sub("", transliteration)

    # Position in full_transcript of every character that survives cleaning;
    # kept_positions[k] is where cleaned character k really sits.
    kept_positions = [
        pos for pos, char in enumerate(full_transcript) if not separator.match(char)
    ]
    transcript_clean = "".join(full_transcript[pos] for pos in kept_positions)

    match = SequenceMatcher(None, transcript_clean, translit_clean).find_longest_match(
        0, len(transcript_clean), 0, len(translit_clean)
    )

    if match.size == 0:
        return ""  # Nothing in common

    # Widen the match by five cleaned characters on each side
    start = max(0, match.a - 5)
    end = min(len(transcript_clean), match.a + match.size + 5)

    # Translate the cleaned range [start, end) back to original positions
    return full_transcript[kept_positions[start] : kept_positions[end - 1] + 1]

# **7️⃣ Add the Transcript column**
transliteration_df["Transcript"] = ""

for idx, row in transliteration_df.iterrows():
    key = (row["Sura"], row["Aya"])
    if key not in transcript_dict:
        continue

    # Slice of the transcript that best matches this row's transliteration
    transliteration_df.at[idx, "Transcript"] = find_best_match(
        row["Transliteration"], transcript_dict[key]
    )

# **8️⃣ Save the updated table**
output_path = "quran_transliteration_with_transcript_fixed.csv"
transliteration_df.to_csv(output_path, index=False, sep=";", encoding="utf-8-sig")

print(f"✅ Kolom 'Transcript' telah diperbaiki dan disimpan ke {output_path}")


end_word 24
bis mil lā hir raḥ mā ni
end_word 27
al ḥam du lil lā hi rab bil
end_word 15
ar raḥ mā nir r
end_word 15
mā li ki yau mi
end_word 29
iy yā ka na' bu du wa iy yā k
end_word 22
ih di naṣ ṣi rā ṭal mu
end_word 60
ṣi rā ṭal la żī na an 'am ta 'a lai him gai ril mag ḍụ bi 'a
end_word 10
a lif lām 
end_word 40
żā li kal ki tā bu lā rai ba fīh , hu da
end_word 67
al la żī na yu` mi nụ na bil gai bi wa yu qī mụ naṣ ṣa lā ta wa mim
end_word 43
wal la żī na yu` mi nụ na bi mā un zi la i 
end_word 78
n zi la i lai ka wa mā un zi la ming qab lik 
end_word 49
u lā `i ka 'a lā hu dam mir rab bi him wa u lā `i
end_word 67
in nal la żī na ka fa rụ sa wā `un 'a lai him a an żar ta hum am la
end_word 51
ta mall llā hu 'a lā qu lụ bi him wa 'a lā sam 
end_word 86
a lā sam 'i him , wa 'a lā ab ṣā ri him gi s
end_word 67
wa mi nan nā si may ya qụ lu ā man nā bil lā hi wa bil yau mil ā kh
end_word 73
i 'ụ nall llā ha wal la żī na ā ma nụ , wa mā yakh da 'ụ na il lā
end_word 35
fī qu lụ bi him ma

In [54]:
import pandas as pd
import re
from difflib import SequenceMatcher

# **1️⃣ File locations**
transliteration_path = "quran_transliteration_updated.csv"
quran_v1_path = "quran_v1.csv"

# **2️⃣ Load both tables**
transliteration_df = pd.read_csv(transliteration_path, sep=";")
quran_v1_df = pd.read_csv(quran_v1_path, sep=";")

# **3️⃣ Force the expected column names**
transliteration_df.columns = ["Sura", "Aya", "Bagian", "Arabic", "Transliteration"]
quran_v1_df.columns = ["sura", "aya", "kemenag", "transliteration_id", "transcript", "kemenag_baru", "Bagian"]

# **4️⃣ Key columns as strings so both tables compare equal**
for frame, key_columns in ((transliteration_df, ["Sura", "Aya"]), (quran_v1_df, ["sura", "aya"])):
    for column in key_columns:
        frame[column] = frame[column].astype(str)

# **5️⃣ Map (sura, aya) to the raw transcript; later rows overwrite earlier ones**
transcript_dict = dict(
    zip(
        zip(quran_v1_df["sura"], quran_v1_df["aya"]),
        quran_v1_df["transcript"],
    )
)

# **6️⃣ Function to Find Best Matching Substring in Transcript**
def find_best_match(transliteration, full_transcript):
    """Return the part of ``full_transcript`` that lines up with ``transliteration``.

    Both texts are compared with whitespace removed. The longest common block
    is located with ``SequenceMatcher.find_longest_match``; starting at that
    block's position in the transcript, as many letters as the cleaned
    transliteration has are copied out of ``full_transcript``, keeping the
    whitespace that lies between them.

    Args:
        transliteration: Text to locate.
        full_transcript: Text to search in.

    Returns:
        The extracted slice (no leading or trailing whitespace), or ``""``
        when either argument is not a string or the texts share no character.
    """
    if not isinstance(full_transcript, str) or not isinstance(transliteration, str):
        return ""

    # Compare without whitespace
    translit_clean = re.sub(r"\s", "", transliteration)
    transcript_clean = re.sub(r"\s", "", full_transcript)

    match = SequenceMatcher(None, transcript_clean, translit_clean).find_longest_match(0, len(transcript_clean), 0, len(translit_clean))

    if match.size == 0:
        return ""  # Nothing in common

    # NOTE(review): the window starts at the matched block itself (match.a),
    # not at match.a - match.b; confirm whether alignment to the start of the
    # transliteration is wanted.
    start = match.a               # first letter to copy (0-based, whitespace not counted)
    length = len(translit_clean)  # number of letters to copy

    result = []
    count = 0  # letters seen so far, so the current letter has 0-based index count - 1

    for char in full_transcript:
        if char.strip():  # letters advance the counter, whitespace does not
            count += 1
        # Letters with index in [start, start + length) satisfy
        # start < count <= start + length; whitespace is kept only once the
        # first wanted letter has been copied.
        if start < count <= start + length:
            result.append(char)
        if count >= start + length:
            break

    return "".join(result)

# **7️⃣ Add the Transcript column**
transliteration_df["Transcript"] = ""

for idx, row in transliteration_df.iterrows():
    key = (row["Sura"], row["Aya"])
    if key not in transcript_dict:
        continue

    # Hyphens are dropped from the transliteration before matching
    transliteration = row["Transliteration"].replace("-","")
    transliteration_df.at[idx, "Transcript"] = find_best_match(
        transliteration, transcript_dict[key]
    )

# **8️⃣ Save the updated table**
output_path = "quran_transliteration_with_transcript_fixed.csv"
transliteration_df.to_csv(output_path, index=False, sep=";", encoding="utf-8-sig")

print(f"✅ Kolom 'Transcript' telah diperbaiki dan disimpan ke {output_path}")


✅ Kolom 'Transcript' telah diperbaiki dan disimpan ke quran_transliteration_with_transcript_fixed.csv
