#### Libraries

In [2]:
import re
import csv
import pandas as pd
from collections import Counter
from jaro import jaro_winkler_metric
from unidecode import unidecode

#### Data Cleaning and Preprocessing

##### Clean unwanted characters (,,, Kejadian 1 - 2 ,,, --> Kejadian 1 - 2)

In [3]:
# Keep only the chat messages whose category is 'report'.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so later column assignments on `df` do not raise pandas'
# SettingWithCopyWarning.
df = pd.read_csv("assets/classifiedchat_context.csv")
df = df[df['Category'] == 'report'].copy()

def remove_colon_number(text):
    """Delete every '<spaces>:<spaces><digits>' fragment from ``text``.

    A colon that is not followed (after optional whitespace) by at least one
    digit is left untouched.
    """
    colon_number = re.compile(r"\s*:\s*\d+")
    return colon_number.sub("", text)

def format_numbers(report):
    """Collapse the chapter ranges of a colon-style report into one range.

    A report is only rewritten when it contains a ':' and at least one
    "<number>-<number>" pair. The result is "<first word> <min>-<max>", where
    min/max are taken over every number of every pair found.

    Args:
        report: The report text.

    Returns:
        The reformatted string, or ``report`` unchanged when it has no ':',
        no number pair, or does not start with a word character.
    """
    if ":" not in report:
        return report

    # Match each "a-b" pair on its own. A single pattern with a repeated
    # capture group would only keep the last repetition and silently drop
    # the pairs in the middle of a "1-2, 3-4, 5-6" list.
    pairs = re.findall(r"(\d+)\s*-\s*(\d+)", report)
    if not pairs:
        return report

    # The book name is the leading word; without one there is nothing
    # sensible to build, so leave the report as it is instead of crashing.
    book_match = re.search(r'^\w+', report)
    if book_match is None:
        return report

    numbers = [int(number) for pair in pairs for number in pair]
    return f"{book_match.group()} {min(numbers)}-{max(numbers)}"

def clean_text(text):
    """Rewrite '<word> <a>, <b>-<c> [trailing words]' as '<word> <a>-<c>'.

    The middle number and any whitespace-separated words that directly follow
    the range are dropped; text that does not match is returned as is.
    """
    return re.sub(
        r'(\w+) (\d+), (\d+)-(\d+)(?:\s\w+)*',
        r'\1 \2-\4',
        text,
        flags=re.IGNORECASE,
    )

def preprocessing_report(report, emojis_path='assets/emojis.csv'):
    """Normalise report messages before range formatting.

    Steps, in order: lowercase; remove ``_ ` . , * "`` and turn ``--`` into
    ``-``; strip surrounding whitespace; remove every emoji listed in the
    'emo' column of ``emojis_path``; remove the words 'selesai' and 'done';
    put a space between letters and digit runs; collapse repeated whitespace.

    Args:
        report: A DataFrame with a 'Preprocessing' column, or the Series of
            messages itself.
        emojis_path: CSV file with an 'emo' column listing strings to remove.

    Returns:
        A new Series of cleaned messages with the same index as the input.
    """
    # Use the data that was passed in rather than a module-level frame.
    messages = report['Preprocessing'] if isinstance(report, pd.DataFrame) else report

    messages = messages.str.lower()

    # regex=False: every pattern here is a literal. Without it, characters
    # such as '.', '*' or ')' can be interpreted as regex syntax depending on
    # the pandas version.
    literal_replacements = (
        ('_', ''), ('`', ''), ('.', ''), ('--', '-'), (',', ''), ('*', ''), ('"', ''),
    )
    for old, new in literal_replacements:
        messages = messages.str.replace(old, new, regex=False)
    messages = messages.str.strip()

    emojis_list = pd.read_csv(emojis_path)['emo'].dropna().astype(str).tolist()
    for emoji in emojis_list:
        messages = messages.str.replace(emoji, '', regex=False)

    for word in ('selesai', 'done'):
        messages = messages.str.replace(word, '', regex=False)

    def _separate_digits(message):
        # Missing values cannot be regex-processed; pass them through.
        if not isinstance(message, str):
            return message
        spaced = re.sub(r'(?<=\D)(\d+)', r' \1', message)     # "kej1" -> "kej 1"
        spaced = re.sub(r'(?<=\d)([a-zA-Z])', r' \1', spaced)  # "1kej" -> "1 kej"
        return ' '.join(spaced.split())

    return messages.apply(_separate_digits)

# Start from the raw message with ": <number>" fragments removed
# (clean_text is currently not part of this chain).
df['Preprocessing'] = df['Message'].apply(remove_colon_number)

# Normalise the text, then collapse colon-style chapter lists into one range.
report = preprocessing_report(df)
df['Cleaned Report'] = report.apply(format_numbers)

##### Remove Diacritic Characters (ķel 2-3 --> kel 2-3)

In [3]:
def remove_and_replace_diacritics(text):
    """Transliterate ``text`` with unidecode, then keep only ASCII letters,
    digits, whitespace and hyphens."""
    transliterated = unidecode(text)
    return re.sub(r'[^a-zA-Z0-9\s-]', '', transliterated)

# Apply remove_and_replace_diacritics to every cleaned report, in place of the old column.
df['Cleaned Report'] = df['Cleaned Report'].apply(remove_and_replace_diacritics)

In [4]:
# Write the 'Cleaned Report' column to clean_report.csv without the index.
df['Cleaned Report'].to_csv('clean_report.csv', index=False)

##### Fix Typos in Book Names (krl 2-3 --> kel 2-3)

In [10]:
# This function corrects book-name typos within a report.
def fix_bookname_typo(report, booknames):
    """Correct the book name at the start of ``report`` and after a hyphen.

    The first word of the report, and the first alphabetic word that directly
    follows a hyphen in the remainder, are each passed through
    ``fix_bookname``. The result is re-assembled and hyphen spacing is
    normalised with ``fix_spaces``.

    Args:
        report: The report text.
        booknames: Object passed through to ``fix_bookname``.

    Returns:
        The corrected report string.
    """
    # Split the report into its leading word and everything after it.
    first_word_match = re.match(r'^\w+', report)
    if first_word_match:
        first_word = first_word_match.group(0)
        rest_of_report = report[first_word_match.end():].lstrip()
    else:
        first_word = ''  # Default value if no word is found
        rest_of_report = report.lstrip()

    # Look for a word following a hyphen in the remaining report.
    match = re.search(r'-(\s*\b[a-zA-Z]+\b|\b[a-zA-Z]+\b)', rest_of_report)
    if match:
        fixed_after_hyphen = fix_bookname(match.group(1).strip(), booknames)
        # Splice at the matched span. A plain str.replace(..., 1) would rewrite
        # the first textual occurrence, which can be an identical word that
        # appears *before* the hyphen.
        rest_of_report = (
            rest_of_report[:match.start(1)]
            + fixed_after_hyphen
            + rest_of_report[match.end(1):]
        )

    # Fix the leading word and return the re-assembled report.
    return fix_spaces(fix_bookname(first_word, booknames) + " " + rest_of_report.strip())

# This function finds the closest match to a given book name from a list of book names.
def fix_bookname(bookname_input, booknames, threshold=0.75):
    """Return the known book name most similar to ``bookname_input``.

    Similarity is measured with ``jaro_winkler_metric``. A candidate is only
    accepted when its score reaches ``threshold``; among accepted candidates
    the highest score wins, and on a tie the later entry in the list wins.

    Args:
        bookname_input: The (possibly misspelled) book name.
        booknames: Object whose "Kitab" column lists the valid book names.
        threshold: Minimum similarity score for a candidate to replace the
            input. Defaults to 0.75.

    Returns:
        The best-matching book name, or ``bookname_input`` unchanged when it
        is empty or no candidate reaches the threshold.
    """
    # Nothing to compare against; avoid scoring an empty string.
    if not bookname_input:
        return bookname_input

    best_score = threshold
    best_match = bookname_input

    for candidate in booknames["Kitab"].tolist():
        score = jaro_winkler_metric(bookname_input, candidate)
        if score >= best_score:
            best_score = score
            best_match = candidate

    return best_match

def fix_spaces(text):
    """Rewrite every hyphen, together with any whitespace around it, as ' - '."""
    hyphen_with_optional_spaces = re.compile(r'\s*-\s*')
    return hyphen_with_optional_spaces.sub(' - ', text)

booknames = pd.read_csv("assets/book_names.csv")

# Correct book-name typos in every cleaned report (converted to str first)
# and collapse repeated whitespace in each result.
fix_typo = [
    ' '.join(fix_bookname_typo(str(cleaned_report), booknames).split())
    for cleaned_report in df['Cleaned Report']
]

df['Fix Book Name'] = fix_typo

In [11]:
# Write the 'Fix Book Name' column to fix_bookname.csv without the index.
df['Fix Book Name'].to_csv('fix_bookname.csv', index=False)

##### Keep Only the Book Name and Chapter (bu saya sudah sampai kejadian 35 --> kejadian 35)

In [12]:
book_list = pd.read_csv('assets/book_names.csv')
book_list = book_list['Kitab'].tolist()

# Regex matching any known book name followed by optional numbers and hyphens.
# Each name is escaped so that punctuation inside a name is matched literally
# instead of being interpreted as regex syntax.
pattern = r'(?:\b(?:' + '|'.join(re.escape(name) for name in book_list) + r')\b(?:\s*\d*-*\d*\s*)*)'

processed_message = []

for fixed_report in df['Fix Book Name']:
    # NOTE(review): rows without any book-name match keep this " end" suffix
    # in the output below; confirm whether that is intended.
    text = fixed_report + " end"

    # Find every "<book name> <numbers/hyphens>" occurrence in the text.
    matches = re.findall(pattern, text)

    if matches:
        # Join the matches and collapse repeated whitespace.
        output = ' '.join(' '.join(matches).split())

        # Remove spaces before or after hyphens.
        output = re.sub(r'\s*-\s*', r'-', output)
    else:
        output = text

    # Exactly one entry per row, so the list always lines up with `df`.
    processed_message.append(output)

df['Processed'] = processed_message

In [13]:
# Write the 'Processed' column to coba1.csv without the index.
df['Processed'].to_csv('coba1.csv', index=False)

##### Expand Abbreviations (Kel 2 - 3 --> Keluaran 2 - 3)

In [14]:
singkatan_df = pd.read_csv('assets/singkatan.csv')
kitab_df = pd.read_csv('assets/biblebooknames.csv')

# Pair the n-th abbreviation with the n-th book name.
# NOTE(review): this relies on both files listing their rows in the same order.
singkatan_dict = {
    abbreviation: full_name
    for abbreviation, full_name in zip(singkatan_df['Singkatan'], kitab_df['Kitab'])
}

def ganti_singkatan(text):
    """Replace book-name abbreviations in ``text`` with their full names.

    The text is split on whitespace and scanned left to right. Every lookup
    uses the lowercased token against the module-level ``singkatan_dict``, so
    matching is case-insensitive only if that dictionary's keys are lowercase.
    Scanning stops at the first token that contains a digit and was not
    handled by one of the earlier branches.

    Args:
        text: The report text; non-string values are returned unchanged.

    Returns:
        The words re-joined with single spaces, with abbreviations replaced.
    """
    if isinstance(text, str):  # Non-strings are passed through untouched
        words = text.split()
        i = 0
        while i < len(words):
            word = words[i]
            if '-' in word:
                # Hyphenated token: expand each hyphen-separated part on its own.
                parts = word.split('-')
                updated_parts = []
                for part in parts:
                    if part.lower() in singkatan_dict:  # Lowercased lookup
                        updated_parts.append(singkatan_dict[part.lower()])
                    else:
                        updated_parts.append(part)
                words[i] = '-'.join(updated_parts)
            elif word.lower() in singkatan_dict:  # Single-word abbreviation
                words[i] = singkatan_dict[word.lower()]
            elif i < len(words) - 1 and (word + ' ' + words[i+1]).lower() in singkatan_dict:  # Two-word abbreviation
                words[i] = singkatan_dict[(word + ' ' + words[i+1]).lower()]
                del words[i+1]  # The next word was consumed as part of the abbreviation
            elif re.match(r'.*\d', word):  # Token contains a digit
                break  # Stop replacing; the remaining words are kept as they are
            elif i > 0 and words[i-1].lower() in singkatan_dict:  # Previous word is (still) a dictionary key
                words[i-1] = singkatan_dict[words[i-1].lower()]
            i += 1
        return ' '.join(words)
    else:
        return text

# Expand abbreviations in every processed report.
processed_2 = [ganti_singkatan(processed_text) for processed_text in df['Processed']]

df['Processed 2'] = processed_2

In [15]:
# Replace each '--' with '-' and write the column to coba2.csv without the index.
df['Processed 2'] = df['Processed 2'].str.replace('--', '-')
df['Processed 2'].to_csv('coba2.csv', index=False)

In [16]:
def expand_chapter_range(input_str):
    """Turn '<book> <start>-<end>' into '<book> <start> - <book> <end>'.

    The input is converted to ``str`` and split on whitespace:
      * fewer than two tokens: the input is returned unchanged;
      * exactly two tokens: the first is the book, the second the chapters;
      * more than two tokens: the first two form the book name, the third
        holds the chapters, and any further tokens are ignored.

    If the chapter token contains no hyphen the input is returned unchanged.
    Only the first two hyphen-separated values of the chapter token are used.
    """
    tokens = str(input_str).split()
    token_count = len(tokens)

    if token_count < 2:
        return input_str

    if token_count == 2:
        book, chapter_token = tokens
    else:
        book = f"{tokens[0]} {tokens[1]}"
        chapter_token = tokens[2]

    bounds = chapter_token.split('-')
    if len(bounds) == 1:
        # A single chapter has nothing to expand.
        return input_str

    return f"{book} {bounds[0]} - {book} {bounds[1]}"


# Expand the chapter range of every row.
processed3 = [expand_chapter_range(value) for value in df['Processed 2']]

df['Processed 3'] = processed3

In [17]:
# Write the 'Processed 3' column to coba3.csv without the index.
df['Processed 3'].to_csv('coba3.csv', index=False)

#### Parsing

In [18]:
# Read the event names (first column of each row) from a CSV file.
def read_events(file_name):
    """Return the lowercased first column of every non-empty row of a CSV file.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list of strings in file order.
    """
    with open(file_name, newline='') as csvfile:
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of failing on row[0].
        return [row[0].lower() for row in csv.reader(csvfile) if row]

# Expand every "<start> - <end>" pair of a message into the events between them.
def parse_messages(df, events):
    """Add a 'Parsed' column listing every event covered by each message.

    For each value of ``df['Processed 3']``, every "<word> <number> - <word>
    <number>" pair is looked up (lowercased) in ``events``. When both ends are
    found, all events from the start position to the end position inclusive
    are emitted. The emitted events of a row are joined with ", ".

    Args:
        df: DataFrame with a 'Processed 3' column; modified in place.
        events: Ordered list of event names, expected in lowercase.

    Returns:
        The same ``df`` object with the new 'Parsed' column. Rows that are not
        strings, or that yield no resolvable pair, get an empty string.
    """
    # Position of the first occurrence of each event, which is the position
    # list.index would return, without rescanning the list for every lookup.
    first_position = {}
    for position, event in enumerate(events):
        first_position.setdefault(event, position)

    parsed_messages = []

    for line in df['Processed 3']:
        # Missing or non-text values cannot be searched; record an empty result.
        if not isinstance(line, str):
            parsed_messages.append('')
            continue

        expanded = []
        for start_event, end_event in re.findall(r'(\w+\s\d+)\s*-\s*(\w+\s\d+)', line):
            start_index = first_position.get(start_event.lower())
            end_index = first_position.get(end_event.lower())
            if start_index is not None and end_index is not None:
                # Empty when the start lies after the end.
                expanded.extend(events[start_index:end_index + 1])

        parsed_messages.append(', '.join(expanded))

    df['Parsed'] = parsed_messages

    return df

# Save the parsing results to a file.
def save_parsed_messages(parsed_messages, output_file):
    """Write each parsed message to ``output_file`` on its own line.

    Args:
        parsed_messages: Iterable of strings to write.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, 'w') as file:
        for parsed_line in parsed_messages:
            # Terminate every record; otherwise all messages run together
            # into one unreadable line.
            file.write(parsed_line + '\n')

# Example usage: load the event list and add the 'Parsed' column to df.
file_name = 'assets/biblechapters.csv'

events = read_events(file_name)
df = parse_messages(df, events)

In [25]:
# Write the 'Parsed' column to final.csv without the index.
df['Parsed'].to_csv("final.csv", index=False)