In [18]:
import re
import csv
import pandas as pd
from unidecode import unidecode
from jaro import jaro_winkler_metric
from nltk.tokenize import word_tokenize

Data Cleaning

In [19]:
def remove_colon_number(text):
    """Strip every ``:<digits>`` suffix (e.g. a verse number) from ``text``.

    Whitespace on either side of the colon is removed together with the
    colon and the digits that follow it.
    """
    return re.sub(r"\s*:\s*\d+", "", text)

In [20]:
def remove_diacritics(text):
    """Transliterate ``text`` to ASCII and drop punctuation.

    Runs of whitespace are first collapsed to single spaces (which also trims
    both ends), the result is passed through ``unidecode``, and finally every
    character that is not an ASCII letter, a digit, whitespace or ``-`` is
    removed.
    """
    collapsed = ' '.join(text.split())
    ascii_text = unidecode(collapsed)
    return re.sub(r'[^a-zA-Z0-9\s-]', '', ascii_text)

In [21]:
def add_space_between_number_and_alphabet(text):
    """Insert a space at every digit/letter boundary in ``text``.

    Args:
        text: The string to process.

    Returns:
        ``text`` with a single space inserted wherever an ASCII letter is
        directly followed by a digit or a digit is directly followed by an
        ASCII letter, e.g. ``"kej1kel2"`` -> ``"kej 1 kel 2"``.
    """
    # Zero-width lookarounds are used instead of consuming the two characters
    # around the boundary: a consuming pattern cannot see a boundary whose
    # left-hand character was already used by the previous match, so inputs
    # such as "1a2" or "kej1kel2" would keep one boundary unsplit.
    pattern = r'(?<=\d)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=\d)'
    return re.sub(pattern, ' ', text)

In [None]:
def fix_numbers(text):
    """Trim ``text`` and drop leading zeros from standalone numbers.

    Surrounding whitespace is stripped from both ends. Any run of digits that
    is delimited by word boundaries and starts with one or more zeros is then
    rewritten without those zeros; at least one digit is always kept, so
    ``"00"`` becomes ``"0"``.
    """
    # \1 is the part of the number that follows the leading zeros.
    return re.sub(r'\b0+(\d+)\b', r'\1', text.strip())

Fix Typo

In [22]:
def jaro_function(bookname_input, booknames):
    """Return the entry of ``booknames`` most similar to ``bookname_input``.

    Similarity is measured with ``jaro_winkler_metric``. A candidate is only
    accepted when its score is strictly greater than 0.75 and strictly greater
    than the best score seen so far, so on ties the earliest candidate wins.
    When no candidate clears the threshold, ``bookname_input`` is returned
    unchanged.
    """
    best_match, best_score = bookname_input, 0.75
    for candidate in booknames:
        score = jaro_winkler_metric(bookname_input, candidate)
        if score > best_score:
            best_match, best_score = candidate, score
    return best_match

def fix_typo(text, app_dictionary):
    """Normalise ``text`` and map each word onto ``app_dictionary``.

    The text is lower-cased and passed through ``remove_colon_number``,
    ``remove_diacritics``, ``add_space_between_number_and_alphabet`` and
    ``fix_numbers``, then split on single spaces. Each word is replaced by:

    1. the fuzzy match returned by ``jaro_function`` when it differs from the
       word; otherwise
    2. the word itself when it is a dictionary entry; otherwise
    3. the first dictionary entry that occurs as a substring of the word, or
       the word itself when there is none.

    Returns the resulting words joined with single spaces.
    """
    cleaned = fix_numbers(
        add_space_between_number_and_alphabet(
            remove_diacritics(remove_colon_number(text.lower()))
        )
    )

    corrected = []
    for token in cleaned.split(" "):
        candidate = jaro_function(token, app_dictionary)
        # Only fall back to the substring search when fuzzy matching left the
        # token untouched and the token is not already a dictionary entry.
        if candidate == token and candidate not in app_dictionary:
            candidate = next(
                (entry for entry in app_dictionary if entry in token),
                candidate,
            )
        corrected.append(candidate)

    return ' '.join(corrected)

In [23]:
# Load the classified chat messages and keep only the rows labelled 'report'.
# The explicit .copy() makes df an independent frame, so adding columns to it
# below does not assign into a slice of the frame returned by read_csv (which
# makes pandas emit SettingWithCopyWarning).
df = pd.read_csv("assets/classifiedchat_context.csv")
df = df[df['Category'] == 'report'].copy()

# Vocabulary used by fix_typo, then the typo-corrected text of every message.
my_dictionary = pd.read_csv("assets/app_dictionary.csv")['Kamus'].tolist()
df['Preprocessing'] = df['Message'].apply(lambda x: fix_typo(x, my_dictionary))

In [24]:
# Write the 'Preprocessing' column to Preprocessing1.csv, without the index.
df['Preprocessing'].to_csv('Preprocessing1.csv', index=False)

Extracting Book References

In [25]:
# Load the book-name table from CSV
cases_df = pd.read_csv('assets/book_names.csv')

# Values of the 'Kitab' column as a plain list; it is passed to
# preprocessing_text below as the set of names to search for
cases = cases_df['Kitab'].tolist()

def preprocessing_text(report, bible_names):
    """Extract the book/chapter references contained in ``report``.

    Args:
        report: Message text to search.
        bible_names: Iterable of book names. They are inserted verbatim (not
            regex-escaped) as alternatives into the search pattern. Entries
            that are not non-empty strings are ignored.

    Returns:
        Every match joined with ``'-'``, or ``''`` when nothing matches or no
        usable name was supplied. A match is a book name optionally followed
        by a chapter or chapter range (``"kej 1-3"``), optionally followed by
        a second ``<book> <chapter[-chapter]>`` and a trailing ``-<number>``.
    """
    # Lower-case and drop whitespace around hyphens so "1 - 3" reads "1-3".
    text = re.sub(r'\s*-\s*', '-', report.lower())

    # Use the names handed in by the caller rather than a module-level list.
    # An empty alternation would match the empty string at every word
    # boundary, so bail out early when there is nothing to search for.
    names = [name for name in bible_names if isinstance(name, str) and name]
    if not names:
        return ''

    book = '(?:' + '|'.join(names) + ')'
    chapters = r'\d+(?:-\d+)?'
    pattern = (
        r'\b' + book
        + r'(?:\s+' + chapters + r')?'
        + r'(?:\s+' + book + r'\s+' + chapters + r')?'
        + r'(?:-\d+)?'
    )

    return '-'.join(re.findall(pattern, text))

# Run preprocessing_text on every value of 'Preprocessing' and store the
# result in a new 'Preprocessing2' column.
df['Preprocessing2'] = df['Preprocessing'].apply(lambda x: preprocessing_text(x, cases))


In [26]:
# Write the 'Preprocessing2' column to Preprocessing2.csv, without the index.
df['Preprocessing2'].to_csv('Preprocessing2.csv', index=False)

Expanding Abbreviations and Chapter Ranges

In [27]:
# Build the abbreviation -> book-name lookup. Row k of the 'Singkatan' column
# in singkatan.csv is paired with row k of the 'Kitab' column in
# booknames_nospace.csv; zip() stops at the shorter of the two columns.
singkatan_df = pd.read_csv('assets/singkatan.csv')
kitab_df = pd.read_csv('assets/booknames_nospace.csv')
# Keys are lower-cased because ganti_singkatan always looks words up through
# .lower(); a key containing upper-case characters could otherwise never match.
# Non-string keys (e.g. missing values) are kept as they are.
singkatan_dict = {
    (singkatan.lower() if isinstance(singkatan, str) else singkatan): kitab
    for singkatan, kitab in zip(singkatan_df['Singkatan'], kitab_df['Kitab'])
}

def ganti_singkatan(text):
    """Replace abbreviations in ``text`` with their ``singkatan_dict`` values.

    ("ganti singkatan" is Indonesian for "replace abbreviation".)

    The text is split on whitespace and scanned left to right. Lookups use the
    lower-cased word against the module-level ``singkatan_dict``. For each word
    the first matching rule applies:

    1. A word containing ``-`` has each hyphen-separated part looked up
       individually and is re-joined with ``-``.
    2. A word that is a key is replaced by its value.
    3. A word that forms a key together with the next word (joined by one
       space) is replaced by that value and the next word is removed.
    4. Any other word containing a digit ends the scan; the remaining words
       are left as they are.
    5. Otherwise, if the previous word is a key, the previous word is replaced
       by its value.

    Returns the words joined with single spaces. A non-string ``text`` is
    returned unchanged.
    """
    if isinstance(text, str):  # Check if text is a string
        words = text.split()
        i = 0
        # Index-based loop because rule 3 deletes from ``words`` while scanning.
        while i < len(words):
            word = words[i]
            if '-' in word:
                parts = word.split('-')
                updated_parts = []
                for part in parts:
                    if part.lower() in singkatan_dict:  # Check lowercase for case-insensitivity
                        updated_parts.append(singkatan_dict[part.lower()])
                    else:
                        updated_parts.append(part)
                words[i] = '-'.join(updated_parts)
            elif word.lower() in singkatan_dict:  # Check lowercase for case-insensitivity
                words[i] = singkatan_dict[word.lower()]
            elif i < len(words) - 1 and (word + ' ' + words[i+1]).lower() in singkatan_dict:  # Check for multi-word abbreviations
                words[i] = singkatan_dict[(word + ' ' + words[i+1]).lower()]
                del words[i+1]  # Remove the next word as it's part of the abbreviation
            elif re.match(r'.*\d', word):  # Check if the word contains a digit
                break  # If a word with a digit is encountered, stop replacing
            # NOTE(review): the previous word has already been through the
            # branches above, so this looks like it can only fire when a
            # replacement value is itself a key - confirm the intent.
            elif i > 0 and words[i-1].lower() in singkatan_dict:  # Check if the previous word is an abbreviation
                words[i-1] = singkatan_dict[words[i-1].lower()]
            i += 1
        return ' '.join(words)
    else:
        return text

# Expand the abbreviations of every extracted reference and write the result
# back into the 'Preprocessing2' column (same row order).
processed_2 = [ganti_singkatan(text) for text in df['Preprocessing2']]

df['Preprocessing2'] = processed_2

In [28]:
# Write the 'Preprocessing2' column (now with abbreviations replaced) to
# Preprocessing3.csv, without the index.
df['Preprocessing2'].to_csv('Preprocessing3.csv', index=False)

In [29]:
def expand_chapter_range(input_str):
    """Rewrite ``"<book> <a>-<b>"`` as ``"<book> <a> - <book> <b>"``.

    The input is converted with ``str`` and split on whitespace:

    * fewer than two words: ``input_str`` is returned as given;
    * exactly two words: the first is the book, the second the chapter part;
    * more than two words: the first two words form the book, the third is
      the chapter part, and any further words are ignored.

    When the chapter part contains no ``-``, ``input_str`` is returned as
    given. Otherwise only the first two hyphen-separated pieces are used.
    """
    tokens = str(input_str).split()
    if len(tokens) < 2:
        return input_str

    if len(tokens) == 2:
        book, chapter_spec = tokens
    else:
        book = " ".join(tokens[:2])
        chapter_spec = tokens[2]

    bounds = chapter_spec.split('-')
    if len(bounds) == 1:
        # A single chapter: nothing to expand.
        return input_str

    first, last = bounds[0], bounds[1]
    return f"{book} {first} - {book} {last}"


# Expand the chapter range of every reference, then put spaces around any
# remaining "<digits>-<word>" hyphen so separate references read "a 1 - b 2".
processed3 = [
    re.sub(r'(\d+)-(\w+)', r'\1 - \2', expand_chapter_range(value))
    for value in df['Preprocessing2'].values
]

df['Processed 3'] = processed3

In [30]:
# Write the 'Processed 3' column to Preprocessing4.csv, without the index.
df['Processed 3'].to_csv('Preprocessing4.csv', index=False)

Parsing

In [31]:
def read_events(file_name):
    """Read the first column of a CSV file as a list of lower-cased strings.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        One entry per non-empty row, in file order: the row's first field,
        lower-cased. Every non-empty row is included, so a header row (if the
        file has one) is returned as well.
    """
    with open(file_name, newline='') as csvfile:
        # csv.reader yields an empty list for a blank line; indexing it with
        # row[0] would raise IndexError, so such rows are skipped.
        return [row[0].lower() for row in csv.reader(csvfile) if row]

def parse_messages(df, events):
    """Expand the references in ``df['Processed 3']`` into lists of events.

    Each line is searched for ``"<word> <number>"`` references, optionally
    followed by ``"- <word> <number>"`` as the end of a range. A reference is
    looked up (lower-cased) in ``events``; a range contributes every entry of
    ``events`` from the start position to the end position inclusive. When the
    end is missing or unknown only the start entry is used, and references
    whose start is unknown are skipped.

    The entries found for a line are joined with ``", "`` and stored in a new
    ``'Parsed'`` column. ``df`` is modified in place and also returned.
    """
    # Position of the first occurrence of every event, as list.index() gives.
    first_position = {}
    for position, event in enumerate(events):
        first_position.setdefault(event, position)

    reference = re.compile(r'(\w+\s\d+)(?:\s*-\s*(\w+\s\d+))?')

    def expand(line):
        listed = []
        for start_event, end_event in reference.findall(line):
            start_index = first_position.get(start_event.lower())
            if start_index is None:
                continue
            if end_event:
                end_index = first_position.get(end_event.lower(), start_index)
            else:
                end_index = start_index
            # No capitalisation: entries are emitted exactly as stored.
            listed.extend(
                f"{events[i]}" for i in range(start_index, end_index + 1)
            )
        return ', '.join(listed)

    df['Parsed'] = [expand(line) for line in df['Processed 3']]

    return df

def save_parsed_messages(parsed_messages, output_file):
    """Write the parsed messages to ``output_file``, one message per line.

    Args:
        parsed_messages: Iterable of parsed lines.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, 'w') as file:
        # Each message is terminated with a newline; writing them back to back
        # would run all messages together with no way to tell them apart.
        file.writelines(f"{parsed_line}\n" for parsed_line in parsed_messages)

# Example usage: read the event list, then add the 'Parsed' column to df.
# parse_messages expands ranges by position in this list, so its order matters.
file_name = 'assets/biblechapters_nospace.csv'

events = read_events(file_name)
df = parse_messages(df, events)

In [32]:
# Write the 'Parsed' column to Parsed.csv, without the index.
df['Parsed'].to_csv("Parsed.csv", index=False)