In [1]:
import re
import csv
import pandas as pd
from collections import Counter
from jaro import jaro_winkler_metric
from unidecode import unidecode

#### Read Data from CSV

In [2]:
# Load the classified chat export and keep only the rows whose Category is 'report'.
df = pd.read_csv("assets/classifiedchat_context.csv")
df = df.loc[df['Category'].eq('report')]

#### Data Cleaning and Preprocessing

In [3]:
# Drop verse references: "Amsal 28:2" becomes "Amsal 28".
def remove_colon_number(text):
    """Return *text* with every colon-plus-digits run removed.

    Whitespace directly before the colon and between the colon and the digits
    is removed together with them, so ``"Amsal 28 : 2"`` also becomes
    ``"Amsal 28"``. A colon that is not followed by digits is left untouched.
    """
    verse_suffix = re.compile(r"\s*:\s*\d+")
    return verse_suffix.sub("", text)
 
# Stage 1: strip the ":<verse>" part from every raw message.
df['Preprocessing1'] = df['Message'].map(remove_colon_number)

In [4]:
# Collapse "<word> <number>, <start>-<end>" into "<word> <number>-<end>".
def clean_text(text):
    """Merge a number followed by a comma and a range into a single range.

    ``"Amsal 28, 29-30"`` becomes ``"Amsal 28-30"``. Any whitespace-separated
    words that directly follow the range are part of the match and are
    therefore dropped. Text that does not contain the pattern is returned
    unchanged.
    """
    comma_range = re.compile(r'(\w+) (\d+), (\d+)-(\d+)(?:\s\w+)*', flags=re.IGNORECASE)
    return comma_range.sub(r'\1 \2-\4', text)

# Stage 2: fold "<number>, <start>-<end>" into a single range.
df['Preprocessing2'] = df['Preprocessing1'].map(clean_text)

In [5]:
# Emoji lists already loaded, keyed by CSV path, so the file is parsed once
# instead of once per message when the cleaner is applied to a whole column.
_EMOJI_CACHE = {}


def _load_emojis(path='assets/emojis.csv'):
    """Return the values of the ``emo`` column of the CSV at *path*, reading it only once."""
    if path not in _EMOJI_CACHE:
        _EMOJI_CACHE[path] = pd.read_csv(path)['emo'].tolist()
    return _EMOJI_CACHE[path]


# Remove unwanted characters
def remove_unwanted_char(text):
    """Normalise one message to lowercase letters, digits, spaces and hyphens.

    Steps, in order:
      1. lowercase the text;
      2. delete ``_``, backtick, ``.``, ``,``, ``*`` and ``"``, turn ``--``
         into ``-``, and trim surrounding whitespace;
      3. delete every emoji listed in the ``emo`` column of
         ``assets/emojis.csv``;
      4. delete the substrings ``selesai`` and ``done`` wherever they occur
         (plain substring removal, not whole-word matching);
      5. insert a space before each digit run that follows a non-digit and
         between a digit and the letter right after it;
      6. collapse whitespace, pass the text through ``unidecode`` and delete
         every character that is not an ASCII letter, digit, whitespace or
         hyphen.

    Args:
        text: the message to clean (a ``str``).

    Returns:
        The cleaned message.

    Raises:
        Whatever ``pandas.read_csv`` raises if the emoji file cannot be read
        on the first call.
    """
    # lowercase all messages
    text = text.lower()

    # remove unwanted punctuation; the replacement order is significant
    # because removing '.' can create a '--' that must then collapse to '-'
    for old, new in (('_', ''), ('`', ''), ('.', ''), ('--', '-'), (',', ''), ('*', ''), ('"', '')):
        text = text.replace(old, new)
    text = text.strip()

    # remove all emojis (the list is loaded once and cached)
    for emoji in _load_emojis():
        text = text.replace(emoji, '')

    # remove unwanted words
    for word in ('selesai', 'done'):
        text = text.replace(word, '')

    # separate letters from digits: "amsal28" -> "amsal 28", "28a" -> "28 a"
    text = re.sub(r'(?<=\D)(\d+)', r' \1', text)
    text = re.sub(r'(?<=\d)([a-zA-Z])', r' \1', text)

    # collapse whitespace and remove diacritics, then drop any leftover symbol
    text = unidecode(' '.join(text.split()))
    return re.sub(r'[^a-zA-Z0-9\s-]', '', text)

# Stage 3: lowercase each message and remove punctuation, emojis and filler words.
df['Preprocessing3'] = df['Preprocessing2'].map(remove_unwanted_char)

In [9]:
# # Format a series of number ranges (e.g. "Amsal 28-29; 30-31; 32-33" becomes "Amsal 28-33")
# def format_numbers(report):
#     pattern = r"(\d+)\s*-\s*(\d+)(?:\s*[,;]\s*(\d+)\s*-\s*(\d+))*\s*(?:done)?"

#     # Using re.findall() to find all patterns in the report
#     matches = re.findall(pattern, report, re.IGNORECASE)

#     # If a pattern is found
#     if matches:
#         # Initializing a list to store all numbers found
#         all_numbers = []

#         # Collecting all numbers from each found pattern
#         for match in matches:
#             for num in match:
#                 if num:
#                     all_numbers.append(int(num))

#         # Sorting numbers and removing duplicates
#         all_numbers = sorted(set(all_numbers))

#         # Creating a range of numbers
#         min_num = all_numbers[0]
#         max_num = all_numbers[-1]

#         # Taking the book name from the report
#         book_name = re.search(r'^\w+', report).group()

#         # Generating the result string in the desired format
#         result = f"{book_name} {min_num}-{max_num}"
#         return result
#     else:
#         return report

# df['Preprocessing'] = df['Preprocessing'].apply(format_numbers)

In [11]:
def format_numbers(report):
    """Collapse the number ranges found in *report* into ``"<book> <min>-<max>"``.

    Every ``<a>-<b>`` range in the report is collected (optionally preceded by
    a word, optionally followed by ``, c-d`` / ``; c-d`` continuations and a
    trailing ``done``). The smallest and largest numbers seen become the
    resulting range. Numbers from all matches are pooled, so ranges belonging
    to different book names in one report are merged as well.

    The book name is the word captured directly in front of a range (the last
    such word when several ranges carry one). If no range is preceded by a
    word, the first word of the report is used; if the report does not start
    with a word either, only the range is returned.

    Args:
        report: the cleaned report text (a ``str``).

    Returns:
        ``"<book> <min>-<max>"`` (or ``"<min>-<max>"`` when no book name can
        be determined), or *report* unchanged when it contains no range.
    """
    # Group 1 is the optional word (plus trailing whitespace) in front of a
    # range; groups 2-5 are numbers. A repeated ", c-d" tail only keeps its
    # last repetition in groups 4 and 5.
    pattern = r"(\b\w+\b\s+)?(\d+)\s*-\s*(\d+)(?:\s*[,;]\s*(\d+)\s*-\s*(\d+))*\s*(?:done)?"

    # Using re.findall() to find all patterns in the report
    matches = re.findall(pattern, report, re.IGNORECASE)
    if not matches:
        return report

    # Collecting all numbers from each found pattern (groups 2 and 3 are
    # mandatory, so this list is never empty here)
    all_numbers = [int(num) for match in matches for num in match[1:] if num]
    min_num, max_num = min(all_numbers), max(all_numbers)

    # Pick the book name explicitly instead of relying on whichever match the
    # collection loop happened to end on; strip the whitespace the capture
    # group carries so the result has single spacing.
    captured_names = [match[0].strip() for match in matches if match[0]]
    if captured_names:
        book_name = captured_names[-1]
    else:
        # Fall back to the first word of the report; it may be absent when the
        # report starts with whitespace or a hyphen.
        first_word = re.search(r'^\w+', report)
        book_name = first_word.group() if first_word else ''

    # Generating the result string in the desired format
    return f"{book_name} {min_num}-{max_num}".strip()

# Stage 4: merge the number ranges of each report into one "<book> <min>-<max>" string.
df['Preprocessing4'] = df['Preprocessing3'].map(format_numbers)

In [12]:
# Snapshot of the fully preprocessed reports (single column, no index).
df.loc[:, 'Preprocessing4'].to_csv('1Preprocessing.csv', index=False)

In [13]:
# This function corrects the typo in a given book name within a report.
def fix_bookname_typo(report, booknames):
    """Correct misspelt book names at the start of *report* and after a hyphen.

    The first word of the report and the first alphabetic word that follows a
    hyphen in the remainder are each passed to ``fix_bookname``. The pieces
    are then rejoined, hyphens are spaced via ``fix_spaces`` and runs of
    whitespace are collapsed.

    Args:
        report: the preprocessed report text (a ``str``).
        booknames: the table of known book names, forwarded to
            ``fix_bookname``.

    Returns:
        The report with the corrected book names and normalised spacing.
    """
    # Extracting the first word of the report ('' if it does not start with one)
    first_word_match = re.match(r'^\w+', report)
    first_word = first_word_match.group(0) if first_word_match else ''

    # Removing the first word from the report
    rest_of_report = re.sub(r'^\w+', '', report).lstrip()

    # Searching for a word following a hyphen in the remaining report
    match = re.search(r'-(\s*\b[a-zA-Z]+\b|\b[a-zA-Z]+\b)', rest_of_report)
    if match:
        # Group 1 may carry leading whitespace; only the word itself is fixed
        fixed_after_hyphen = fix_bookname(match.group(1).strip(), booknames)
        # Splice at the matched position. A textual str.replace() would rewrite
        # the first occurrence of the same characters, which can be an earlier
        # word (or part of one) instead of the word after the hyphen.
        start, end = match.span(1)
        rest_of_report = rest_of_report[:start] + fixed_after_hyphen + rest_of_report[end:]

    # Fixing the typo in the first word of the report and returning the corrected report
    output = fix_spaces(fix_bookname(first_word, booknames) + " " + rest_of_report.strip())
    return ' '.join(output.split())

# Look up the known book name that is most similar to the given one.
def fix_bookname(bookname_input, booknames):
    """Return the entry of ``booknames["Kitab"]`` closest to *bookname_input*.

    Similarity is measured with ``jaro_winkler_metric``. A candidate is only
    accepted when its score is at least 0.75 and at least as high as the best
    score seen so far, so on a tie the later entry wins. If no candidate
    reaches the threshold, *bookname_input* is returned unchanged.
    """
    candidates = booknames["Kitab"].tolist()
    # Lazily pair every candidate with its similarity to the input.
    scored = ((jaro_winkler_metric(bookname_input, name), name) for name in candidates)

    best_score, best_name = 0.75, bookname_input
    for score, name in scored:
        if score >= best_score:
            best_score, best_name = score, name

    return best_name

def fix_spaces(text):
    """Put exactly one space on each side of every hyphen in *text*.

    Any whitespace already adjacent to a hyphen is replaced, so ``"a-b"`` and
    ``"a  -  b"`` both become ``"a - b"``.
    """
    # Split on each hyphen together with its surrounding whitespace, then
    # rejoin the pieces with the normalised separator.
    pieces = re.split(r'\s*-\s*', text)
    return ' - '.join(pieces)

# Table of known book names; despite its name this is a DataFrame, and
# fix_bookname reads its "Kitab" column.
booknames_list = pd.read_csv("assets/book_names.csv")

# Correct misspelt book names in every preprocessed report.
df['Fix Book Name'] = df['Preprocessing4'].apply(fix_bookname_typo, args=(booknames_list,))

In [21]:
# Snapshot of the reports after book-name correction (single column, no index).
df.loc[:, 'Fix Book Name'].to_csv('2Fix_Book_Name.csv', index=False)