In [1]:
import re
import csv
import pandas as pd
from unidecode import unidecode
from jaro import jaro_winkler_metric
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore")

Data Cleaning

In [2]:
def remove_colon_number(text):
    pattern = r"\s*:\s*\d+"
    return re.sub(pattern, "", text)

In [3]:
def remove_diacritics(text):
    cleaned_text = unidecode(' '.join(text.split()))
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s-]', '', cleaned_text)
    return cleaned_text

In [4]:
def add_space_between_number_and_alphabet(text):
    pattern = r'(\d)([a-zA-Z])|([a-zA-Z])(\d)'
    result = re.sub(pattern, r'\1\3 \2\4', text)
    return result

In [5]:
def fix_numbers(text):
    text = text.strip()
    pattern = r'\b0+(\d+)\b'
    result = re.sub(pattern, lambda x: x.group(1), text)
    return result

Fix Typo

In [6]:
def jaro_function(bookname_input, booknames):
    max_score = 0.75
    current_bookname = bookname_input

    for bookname in booknames:
        similarity_score = jaro_winkler_metric(bookname_input, bookname)
        if similarity_score > max_score:
            max_score = similarity_score
            current_bookname = bookname

    return current_bookname

def fix_typo(text, app_dictionary):
    text = text.lower()
    text = remove_colon_number(text)
    text = remove_diacritics(text)
    text = add_space_between_number_and_alphabet(text)
    text = fix_numbers(text)
    text_list = text.split(" ")
    result = list()

    for word in text_list:
        current = jaro_function(word, app_dictionary)
        if current != word:
            pass
        else:
            if current in app_dictionary:
                pass
            else:
                for word2 in app_dictionary:
                    if word2 in word:
                        current = word2
                        break
                    else:
                        continue
        result.append(current)
    
    return ' '.join(result)

In [7]:
df = pd.read_csv("assets/classifiedchat_context.csv")
df = df[df['Category'] == 'report']

my_dictionary = pd.read_csv("assets/app_dictionary.csv")['Kamus'].tolist()
df['Preprocessing'] = df['Message'].apply(lambda x: fix_typo(x, my_dictionary))

In [8]:
df['Preprocessing'].to_csv('phase/preprocessing1.csv', index=False)

Another Cleaning

In [10]:
cases_df = pd.read_csv('assets/booknames.csv')
cases = cases_df['Kitab'].tolist()

def preprocessing_text(report, bible_names):
    text = report.lower()
    text = re.sub(r'\s*-\s*', '-', text)

    tokens = word_tokenize(text)

    pattern = r'\b(?:' + '|'.join(bible_names) + r')(?:\s+\d+(?:-\d+)?)?(?:\s+(?:sam\s+\d+)?(?:pet\s+\d+)?(?:-\d+)?)?(?:-\d+)?'

    matches = re.findall(pattern, text)
    result = ' '.join(matches)

    return result

df['Preprocessing2'] = df['Preprocessing'].apply(lambda x: preprocessing_text(x, cases))

In [11]:
df['Preprocessing2'].to_csv('phase/preprocessing2.csv', index=False)

Cleaning for Edge Cases

In [12]:
def replace_sam(text):
    return re.sub(r'(\d+)\s*sam\s*|sam\s*(\d+)', r'\1sam ', text)

def replace_pet(text):
    return re.sub(r'(\d+)\s*pet\s*|pet\s*(\d+)', r'\1pet ', text)

df['Preprocessing3'] = df['Preprocessing2'].apply(replace_sam).apply(replace_pet)
df['Preprocessing3'].to_csv('phase/preprocessing3.csv', index=False)


Parsing

In [14]:
singkatan_df = pd.read_csv('assets/singkatan.csv')
kitab_df = pd.read_csv('assets/booknames_nospace.csv')
singkatan_dict = dict(zip(singkatan_df['Singkatan'], kitab_df['Kitab']))

def ganti_singkatan(text):
    if isinstance(text, str):
        words = text.split()
        i = 0
        while i < len(words):
            word = words[i]
            if '-' in word:
                parts = word.split('-')
                updated_parts = []
                for part in parts:
                    if '-' in part:
                        book_parts = part.split('-')
                        updated_book_parts = []
                        for bp in book_parts:
                            if bp.lower() in singkatan_dict:
                                updated_book_parts.append(singkatan_dict[bp.lower()])
                            else:
                                updated_book_parts.append(bp)
                        updated_parts.append('-'.join(updated_book_parts))
                    elif part.lower() in singkatan_dict:
                        updated_parts.append(singkatan_dict[part.lower()])
                    else:
                        updated_parts.append(part)
                words[i] = '-'.join(updated_parts)
            elif word.lower() in singkatan_dict:
                words[i] = singkatan_dict[word.lower()]
            elif i < len(words) - 1 and (word + ' ' + words[i+1]).lower() in singkatan_dict:
                words[i] = singkatan_dict[(word + ' ' + words[i+1]).lower()]
                del words[i+1]
            elif re.match(r'.*\d', word):
                break
            elif i > 0 and words[i-1].lower() in singkatan_dict:
                words[i-1] = singkatan_dict[words[i-1].lower()]
            i += 1
        return ' '.join(words)
    else:
        return text
    
processed_2 = list()

for text in df['Preprocessing3']:
    new_value = ganti_singkatan(text)
    processed_2.append(new_value)

df['Preprocessing4'] = processed_2


In [15]:
df['Preprocessing4'].to_csv('phase/preprocessing4.csv', index=False)

In [16]:
def expand_chapter_range(input_str):
    parts = str(input_str).split()
    if len(parts) > 2:
        book = parts[0] + " " + parts[1]
        chapters = parts[2].split('-')
    elif len(parts) < 2:
        return input_str
    else:
        book = parts[0]
        chapters = parts[1].split('-')

    if len(chapters) == 1:
        return input_str

    start_chapter = chapters[0]
    end_chapter = chapters[1]
    output = f"{book} {start_chapter} - {book} {end_chapter}"
    return output

processed3 = list()
for i in range(len(df)):
    new_value = expand_chapter_range(df['Preprocessing4'][i:i+1].values[0])
    new_value = re.sub(r'(\d+)-(\w+)', r'\1 - \2', new_value)
    processed3.append(new_value)
    
df['Processed 5'] = processed3

In [17]:
df['Processed 5'].to_csv('phase/preprocessing5.csv', index=False)

Parsing

In [18]:
def read_events(file_name):
    events = []
    with open(file_name, newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            events.append(row[0].lower()) 
    return events

def parse_messages(df, events, booknames):
    parsed_messages = []

    for line in df['Processed 5']:
        if len(line.split()) == 1:
            if all(book in booknames for book in line.split('-')):
                parsed_line = ''
                parsed_books = line.split('-')
                for book in parsed_books:
                    for chapter in events:
                        if book in chapter:
                            parsed_line += f"{chapter}, "
                parsed_line = parsed_line[:-2] 
                parsed_messages.append(parsed_line)
            else:
                parsed_line = ''
                for chapter in events:
                    if line in chapter:
                        parsed_line += f"{chapter}, "
                parsed_line = parsed_line[:-2] 
                parsed_messages.append(parsed_line)
            
        else:
            matches = re.findall(r'(\w+\s\d+)(?:\s*-\s*(\w+\s\d+))?', line)
            parsed_line = ''
            for match in matches:
                start_event, end_event = match[0], match[1]

                start_index = events.index(start_event.lower()) if start_event.lower() in events else None
                end_index = events.index(end_event.lower()) if end_event and end_event.lower() in events else start_index
                if start_index is not None and end_index is not None:
                    for i in range(start_index, end_index + 1):
                        parsed_line += f"{events[i]}, "
            parsed_line = parsed_line[:-2]
            parsed_messages.append(parsed_line)
    
    df['Parsed'] = parsed_messages

    return df

def save_parsed_messages(parsed_messages, output_file):
    with open(output_file, 'w') as file:
        for parsed_line in parsed_messages:
            file.write(parsed_line)

file_name = 'assets/biblechapters_nospace.csv'

booknames_nospace = pd.read_csv('assets/booknames_nospace.csv')
booknames_nospace = booknames_nospace['Kitab'].tolist()
events = read_events(file_name)
df = parse_messages(df, events, booknames_nospace)

In [19]:
df['Parsed'].to_csv("phase/preprocessing6.csv", index=False)

In [20]:
text = 'titus-filemon-ibrani-yakobus'
parsed_line = ''
parsed_books = text.split('-')
for book in parsed_books:
    for chapter in events:
        if book in chapter:
            parsed_line += f"{chapter}, "
parsed_line = parsed_line[:-2] 
print(parsed_line)

titus 1, titus 2, titus 3, filemon 1, ibrani 1, ibrani 2, ibrani 3, ibrani 4, ibrani 5, ibrani 6, ibrani 7, ibrani 8, ibrani 9, ibrani 10, ibrani 11, ibrani 12, ibrani 13, yakobus 1, yakobus 2, yakobus 3, yakobus 4, yakobus 5


In [21]:
df['Parsed 2'] = df['Parsed']
for i, line in enumerate(df['Parsed 2']):
    new_value = df['Parsed 2'][i:i+1].str.replace('hakimhakim', 'hakim-hakim').str.replace('1samuel', '1 samuel').str.replace('2samuel', '2 samuel').str.replace('1rajaraja', '1 raja-raja').str.replace('2rajaraja', '2 raja-raja').str.replace('1tawarikh', '1 tawarikh').str.replace('2tawarikh', '2 tawarikh').str.replace('kidungagung', 'kidung agung').str.replace('kisahpararasul', 'kisah para rasul').str.replace('1korintus', '1 korintus').str.replace('2korintus', '2 korintus').str.replace('1tesalonika', '1 tesalonika').str.replace('2tesalonika', '2 tesalonika').str.replace('1timotius', '1 timotius').str.replace('2timotius', '2 timotius').str.replace('1petrus', '1 petrus').str.replace('2petrus', '2 petrus').str.replace('3petrus', '3 petrus').str.replace('1yohanes', '1 yohanes').str.replace('2yohanes', '2 yohanes').str.replace('3yohanes', '3 yohanes')
    df['Parsed 2'][i:i+1] = new_value
    
df['Parsed 2'].to_csv('phase/preprocessingfinal.csv', index=False)

Accuracy

In [22]:
python_parsed = pd.read_csv('phase/preprocessingfinal.csv')
manual_parsed = pd.read_csv('assets/classifiedchat_context.csv')
manual_parsed = manual_parsed[manual_parsed['Category'] == 'report']

In [23]:
compared_df = pd.DataFrame()
compared_df['python_parsed'] = python_parsed
compared_df['manual_parsed'] = manual_parsed['Parse Text'].to_list()
compared_df.head()


Unnamed: 0,python_parsed,manual_parsed
0,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
1,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
2,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
3,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
4,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"


In [27]:
compared_df = pd.DataFrame()
compared_df['python_parsed'] = python_parsed
compared_df['manual_parsed'] = manual_parsed['Parse Text'].to_list()

unmatched_df = compared_df[compared_df['python_parsed'] != compared_df['manual_parsed']]
csv_string = unmatched_df.to_csv('compared.csv', index=False)


In [24]:
result = 0
for i in range(len(python_parsed)):
    if compared_df['python_parsed'][i] == compared_df['manual_parsed'][i]:
        result += 1

print(f'Accuracy: {result/16381*100}')

Accuracy: 91.33752518161285
