In [1]:
import re
import csv
import pandas as pd
from unidecode import unidecode
from jaro import jaro_winkler_metric
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore")

Data Cleaning

In [2]:
def remove_colon_number(text):
    pattern = r"\s*:\s*\d+"
    return re.sub(pattern, "", text)

In [3]:
def remove_diacritics(text):
    cleaned_text = unidecode(' '.join(text.split()))
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s-]', '', cleaned_text)
    return cleaned_text

In [4]:
def add_space_between_number_and_alphabet(text):
    pattern = r'(\d)([a-zA-Z])|([a-zA-Z])(\d)'
    result = re.sub(pattern, r'\1\3 \2\4', text)
    return result

In [36]:
def fix_numbers(text):
    text = text.strip()
    pattern = r'\b0+(\d+)\b'
    result = re.sub(pattern, lambda x: x.group(1), text)
    return result

In [37]:
def remove_single_alphabets(text):
    # Use regular expression to find single alphabets
    pattern = r'\b[a-zA-Z]\b'
    # Replace single alphabets with empty string
    return re.sub(pattern, '', text)

Fix Typo

In [38]:
def jaro_function(bookname_input, booknames):
    max_score = 0.75
    current_bookname = bookname_input

    for bookname in booknames:
        similarity_score = jaro_winkler_metric(bookname_input, bookname)
        if similarity_score > max_score:
            max_score = similarity_score
            current_bookname = bookname

    return current_bookname

def fix_typo(text, app_dictionary):
    text = text.replace('--', '-').replace('-', ' - ').replace(',', ' ').replace('I ', '1 ').replace('II ', '2 ').replace('III ', '3 ').replace('  ', ' ')
    text = text.lower().replace('hakim2', 'hakim').replace('raja2', 'raja')
    text = remove_colon_number(text)
    text = remove_diacritics(text)
    text = add_space_between_number_and_alphabet(text)
    text = fix_numbers(text)
    text = remove_single_alphabets(text)
    text_list = text.split(" ")
    result = list()

    for word in text_list:
        current = jaro_function(word, app_dictionary)
        if current != word:
            pass
        else:
            if current in app_dictionary:
                pass
            else:
                for word2 in app_dictionary:
                    if word2 in word:
                        current = word2
                        break
                    else:
                        continue
        result.append(current)
    
    return ' '.join(result)

In [39]:
df = pd.read_csv("assets/classifiedchat_context.csv")
df = df[df['Category'] == 'report']

my_dictionary = pd.read_csv("assets/app_dictionary.csv")['Kamus'].tolist()
df['Preprocessing'] = df['Message'].apply(lambda x: fix_typo(x, my_dictionary))
df['Preprocessing'].to_csv('phase/preprocessing1.csv', index=False)

Another Cleaning

In [40]:
def remove_space_in_bookname(text, to_be_changed):
    text = text.lower()
    for before, after in to_be_changed:
        text = text.replace(before, after)
    return text

to_be_changed = [('hakim-hakim', 'hakimhakim'), ('hakim - hakim', 'hakimhakim'),
                 ('1 samuel', '1samuel'), ('1 sam', '1sam'), 
                 ('2 samuel', '2samuel'), ('2 sam', '2sam'), 
                 ('1 raja-raja', '1rajaraja'), ('1 raja - raja', '1rajaraja'), ('1raja - raja', '1rajaraja'), ('1raja-raja', '1rajaraja'), ('1 raj', '1raj'), ('1 raja', '1raja'), 
                 ('2 raja-raja', '2rajaraja'), ('2 raja - raja', '2rajaraja'), ('2raja - raja', '2rajaraja'), ('2raja-raja', '2rajaraja'), ('2 raj', '2raj'), ('2 raja', '2raja'), 
                 ('1 tawarikh', '1tawarikh'), ('1 taw', '1taw'),('2 tawarikh', '2tawarikh'), ('2 taw', '2taw'),
                 ('kidung agung', 'kidungagung'), ('kisah para rasul', 'kisahpararasul'), ('kisah rasul', 'kisahpararasul'),
                 ('1 korintus', '1korintus'), ('1 kor', '1kor'),('2 korintus', '2korintus'), ('2 kor', '2kor'),
                 ('1 tesalonika', '1tesalonika'), ('1 tes', '1tes'), ('2 tesalonika', '2tesalonika'), ('2 tes', '2tes'),
                 ('1 timotius', '1timotius'), ('1 tim', '1tim'),('2 timotius', '2timotius'), ('2 tim', '2tim'),
                 ('1 petrus', '1petrus'), ('1 pet', '1pet'), ('1 ptr', '1ptr'),
                 ('2 petrus', '2petrus'), ('2 pet', '2pet'), ('2 ptr', '2ptr'),
                 ('3 petrus', '3petrus'), ('3 pet', '3pet'), ('3 ptr', '3ptr'), 
                 ('1 yohanes', '1yohanes'), ('1 yoh', '1yoh'),
                 ('2 yohanes', '2yohanes'), ('2 yoh', '2yoh'),
                 ('3 yohanes', '3yohanes'), ('3 yoh', '3yoh')]

df['Preprocessing2'] = df['Preprocessing'].apply(lambda x: remove_space_in_bookname(x, to_be_changed))
df['Preprocessing2'].to_csv('phase/preprocessing2.csv', index=False)

Take only booknames and numbers

In [41]:
def clean_report(text, booknames):
    
    text_list = text.split()
    new_value = list()

    for word in text_list:
        if word in booknames or word.isdigit() or word == '-':
            new_value.append(word)
    
    for i, word in enumerate(new_value):
        if word == 'kisah':
            if i + 1 < len(new_value) and new_value[i+1] in booknames:
                new_value[i] = ''

    return ' '.join(new_value).strip().replace(' 0 ', ' ')

booknames = pd.read_csv('assets/booknames.csv')['Kitab'].tolist()

df['Preprocessing3'] = df['Preprocessing2'].apply(lambda x: clean_report(x, booknames))
df['Preprocessing3'].to_csv('phase/preprocessing3.csv', index=False)

Change Abbreviation

In [42]:
def change_abbreviation(text, abbreviation, booknames):
    text_list = text.split()

    for i, word in enumerate(text_list):
        if word in abbreviation:
            new_word = booknames[abbreviation.index(word)]
            text_list[i] = new_word
        else:
            continue
    result = ' '.join(text_list).strip()
    return result.replace(' - ', '-')

singkatan_df = pd.read_csv('assets/singkatan.csv')['Singkatan'].tolist()
kitab_df = pd.read_csv('assets/booknames_nospace.csv')['Kitab'].tolist()

df['Preprocessing4'] = df['Preprocessing3'].apply(lambda x: change_abbreviation(x, singkatan_df, kitab_df))
# df['Preprocessing4'].to_csv('phase/preprocessing4.csv', index=False)


In [43]:
def remove_duplicates(input_string):
    words = input_string.split()
    unique_words = []
    for word in words:
        if word not in unique_words:
            unique_words.append(word)
    return ' '.join(unique_words)

df['Preprocessing4'] = df['Preprocessing4'].apply(remove_duplicates)
df['Preprocessing4'].to_csv('phase/preprocessing4.csv', index=False)

In [44]:
def expand_chapter_range(input_str):
    parts = str(input_str).split()
    if len(parts) > 2:
        book = parts[0] + " " + parts[1]
        chapters = parts[2].split('-')
    elif len(parts) < 2:
        return input_str
    else:
        book = parts[0]
        chapters = parts[1].split('-')

    if len(chapters) == 1:
        return input_str

    start_chapter = chapters[0]
    end_chapter = chapters[1]
    output = f"{book} {start_chapter} - {book} {end_chapter}"
    return output

processed3 = list()
for i in range(len(df)):
    new_value = expand_chapter_range(df['Preprocessing4'][i:i+1].values[0])
    new_value = re.sub(r'(\d+)-(\w+)', r'\1 - \2', new_value)
    processed3.append(new_value)
    
df['Processed 5'] = processed3

In [45]:
def rearrange_string(s):
    match = re.match(r'(\d+)-(\d+)(\w+)', s)
    if match:
        return match.group(1) + match.group(3) + '-' + match.group(2) + match.group(3)
    else:
        return s
    
df['Processed 5'] = df['Processed 5'].apply(rearrange_string)

In [46]:
df['Processed 5'].to_csv('phase/preprocessing5.csv', index=False)

Parsing

In [47]:
num_of_chapters = {
    'kejadian': 50,
    'keluaran': 40,
    'imamat': 27,
    'bilangan': 36,
    'ulangan': 34,
    'yosua': 24,
    'hakimhakim': 21,
    'rut': 4,
    '1samuel': 31,
    '2samuel': 24,
    '1rajaraja': 22, 
    '2rajaraja': 25,
    '1tawarikh': 29,
    '2tawarikh': 36,
    'ezra': 10,
    'nehemia': 13,
    'ester': 10,
    'ayub': 42,
    'mazmur': 150,
    'amsal': 31,
    'pengkhotbah': 12,
    'kidungagung': 8,
    'yesaya': 66,
    'yeremia': 52,
    'ratapan': 5,
    'yehezkiel': 48,
    'daniel': 12,
    'hosea': 14,
    'yoel': 3,
    'amos': 9,
    'obaja': 1,
    'yunus': 4,
    'mikha': 7, 
    'nahum': 3,
    'habakuk': 3,
    'zefanya': 3,
    'hagai': 2,
    'zakharia': 14,
    'maleakhi': 4, 
    'matius': 28,
    'markus': 16, 
    'lukas': 24,
    'yohanes': 21,
    'kisahpararasul': 28,
    'roma': 16,
    '1korintus': 16,
    '2korintus': 13,
    'galatia': 6,
    'efesus': 6,
    'filipi': 4,
    'kolose': 4,
    '1tesalonika': 5,
    '2tesalonika': 3,
    '1timotius': 6,
    '2timotius': 4, 
    'titus': 3,
    'filemon': 1,
    'ibrani': 13,
    'yakobus': 5,
    '1petrus': 5,
    '2petrus': 3,
    '1yohanes': 5,
    '2yohanes': 1,
    '3yohanes': 1,
    'yudas': 1, 
    'wahyu':22
}

In [48]:
def find_string_containing_word(word, string_list):
    for string in string_list:
        if word in string:
            return string
    return None  # If no string containing the word is found

def find_index_of_string_containing_word(word, string_list):
    for index, string in enumerate(string_list):
        if word in string:
            return index
    return -1  # If no string containing the word is found

In [49]:
def count_numbers(strings):
    count = 0
    for s in strings:
        if s.isdigit():
            count += 1
    return count

In [50]:
def parse_messages(text, chapters, booknames, num_of_chapters):
    parsed_text = ""
   
    if len(text.split()) == 1:
        if all(book in booknames for book in text.split('-')):
            parsed_line = ''
            parsed_books = text.split('-')
            for book in parsed_books:
                for chapter in chapters:
                    if chapter.startswith(book):
                        parsed_line += f"{chapter}, "
            parsed_text = parsed_line[:-2] 
        else:
            parsed_line = ''
            for chapter in chapters:
                if text in chapter:
                    parsed_line += f"{chapter}, "
            parsed_text = parsed_line[:-2] 
        
    else:
        if '-' in text:
            if text.split('-')[0].lstrip().rstrip() in booknames:
                temp_text = text.replace('-', ' - ')
                temp = temp_text.split()
                temp.insert(1, '1')
                text = " ".join(temp)
                if len(text.split('-')) == 2 and text.split('-')[1].lstrip().rstrip() in booknames:
                    temp2 = text.split()
                    temp2.insert(-1, str(num_of_chapters[temp2[-1]]))
                    text = " ".join(temp2)

        else: 
            temp = text.split()
            if count_numbers(temp) == 0:
                text = temp[0] + ' 1 - ' + temp[-1] + ' ' + str(num_of_chapters[temp[-1]])

        matches = re.findall(r'(\w+\s\d+)(?:\s*-\s*(\w+\s\d+))?', text)
        parsed_line = ''
        for match in matches:
            start_event, end_event = match[0], match[1]

            start_index = chapters.index(start_event.lower()) if start_event.lower() in chapters else None
            end_index = chapters.index(end_event.lower()) if end_event and end_event.lower() in chapters else start_index
            if start_index is not None and end_index is not None:
                for i in range(start_index, end_index + 1):
                    parsed_line += f"{chapters[i]}, "
        parsed_text = parsed_line[:-2]

    return parsed_text


In [51]:
file_name = 'assets/biblechapters_nospace.csv'
bible_chapters = pd.read_csv(file_name)['Chapters'].tolist()


booknames_nospace = pd.read_csv('assets/booknames_nospace.csv')
booknames_nospace = booknames_nospace['Kitab'].tolist()

df['Parsed'] = df['Processed 5'].apply(lambda x: parse_messages(x, bible_chapters, booknames_nospace, num_of_chapters))
df['Parsed'].to_csv('phase/preprocessing6.csv', index=False)

In [52]:
def fix_parsing_spaces(text):
    return text.replace('hakimhakim', 'hakim-hakim').replace('1samuel', '1 samuel').replace('2samuel', '2 samuel').replace('1rajaraja', '1 raja-raja').replace('2rajaraja', '2 raja-raja').replace('1tawarikh', '1 tawarikh').replace('2tawarikh', '2 tawarikh').replace('kidungagung', 'kidung agung').replace('kisahpararasul', 'kisah para rasul').replace('1korintus', '1 korintus').replace('2korintus', '2 korintus').replace('1tesalonika', '1 tesalonika').replace('2tesalonika', '2 tesalonika').replace('1timotius', '1 timotius').replace('2timotius', '2 timotius').replace('1petrus', '1 petrus').replace('2petrus', '2 petrus').replace('3petrus', '3 petrus').replace('1yohanes', '1 yohanes').replace('2yohanes', '2 yohanes').replace('3yohanes', '3 yohanes')

fix_parsing_spaces('hakimhakim 1, hakimhakim 2, hakimhakim 3')

'hakim-hakim 1, hakim-hakim 2, hakim-hakim 3'

In [53]:
df['Parsed 2'] = df['Parsed']
for i, line in enumerate(df['Parsed 2']):
    new_value = df['Parsed 2'][i:i+1].str.replace('hakimhakim', 'hakim-hakim').str.replace('1samuel', '1 samuel').str.replace('2samuel', '2 samuel').str.replace('1rajaraja', '1 raja-raja').str.replace('2rajaraja', '2 raja-raja').str.replace('1tawarikh', '1 tawarikh').str.replace('2tawarikh', '2 tawarikh').str.replace('kidungagung', 'kidung agung').str.replace('kisahpararasul', 'kisah para rasul').str.replace('1korintus', '1 korintus').str.replace('2korintus', '2 korintus').str.replace('1tesalonika', '1 tesalonika').str.replace('2tesalonika', '2 tesalonika').str.replace('1timotius', '1 timotius').str.replace('2timotius', '2 timotius').str.replace('1petrus', '1 petrus').str.replace('2petrus', '2 petrus').str.replace('3petrus', '3 petrus').str.replace('1yohanes', '1 yohanes').str.replace('2yohanes', '2 yohanes').str.replace('3yohanes', '3 yohanes')
    df['Parsed 2'][i:i+1] = new_value
    
df['Parsed 2'].to_csv('phase/preprocessingfinal.csv', index=False)

Accuracy

In [54]:
python_parsed = pd.read_csv('phase/preprocessingfinal.csv')
manual_parsed = pd.read_csv('assets/classifiedchat_context_new.csv')
manual_parsed = manual_parsed[manual_parsed['Category'] == 'report']

In [55]:
compared_df = pd.DataFrame()
compared_df['python_parsed'] = python_parsed
compared_df['manual_parsed'] = manual_parsed['Parse Text'].to_list()
compared_df.head()


Unnamed: 0,python_parsed,manual_parsed
0,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
1,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
2,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
3,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"
4,"kejadian 1, kejadian 2","kejadian 1, kejadian 2"


In [56]:
compared_df = pd.DataFrame()
compared_df['python_parsed'] = python_parsed
compared_df['manual_parsed'] = manual_parsed['Parse Text'].to_list()

unmatched_df = compared_df[compared_df['python_parsed'] != compared_df['manual_parsed']]
csv_string = unmatched_df.to_csv('compared.csv', index=False)


In [57]:
result = 0
for i in range(len(python_parsed)):
    if compared_df['python_parsed'][i] == compared_df['manual_parsed'][i]:
        result += 1

print(f'Accuracy: {result/16381*100}')

Accuracy: 99.18808375557047


In [32]:
import re

def simplify_ranges(text, ):
    list_text = text.split()
    print(list_text)

simplify_ranges('2sam 5 - 6')

['2sam', '5', '-', '6']
