Library

In [1]:
import csv
import re
import warnings
from collections import Counter

import pandas as pd
from jaro import jaro_winkler_metric

Data Cleaning and Preprocessing

In [2]:
# Load the classified chat log and keep only the rows whose category is 'report'.
df = pd.read_csv("assets/classifiedchat_context.csv")
cat_report = df.loc[df['Category'].eq('report')]

def _separate_digits(message):
    """Put a space between letters and digits and collapse runs of whitespace.

    Non-string values (e.g. NaN for a missing message) are returned unchanged.
    """
    if not isinstance(message, str):
        return message
    # A digit run preceded by any non-digit gets a leading space ("kej1" -> "kej 1").
    spaced = re.sub(r'(?<=\D)(\d+)', r' \1', message)
    # A letter directly after a digit gets a leading space ("2a" -> "2 a").
    spaced = re.sub(r'(?<=\d)([a-zA-Z])', r' \1', spaced)
    return ' '.join(spaced.split())


def preprocessing_report(report, emojis_path='assets/emojis.csv'):
    """Normalise raw report messages into lowercase, symbol-free text.

    Parameters
    ----------
    report : pandas.DataFrame
        Frame holding the raw chat text in a ``'Message'`` column.
    emojis_path : str, optional
        CSV file whose ``'emo'`` column lists the emojis to strip.

    Returns
    -------
    pandas.Series
        The cleaned messages, carrying the same index as ``report``.

    Notes
    -----
    The words 'selesai' and 'done' are removed as plain substrings, so they
    are also removed when they occur inside a longer word.
    """
    # Clean the frame that was passed in rather than a notebook-level global,
    # so the function works on whatever data the caller supplies.
    messages = report['Message'].str.lower()

    # regex=False keeps every pattern literal on all pandas versions: read as
    # regular expressions, '.' matches any character and '*' is invalid.
    for char in ('_', '`', '.', ',', '*'):
        messages = messages.str.replace(char, '', regex=False)
    messages = messages.str.strip()

    # Remove all emojis listed in the lookup file (blank cells are ignored).
    for emoji in pd.read_csv(emojis_path)['emo'].dropna().tolist():
        messages = messages.str.replace(str(emoji), '', regex=False)

    # Remove completion markers that carry no reference information.
    for word in ('selesai', 'done'):
        messages = messages.str.replace(word, '', regex=False)

    # Split letter/digit runs apart and normalise whitespace, element by element.
    return messages.map(_separate_digits)

# Clean the report rows and persist the result for the next cell.
report = preprocessing_report(report=cat_report)
report.to_csv('assets/report.csv', index=False)

In [3]:
def fix_bookname_typo(report, booknames):
    """Correct a misspelled book name at the start of a report and after its first hyphen.

    Parameters
    ----------
    report : str
        One cleaned report message. Non-string values (e.g. NaN read back
        from an empty CSV cell) are returned unchanged.
    booknames : pandas.DataFrame
        Table of known book names, passed through to ``fix_bookname``.

    Returns
    -------
    str
        The corrected first word, a single space, then the rest of the
        message with the word after the first hyphen corrected as well.
        Whitespace between that hyphen and the word is dropped.
    """
    if not isinstance(report, str):
        return report

    # Split off the leading word; everything after it is handled separately.
    first_word_match = re.match(r'^\w+', report)
    if first_word_match:
        first_word = first_word_match.group(0)
        rest_of_report = report[first_word_match.end():].lstrip()
    else:
        first_word = ''
        rest_of_report = report.lstrip()

    match = re.search(r'-(\s*\b[a-zA-Z]+\b)', rest_of_report)
    if match:
        fixed_after_hyphen = fix_bookname(match.group(1).strip(), booknames)
        # Splice by position: a plain str.replace would rewrite the first
        # occurrence of the same text, which may sit before the hyphen.
        rest_of_report = (
            rest_of_report[:match.start(1)]
            + fixed_after_hyphen
            + rest_of_report[match.end(1):]
        )

    return fix_bookname(first_word, booknames) + " " + rest_of_report.strip()

def fix_bookname(bookname_input, booknames, threshold=0.75):
    """Return the known book name most similar to ``bookname_input``.

    Parameters
    ----------
    bookname_input : str
        The possibly misspelled name.
    booknames : pandas.DataFrame
        Table whose ``'Kitab'`` column lists the known book names.
    threshold : float, optional
        Jaro-Winkler score a candidate must strictly exceed to be accepted.

    Returns
    -------
    str
        The highest-scoring candidate above ``threshold`` (the earliest one
        on ties), or ``bookname_input`` unchanged when no candidate qualifies.
    """
    best_score = threshold
    best_name = bookname_input

    # Missing cells are not names, so they are left out of the comparison.
    for candidate in booknames["Kitab"].dropna().tolist():
        score = jaro_winkler_metric(bookname_input, candidate)
        if score > best_score:
            best_score, best_name = score, candidate

    return best_name

# Repair book-name typos in every cleaned report message.
report2 = pd.read_csv('assets/report.csv')
booknames = pd.read_csv("assets/book_names.csv")

# Assign the whole column in one step. Writing through `frame[col][i] = ...`
# is chained assignment: pandas warns about it and, with Copy-on-Write
# enabled, the frame is left unmodified.
report2['Message'] = report2['Message'].apply(
    lambda message: fix_bookname_typo(message, booknames)
)

report2.to_csv('assets/report2.csv', index=False)

In [4]:
# Keep only the scripture references (book name plus chapter numbers/ranges)
# found in each message.
book_list = pd.read_csv('assets/book_names.csv')
book_list = book_list['Kitab'].dropna().astype(str).tolist()

# Construct a regex pattern to match any of the names in the list along with numbers and '-'.
# Names are escaped so they match literally, and tried longest-first so that a
# name which is the beginning of a longer name cannot shadow it.
alternatives = '|'.join(
    re.escape(name) for name in sorted(book_list, key=len, reverse=True)
)
pattern = r'(?:\b(?:' + alternatives + r')\b(?:\s*\d*-*\d*\s*)*)'

# Messages
report3 = pd.read_csv('assets/report2.csv')
processed_message = []

for text in report3['Message']:
    # Skip empty messages. Blank CSV cells are read back as NaN (a float),
    # so check the type before calling string methods.
    if not isinstance(text, str) or not text.strip():
        continue

    # Find all matches for the pattern in the text
    matches = re.findall(pattern, text)

    # Join the matches and collapse repeated whitespace
    output = ' '.join(' '.join(matches).split())

    # Remove space before or after hyphen
    output = re.sub(r'\s*-\s*', r'-', output)

    processed_message.append(output)

dict_msg = {'processed': processed_message}

# Messages without any recognised reference are kept as empty strings.
# NOTE: this rebinds the notebook-level name `df`.
df = pd.DataFrame(dict_msg)
df.to_csv('assets/cleaned_messages.csv', index=False)

In [5]:
# Build the abbreviation -> full book name lookup by pairing the two CSV
# columns row by row (pairing stops at the end of the shorter column).
singkatan_df = pd.read_csv('assets/singkatan.csv')
kitab_df = pd.read_csv('assets/biblebooknames.csv')
singkatan_dict = {
    abbreviation: full_name
    for abbreviation, full_name in zip(singkatan_df['Singkatan'], kitab_df['Kitab'])
}

# Read cleaned_messages.csv
cleaned_messages_df = pd.read_csv('assets/cleaned_messages.csv')

# Replace book-name abbreviations with their full names.
def ganti_singkatan(text, mapping=None):
    """Expand every abbreviated book name in ``text``.

    Parameters
    ----------
    text : str
        A whitespace-separated message. Non-string values (NaN from an empty
        CSV cell) are returned unchanged.
    mapping : dict, optional
        Abbreviation -> full name lookup. Defaults to the notebook-level
        ``singkatan_dict``.

    Returns
    -------
    str
        The message with single spaces between tokens and every known
        abbreviation replaced, including each side of a hyphenated token.
    """
    if not isinstance(text, str):
        return text
    if mapping is None:
        mapping = singkatan_dict

    def expand(token):
        # A token that is itself a key wins over its hyphen-separated parts.
        if token in mapping:
            return mapping[token]
        # Expand each side independently, so "kej-3" is handled as well as
        # "kej-kel" and "1-kel".
        return '-'.join(mapping.get(part, part) for part in token.split('-'))

    return ' '.join(expand(word) for word in text.split())

# Apply the expansion to the 'processed' column and save the result to cleaned_messages2.csv
cleaned_messages_df = cleaned_messages_df.assign(
    processed=cleaned_messages_df['processed'].apply(ganti_singkatan)
)
cleaned_messages_df.to_csv('assets/cleaned_messages2.csv', index=False)

In [6]:
def expand_chapter_range(input_str):
    """Rewrite ``'<book> <a>-<b>'`` as ``'<book> <a> - <book> <b>'``.

    The value is converted with ``str()`` and split on whitespace. With two
    tokens the first is the book and the second the chapter spec; with three
    or more, the first two tokens form the book and the third is the spec.

    Parameters
    ----------
    input_str : object
        Usually a message string; NaN and other values are accepted.

    Returns
    -------
    object
        The expanded string when the chapter spec holds two numeric
        endpoints, otherwise ``input_str`` unchanged.
    """
    parts = str(input_str).split()
    if len(parts) < 2:
        return input_str
    if len(parts) > 2:
        book = parts[0] + " " + parts[1]
        chapter_spec = parts[2]
    else:
        book, chapter_spec = parts

    # Drop empty pieces so a doubled or dangling hyphen ("1--3", "1-") does
    # not yield an empty endpoint.
    chapters = [piece for piece in chapter_spec.split('-') if piece]

    # Only expand genuine numeric ranges; a spec such as "1-keluaran" would
    # otherwise become "<book> 1 - <book> keluaran".
    if len(chapters) < 2 or not (chapters[0].isdigit() and chapters[1].isdigit()):
        return input_str

    return f"{book} {chapters[0]} - {book} {chapters[1]}"

# Expand "<book> a-b" ranges in every processed message.
data = pd.read_csv("assets/cleaned_messages2.csv")

# Replace the column in one assignment. Element-wise `frame[col][i] = ...` is
# chained assignment: pandas warns about it and, with Copy-on-Write enabled,
# the frame is left unmodified.
data['processed'] = data['processed'].apply(expand_chapter_range)

data.to_csv('assets/cleaned_messages3.csv', index=False)

Parsing

In [9]:
# Read the ordered list of events from a CSV file.
def read_events(file_name):
    """Return the lowercased first column of ``file_name`` as a list.

    Parameters
    ----------
    file_name : str
        Path of a CSV file; only the first field of each row is used.
        Every non-blank row is included, so a header row (if the file has
        one) is returned as an event too.

    Returns
    -------
    list of str
        One lowercased entry per non-blank row, in file order.
    """
    with open(file_name, newline='') as csvfile:
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of failing on row[0].
        return [row[0].lower() for row in csv.reader(csvfile) if row]

# Convert each message line into an explicit, comma-separated list of events.
def parse_messages(file_name, events):
    """Expand the references on every line of ``file_name`` into event names.

    A reference is ``<word> <number>``, optionally followed by
    ``- <word> <number>`` to denote a range. Single references are looked up
    in ``events``; ranges are expanded to every event from the start to the
    end position (inclusive). A range whose end precedes its start expands to
    nothing.

    Parameters
    ----------
    file_name : str
        Text file read line by line; every line, including a header line,
        produces exactly one output line.
    events : list of str
        Ordered, lowercased event names.

    Returns
    -------
    list of str
        One newline-terminated string per input line, holding the
        capitalised events joined by ``', '`` (empty when nothing matched).

    Warns
    -----
    UserWarning
        When a reference is not present in ``events``; that reference is
        skipped and parsing continues.
    """
    # First position of every event: constant-time lookups that agree with
    # list.index() when an event occurs more than once.
    position = {}
    for index, event in enumerate(events):
        position.setdefault(event, index)

    with open(file_name, 'r') as file:
        lines = file.readlines()

    parsed_messages = []
    for line in lines:
        expanded = []
        matches = re.findall(r'(\w+\s\d+)(?:\s*-\s*(\w+\s\d+))?', line)
        for start_event, end_event in matches:
            start_index = position.get(start_event.lower())
            # Without a range part, the reference covers a single event.
            end_index = position.get(end_event.lower()) if end_event else start_index
            if start_index is None or end_index is None:
                # One unknown reference should not abort the whole run.
                warnings.warn(
                    f"Unknown event in {line.strip()!r}; reference skipped",
                    stacklevel=2,
                )
                continue
            expanded.extend(
                events[i].capitalize() for i in range(start_index, end_index + 1)
            )
        parsed_messages.append(', '.join(expanded) + "\n")

    return parsed_messages

# Save the parsing result to a file.
def save_parsed_messages(parsed_messages, output_file):
    """Write every entry of ``parsed_messages`` to ``output_file`` as given.

    No separator is added between entries, so each entry is expected to carry
    its own trailing newline. An existing file is overwritten.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(parsed_messages)

# Example usage: expand every cleaned message into its explicit event list.
file_name = 'assets/biblechapters.csv'
output_file = 'assets/parsed_messages.csv'

events = read_events(file_name)
parsed_messages = parse_messages(file_name='assets/cleaned_messages3.csv', events=events)
save_parsed_messages(parsed_messages=parsed_messages, output_file=output_file)
print("Pesan telah berhasil diparsing dan disimpan ke dalam file:", output_file)

Pesan telah berhasil diparsing dan disimpan ke dalam file: assets/parsed_messages.csv
