Library

In [1]:
import re
import pandas as pd
from collections import Counter
from jaro import jaro_winkler_metric

Data Cleaning and Preprocessing

In [22]:
# take only report chat
df = pd.read_csv("assets/classifiedchat_context.csv")
cat_report = df[df['Category'] == 'report']

def preprocessing_report(report):
    # lowercase all messages
    report = cat_report['Message'].str.lower()

    # remove unwanted characters
    report = report.str.replace('_', '').str.replace('`', '').str.replace('.', '').str.replace(',', '').str.replace('*', '').str.lstrip().str.rstrip()

    # remove all emojis
    emojis = pd.read_csv('assets/emojis.csv')
    emojis_list = emojis['emo'].tolist()

    for emoji in emojis_list:
        report = report.str.replace(emoji, '')

    # remove unwanted words
    word_list = ['selesai', 'done']
    for word in word_list:
        report = report.str.replace(word, '')

    # remove unknown characters
    for i, message in enumerate(report):
        modified_text = re.sub(r'(?<=\D)(\d+)', r' \1', message)
        modified_text = re.sub(r'(?<=\d)([a-zA-Z])', r' \1', modified_text)
        report[i:i+1] = ' '.join(modified_text.split())
    
    return report

report = preprocessing_report(cat_report)
report.to_csv('report.csv', index=False)

In [23]:
def fix_bookname_typo(report, booknames):
    split_report = report.split(' ', 1)
    bookname_input = split_report[0]
    booknames_list = booknames["Kitab"].to_list()

    max_score = 0
    current_bookname = ""

    for bookname in booknames_list:
        similarity_score = jaro_winkler_metric(bookname_input, bookname)
        if similarity_score > max_score:
            max_score = similarity_score
            current_bookname = bookname
        else:
            continue
    if len(split_report) > 1:
        result = current_bookname + " " + split_report[1]
    else:
        result = current_bookname
    
    return result

report2 = pd.read_csv('report.csv')
booknames = pd.read_csv("assets/book_names.csv")

for i in range(len(report)):
    report2['Message'][i] = fix_bookname_typo(report2['Message'][i], booknames)

report2.to_csv('report2.csv', index=False)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  report2['Message'][i] = fix_bookname_typo(report2['Message'][i], booknames)


In [4]:
book_list = pd.read_csv('assets/book_names.csv')
book_list = book_list['Kitab'].tolist()

# Construct a regex pattern to match any of the names in the list along with numbers and '-'
pattern = r'(?:\b(?:' + '|'.join(book_list) + r')\b(?:\s*\d*-*\d*\s*)*)'

# Messages
processed_message = []
for text in report:
    # Skip empty strings
    if not text.strip():
        continue
    
    # Find all matches for the pattern in the text
    matches = re.findall(pattern, text)

    # Construct the final output by joining the matches
    output = ' '.join(matches)
    # Remove double spaces
    output = ' '.join(output.split())

    # Remove space before or after hyphen
    output = re.sub(r'\s*-\s*', r'-', output)

    processed_message.append(output)
    
dict_msg = {'processed': processed_message}

# Create DataFrame excluding empty strings
df = pd.DataFrame(dict_msg)

# Drop rows with empty strings
# df = df[df['processed'] != ""]
df.to_csv('assets/cleaned_messages.csv', index=False)

In [4]:
singkatan_df = pd.read_csv('assets/singkatan.csv')
kitab_df = pd.read_csv('assets/biblebooknames.csv')
singkatan_dict = dict(zip(singkatan_df['Singkatan'], kitab_df['Kitab']))

# Baca file cleaned_messages.csv
cleaned_messages_df = pd.read_csv('assets/cleaned_messages.csv')

# Fungsi untuk mengganti singkatan dengan nama lengkap
def ganti_singkatan(text):
    words = text.split()
    for i, word in enumerate(words):
        if word in singkatan_dict:
            words[i] = singkatan_dict[word]
    return ' '.join(words)

# Terapkan fungsi pada kolom 'processed' dan simpan kembali ke cleaned_messages.csv
cleaned_messages_df['processed'] = cleaned_messages_df['processed'].apply(ganti_singkatan)
cleaned_messages_df.to_csv('assets/cleaned_messages.csv', index=False)
