In [1]:
import os
import re
import pandas as pd
from collections import Counter
from jaro import jaro_winkler_metric

In [2]:
def extract_data(file_path):
    """Parse a chat export file into a DataFrame of individual messages.

    Every line is stripped and matched against the shape
    ``[<timestamp>] <sender>: <message>``. Lines that do not match are
    skipped. Timestamps are parsed as ``%d/%m/%y %H.%M.%S`` and split
    into separate date and time columns.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A DataFrame with the columns ``Sender``, ``Message``, ``Date`` and
        ``Time``, one row per matched line, in file order.
    """
    line_re = re.compile(r'\[(.*?)\] ([^:]+): (.*)')

    with open(file_path, 'r', encoding='utf-8') as handle:
        candidates = [line_re.match(raw.strip()) for raw in handle]

    # Keep only the lines that had the "[timestamp] sender: message" shape.
    records = [hit.groups() for hit in candidates if hit]

    frame = pd.DataFrame(records, columns=['Timestamp', 'Sender', 'Message'])
    stamps = pd.to_datetime(frame['Timestamp'], format='%d/%m/%y %H.%M.%S')
    frame['Date'] = stamps.dt.date
    frame['Time'] = stamps.dt.time

    return frame.drop(columns=['Timestamp'])

def split_by_day(df):
    """Split a message DataFrame into one DataFrame per distinct ``Date``.

    Args:
        df: DataFrame that has a ``Date`` column.

    Returns:
        A list of DataFrames, one for each group produced by
        ``df.groupby('Date')``, in the order the groupby yields them.
    """
    daily_dfs = []
    for _day, day_frame in df.groupby('Date'):
        daily_dfs.append(day_frame)
    return daily_dfs

def main(file_path, output_dir="chat"):
    """Split a chat export into one CSV file per day.

    The export is parsed with ``extract_data``, grouped by date with
    ``split_by_day``, and each daily DataFrame is written to
    ``<output_dir>/<date>.csv`` without the index column.

    Args:
        file_path: Path of the chat export text file.
        output_dir: Directory that receives the per-day CSV files. It is
            created when missing. Defaults to ``"chat"``.
    """
    df = extract_data(file_path)
    daily_dfs = split_by_day(df)

    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

    for daily_df in daily_dfs:
        # Every row of a daily frame shares the same date; use the first.
        chat_date = str(daily_df['Date'].iloc[0])
        file_name = os.path.join(output_dir, f"{chat_date}.csv")
        daily_df.to_csv(file_name, index=False)
        
# Entry point: when this code runs as the main module, split the export
# 'chat.txt' into per-day CSV files via main().
if __name__ == "__main__":
    file_path = 'chat.txt'
    main(file_path)

In [3]:
# Load one day's messages and lower-case the text so identical messages that
# differ only in letter case compare equal.
# Message is read as str with default NA parsing disabled: otherwise a message
# whose text is e.g. "NA", "null" or "None" is read back as NaN, and a day with
# only numeric messages gets a numeric column with no .str accessor.
df = pd.read_csv(
    "chat/2020-08-03.csv",
    dtype={'Message': str},
    keep_default_na=False,
)
df['Message'] = df['Message'].str.lower()
display(df)

Unnamed: 0,Sender,Message,Date,Time
0,Melanie Chandra,kej 1-2 done,2020-08-03,03:48:55
1,~ Lindawati Haryanto,kej 1-2 done,2020-08-03,04:03:51
2,Sherly Cahyadi,kej 1-2 done,2020-08-03,04:08:44
3,~ Seto Ninik,kej 1-2 done,2020-08-03,04:32:19
4,~ 🪸Martha 🍁,kej 1-2 done,2020-08-03,05:45:14
5,~ Dewi Pratiwi,kej 1-2 done,2020-08-03,06:07:57
6,~ Endang Surati,kej 1- 2 selesai.🙏,2020-08-03,06:09:20
7,Dicky Andrian,kej 1-2 done,2020-08-03,06:14:01
8,~ 🎍,kej 1-2 done,2020-08-03,06:14:26
9,"~ dr. Andreas C.N., Sp.B.",kej 1-2 selesai,2020-08-03,06:15:08


In [4]:
# Tally how often each message text occurs, then report the most frequent one.
message_counts = Counter()
message_counts.update(df['Message'])
max_string, max_count = message_counts.most_common(1)[0]
print(max_string)

kej 1-2 done


In [5]:
# Keep only the rows whose message scores above the threshold when compared
# with the most common message via jaro_winkler_metric, then renumber the
# remaining rows from zero.
threshold = 0.8
filtered_df = df.loc[
    df['Message'].map(lambda text: jaro_winkler_metric(text, max_string)) > threshold
].reset_index(drop=True)

display(filtered_df)

Unnamed: 0,Sender,Message,Date,Time
0,Melanie Chandra,kej 1-2 done,2020-08-03,03:48:55
1,~ Lindawati Haryanto,kej 1-2 done,2020-08-03,04:03:51
2,Sherly Cahyadi,kej 1-2 done,2020-08-03,04:08:44
3,~ Seto Ninik,kej 1-2 done,2020-08-03,04:32:19
4,~ 🪸Martha 🍁,kej 1-2 done,2020-08-03,05:45:14
5,~ Dewi Pratiwi,kej 1-2 done,2020-08-03,06:07:57
6,Dicky Andrian,kej 1-2 done,2020-08-03,06:14:01
7,~ 🎍,kej 1-2 done,2020-08-03,06:14:26
8,"~ dr. Andreas C.N., Sp.B.",kej 1-2 selesai,2020-08-03,06:15:08
9,~ Oma Lisa,kej1-2 done,2020-08-03,06:20:22
