In [1]:
import json
import os
import string
from datetime import datetime

In [2]:
# Path of the exported WhatsApp chat log that the rest of the notebook parses.
filename = "data/WhatsApp Chat with Sample Group.txt"

# Read the export as a list of raw lines (trailing newlines are kept).
with open(filename, encoding="utf-8") as chat_file:
    file_data = chat_file.readlines()

In [3]:
def isMessage(message: str) -> bool:
    """Return False when `message` contains any known non-message marker.

    The check is a plain substring match against a fixed list of marker
    strings, so a line is rejected if a marker appears anywhere in it.
    """
    non_message_markers = (
        "Messages and calls are end-to-end encrypted.",
        "created this group",
        "created group",
        "<Media omitted>",
        "<This message was edited>",
        "live location shared",
        "null",
        "added you",
    )

    return not any(marker in message for marker in non_message_markers)

In [4]:
# Currently unused by the rest of the notebook.

def isReaction(message: str) -> bool:
    """Return True when `message` contains no ASCII letter at all.

    Any ASCII letter marks the text as a regular message; an empty string
    therefore counts as a reaction.
    """
    return not any(char in string.ascii_letters for char in message)

In [5]:
def isValidDate(date: str, date_format: str = '%m/%d/%y') -> bool:
    """Return True if `date` can be parsed with `date_format`.

    The default format is month-first with a two-digit year (MM/DD/YY),
    not DD/MM/YY. Pass a different `date_format` (for example '%d/%m/%y')
    when the chat export uses another date layout.

    Args:
        date: Candidate date string.
        date_format: `datetime.strptime` format the string must match.

    Returns:
        True when parsing succeeds, False when `strptime` raises ValueError.
    """
    try:
        datetime.strptime(date, date_format)
    except ValueError:
        # strptime signals both a format mismatch and an impossible date
        # (e.g. month 13) with ValueError.
        return False
    return True

In [6]:
def filterUnicode(string: str, replacements: list[tuple[str, str]]) -> str:
    """Apply the given substitutions, then drop every non-ASCII character.

    Each `(target, substitute)` pair in `replacements` is applied in order
    with `str.replace`; whatever non-ASCII characters remain afterwards are
    removed by an ASCII encode/decode round trip.
    """
    cleaned = string
    for target, substitute in replacements:
        cleaned = cleaned.replace(target, substitute)

    # errors="ignore" silently discards characters that ASCII cannot encode.
    ascii_bytes = cleaned.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii")


In [7]:
def isValidTime(time: str, time_format: str = '%I:%M %p') -> bool:
    """Return True if `time` can be parsed with `time_format`.

    The default format is a 12-hour clock followed by a space and an AM/PM
    marker (e.g. "10:30 PM"); a bare 24-hour "HH:MM" string does not match
    it. Pass `time_format='%H:%M'` for exports that use a 24-hour clock.

    Args:
        time: Candidate time string.
        time_format: `datetime.strptime` format the string must match.

    Returns:
        True when parsing succeeds, False when `strptime` raises ValueError.
    """
    try:
        datetime.strptime(time, time_format)
    except ValueError:
        # Raised for a format mismatch or an out-of-range hour/minute.
        return False
    return True

In [8]:
# Extract Data

# Parsed messages, and the lines that were skipped together with the reason.
data = []
ignored_data = []

for line in file_data:

    if not isMessage(line):
        ignored_data.append({"Content":line, "Cause":"Not a Message"})
        continue

    line = line.rstrip("\n")  # Remove trailing newline character

    # Expected layout: "<date>, <time> - <username>: <message>".
    # Locate the three separators left to right, each search starting just
    # past the previous separator so a ':' inside the time is never taken
    # for the username separator.
    comma_pos = line.find(",")
    dash_pos = line.find(" -", comma_pos + 2)
    colon_pos = line.find(":", dash_pos + 3)

    # str.find returns -1 when a separator is absent. Slicing with -1 would
    # silently chop characters and yield a bogus record (for example a line
    # with no ':' after the time would be stored with a truncated
    # "username"), so such lines are rejected explicitly.
    if -1 in (comma_pos, dash_pos, colon_pos):
        ignored_data.append({"Content":line, "Cause":"Malformed Line"})
        continue

    date = line[:comma_pos]
    time = line[comma_pos + 2:dash_pos]      # skip ", "
    username = line[dash_pos + 3:colon_pos]  # skip " - "
    message = line[colon_pos + 2:]           # skip ": "

    # Turn the narrow no-break space (U+202F) into a regular space and drop
    # any other non-ASCII character before validating.
    time = filterUnicode(time, [("\u202f", " ")])
    date = filterUnicode(date, [("\u202f", " ")])

    # Ensure that all parts are extracted properly
    if not message:
        ignored_data.append({"Content":line, "Cause":"Empty Message"})
        continue

    if not username:
        ignored_data.append({"Content":line, "Cause":"Empty Username"})
        # Without this `continue` the line would be logged as ignored and
        # could still be appended to `data` below.
        continue

    if not isValidDate(date=date):
        ignored_data.append({"Content":line, "Cause":"Invalid Date"})
        continue

    if not isValidTime(time=time):
        ignored_data.append({"Content":line, "Cause":"Invalid Time"})
        continue

    # Store the message data
    data.append({
        "Date": date,
        "Time": time,
        "Username": username,
        "Message": message
    })


In [9]:
# Save Data

# Derive "output/<chat name>.json" from the input path. The name is computed
# outside the f-string because reusing double quotes inside an f-string
# replacement field is a SyntaxError before Python 3.12.
output_basename = filename.split("/")[-1].replace(".txt", ".json")
output_filename = f"output/{output_basename}"

# Create the target directory so the write works on a fresh checkout.
os.makedirs("output", exist_ok=True)

with open(output_filename, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2)

In [10]:
# Save Ignored Data

# Derive "output/ignored_<chat name>.json" from the input path. The name is
# computed outside the f-string because reusing double quotes inside an
# f-string replacement field is a SyntaxError before Python 3.12.
ignored_basename = filename.split("/")[-1].replace(".txt", ".json")
ignored_data_output_filename = f"output/ignored_{ignored_basename}"

# Create the target directory so the write works on a fresh checkout.
os.makedirs("output", exist_ok=True)

with open(ignored_data_output_filename, "w", encoding="utf-8") as file:
    json.dump(ignored_data, file, indent=2)

In [11]:
# Summary: how many lines became messages and how many were skipped.

extracted_count = len(data)
ignored_count = len(ignored_data)

print(f"Extracted Amount : {extracted_count}")
print(f"Ignored Amount   : {ignored_count}")

Extracted Amount : 2
Ignored Amount   : 2
