## Parse GroupMe message JSON files

#### Imports

In [1]:
from datetime import date
import json
from pathlib import Path
import random
import re
from tqdm import tqdm
import unicodedata

#### Constants

In [2]:
# Text file that receives the cleaned messages; the name carries today's date
training_data_path: Path = (
    Path("Data") / "TrainingData" / f"training-data-{date.today()}.txt"
)
# Folder that is scanned for GroupMe exports
data_dir: Path = Path("Data") / "4-27-23 Data"

In [3]:
# Each entry is later expected to be a folder containing a "message.json", so
# stray files (e.g. OS metadata files) are left out; sorted for a stable order.
file_paths: list[Path] = sorted(
    entry for entry in data_dir.iterdir() if entry.is_dir()
)

#### Functions

In [10]:
# Match phone numbers to remove them from output
_PHONE_NUMBER_RE: re.Pattern[str] = re.compile(
    r"(\+\d{1,3}\s?)?((\(\d{3}\)\s?)|(\d{3})(\s|-?))(\d{3}(\s|-?))(\d{4})(\s?(([E|e]xt[:|.|]?)|x|X)(\s?\d+))?"
)
# Match urls to remove them from output
_URL_RE: re.Pattern[str] = re.compile(r"http\S+")
# Match any run of two or more consecutive newlines
_NEWLINE_RUN_RE: re.Pattern[str] = re.compile(r"\n{2,}")


def _clean_message_text(text: str) -> str:
    """Return text as plain ASCII with URLs, phone numbers and blank lines removed."""
    # replace unicode apostrophe with ascii apostrophe before the ASCII
    # conversion below, which would otherwise discard it
    text = text.replace("\u2019", "'")

    # decompose characters, then drop everything that is not ASCII
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode()

    # remove urls and phone numbers
    text = _URL_RE.sub("", text)
    text = _PHONE_NUMBER_RE.sub("", text)

    # collapse every run of newlines into one; a single "\n\n" -> "\n" pass
    # would still leave blank lines behind for three or more newlines
    text = _NEWLINE_RUN_RE.sub("\n", text)

    # remove whitespace on edges
    return text.strip()


def parse_convo(input_path: Path, output_path: Path) -> None:
    """Parse a GroupMe message.json file and append the cleaned text
    to a given output file

    Messages with empty text, messages named "GroupMe" or sent by "system",
    and messages that are empty after cleaning are skipped. Every remaining
    message is written followed by a single newline.

    Args:
        input_path (Path): Path of the message.json file to read
        output_path (Path): Path of the text file to append to

    Returns:
        None: Returns None
    """
    with open(input_path, "r", encoding="utf-8-sig") as data_file:
        messages = json.load(data_file)

    with open(output_path, "a", encoding="utf-8") as output_file:
        for message in messages:
            if not message["text"]:
                # skip empty messages
                continue
            if message["name"] == "GroupMe" or message["sender_id"] == "system":
                # skip system messages
                continue

            cleaned: str = _clean_message_text(message["text"])
            if not cleaned:
                # if after cleaning, message is empty, skip it
                continue

            # cleaned text is stripped, so it never ends with a newline itself
            output_file.write(cleaned + "\n")

In [5]:
def shuffle_training_data(training_data_path: Path) -> None:
    """Shuffle the lines of a training data text file in place

    The file is read completely, its lines are put in random order and the
    file is then overwritten with the reordered lines.

    Args:
        training_data_path (Path): Path of training data text file

    Returns:
        None: Returns None
    """
    with open(training_data_path, "r") as training_data_file:
        text_lines: list[str] = training_data_file.readlines()

    # a final line without a newline would be glued to whichever line
    # follows it after shuffling, so terminate it first
    if text_lines and not text_lines[-1].endswith("\n"):
        text_lines[-1] += "\n"

    random.shuffle(text_lines)

    with open(training_data_path, "w") as training_data_file:
        training_data_file.writelines(text_lines)

#### Delete content in training data file

In [11]:
# Make sure the output folder exists, then truncate (or create) the training
# file so the appending parse step below starts from an empty file.
training_data_path.parent.mkdir(parents=True, exist_ok=True)
with open(training_data_path, "w"):
    pass

#### Parse GroupMe message JSON exports

In [12]:
# Append the cleaned text of every folder's message.json to the training file
# (the tqdm progress bar is switched off via disable=True)
for file_path in tqdm(file_paths, desc="Parsing Files", disable=True):
    parse_convo(
        input_path=file_path / "message.json",
        output_path=training_data_path,
    )

#### Shuffle Training Data

In [9]:
# Randomize the line order of the freshly written training file
shuffle_training_data(training_data_path)