## Parse GroupMe message jsons

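This notebook parses the `message.json` files from a GroupMe data export, cleans the message text (removing URLs, phone numbers, and non-ASCII characters), and writes it out as a shuffled training-data text file. Image attachment URLs are collected into a separate CSV file.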


#### Imports


In [5]:
import csv
import json
import os
import random
import re
import unicodedata
from datetime import date
from pathlib import Path

from tqdm import tqdm

#### Config


In [6]:
# Folder holding the GroupMe export; each entry is expected to be a
# conversation folder that contains a message.json file.
group_me_convo_path: Path = Path("Data/groupme_export")

# Cleaned training text is written to a folder stamped with today's date.
training_data_path: Path = Path(f"Data/training-data-{date.today()}/training_data.txt")

# CSV that receives one image URL per row. The default is the original
# machine-specific location; set the ETABOT_IMAGE_CSV_PATH environment variable
# to redirect it when running the notebook on another machine.
image_csv_path: Path = Path(
    os.environ.get(
        "ETABOT_IMAGE_CSV_PATH", "/home/jimmy/GitHub/etaBot/images/images.csv"
    )
)

#### Functions


In [7]:
def parse_convo(input_path: Path, output_path: Path, image_url_csv_path: Path | None = None) -> None:
    """Parse a GroupMe message.json file and write the text to a 
    given output folder

    Args:
        input_path (Path): Path for input data
        output_path (Path): Path for where output file will go

    Returns:
        None: Returns None
    """
    with open(input_path, "r", encoding="utf-8-sig") as data_file:
        data = data_file.read()
    data_json = json.loads(data)

    phone_number_re: re.Pattern[str] = re.compile(
        r"(\+\d{1,3}\s?)?((\(\d{3}\)\s?)|(\d{3})(\s|-?))(\d{3}(\s|-?))(\d{4})(\s?(([E|e]xt[:|.|]?)|x|X)(\s?\d+))?"
    )  # Match phone numbers to remove them from output

    url_re: re.Pattern[str] = re.compile(r"http\S+")
    # Match urls to remove them from output

    with open(output_path, "a") as output_file:
        for message in data_json:
            msg_txt: str = message["text"]
            if not message["text"]:
                # skip empty messages
                continue
            if message["name"] == "GroupMe" or message["sender_id"] == "system":
                # skip system messages
                continue
            if "event" in message.keys():
                # skip events like polls, calandar events, etc.
                continue
            if image_url_csv_path and message["attachments"]:
                for attachment in message["attachments"]:
                    if attachment["type"] == "image":
                        with open(image_url_csv_path, "a") as csv_file:
                            csv.writer(csv_file).writerow([attachment["url"]])
            if "FRATGPT" in msg_txt.upper():
                # keep the bot ignorant of itself
                continue

            # replace unicode apostrophe with ascii apostrophe
            msg_txt = msg_txt.replace("\u2019", "'")

            # remove unicode characters
            msg_txt = (
                unicodedata.normalize("NFKD", msg_txt).encode("ascii", "ignore")
            ).decode()

            # remove urls and phone numbers
            msg_txt = re.sub(url_re, "", msg_txt)
            msg_txt = re.sub(phone_number_re, "", msg_txt)

            # make double new line only one line
            msg_txt = msg_txt.replace("\n\n", "\n")

            # remove whitespace on edges
            msg_txt = msg_txt.strip()

            if len(msg_txt) == 0 or not msg_txt:
                # if after cleaning, message is empty, skip it
                continue

            # write to output file
            # output_file.write(json.dumps(message, indent=2))  # write raw msg
            if msg_txt.endswith("\n"):
                output_file.write(msg_txt)
            else:
                output_file.write(msg_txt + "\n")
    return None

In [8]:
def shuffle_training_data(training_data_path: Path) -> None:
    """Shuffles training data

    Args:
        training_data_path (Path): Path of training data text file

    Returns:
        None: Returns None
    """
    with open(training_data_path, "r") as training_data_file:
        text_lines: list[str] = [(random.random(), line) for line in training_data_file]

    text_lines.sort()

    with open(training_data_path, "w") as training_data_file:
        for _, line in text_lines:
            training_data_file.write(line)

    return None

#### Make Folders if not present

In [9]:
# Create the parent folder of each output file, including any missing
# ancestors; folders that already exist are left alone.
for generated_file_path in (training_data_path, image_csv_path):
    generated_file_path.parent.mkdir(parents=True, exist_ok=True)

#### Clear Old Data

In [10]:
# Opening a file in "w" mode truncates it (or creates it empty), so both
# outputs start this run with no content left over from a previous one.
for stale_output_path in (image_csv_path, training_data_path):
    with open(stale_output_path, "w"):
        pass

#### Make list of file paths

In [11]:
# One entry per conversation folder in the export. Plain files sitting next to
# the folders (e.g. OS metadata files) are skipped because "<file>/message.json"
# can never exist; sorting makes the processing order deterministic.
file_paths: list[Path] = sorted(
    entry for entry in group_me_convo_path.iterdir() if entry.is_dir()
)

#### Parse GroupMe message json


In [12]:
# Run the parser over the message.json inside every listed path. All of them
# append to the same training-data file and the same image CSV.
# The progress bar is constructed with disable=True, so nothing is displayed.
convo_progress = tqdm(file_paths, desc="Parsing Files", disable=True)
for convo_dir in convo_progress:
    message_json_path = convo_dir / "message.json"
    parse_convo(message_json_path, training_data_path, image_csv_path)

#### Shuffle Training Data


In [13]:
# Randomize the order of the lines in the training file, rewriting it in place.
shuffle_training_data(training_data_path)

In [14]:
# Sanity check: print one randomly chosen image URL from the collected CSV.
# (csv and random are already imported in the imports cell at the top.)
with open(image_csv_path, "r", newline="") as csv_file:
    # each non-empty row holds a single URL in its first column
    image_rows = [row for row in csv.reader(csv_file) if row]

if image_rows:
    # pick a random ROW, then take its URL; indexing [0] before choosing
    # would always sample from the first row only
    print(random.choice(image_rows)[0])
else:
    print("No image URLs were collected.")

https://i.groupme.com/1244x2208.jpeg.5d298696fd494d3998a8913821d955af
