# Convert JSON to Plain Text

`snscrape` outputs data in JSON Lines format (one JSON object per line). Each line must be parsed to strip the extraneous fields, leaving just the text to analyze. Since this text may contain newlines, we must escape the necessary characters before saving so that each record stays on a single line.

In [12]:
import json
import os
from dateutil import parser
from datetime import datetime, timezone


def process(in_file: str) -> None:
    """Split a JSON Lines dump into three escaped plain-text files by date.

    Each non-blank line of ``in_file`` is parsed as a JSON object. Its
    ``"date"`` field selects the output file and its ``"renderedContent"``
    field is written there, one record per line, escaped with the
    ``unicode_escape`` codec so that embedded newlines and non-ASCII
    characters cannot break the one-record-per-line layout.

    Output files (overwritten on every call):

    * ``./data/<basename>_to_2020.csv``    -- dates before 2020-01-01 UTC
    * ``./data/<basename>_to_2022.csv``    -- dates before 2022-01-01 UTC
    * ``./data/<basename>_to_present.csv`` -- everything else

    Args:
        in_file: Path to the JSON Lines input file, read as UTF-8.

    Raises:
        FileNotFoundError: If ``in_file`` (or the ``./data`` directory)
            does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"date"`` or ``"renderedContent"``.
    """
    basename = os.path.splitext(os.path.basename(in_file))[0]
    out_files = [
        f"./data/{basename}_to_2020.csv",
        f"./data/{basename}_to_2022.csv",
        f"./data/{basename}_to_present.csv",
    ]

    # Loop-invariant bucket boundaries; built once instead of per record.
    cutoff_2020 = datetime(2020, 1, 1, tzinfo=timezone.utc)
    cutoff_2022 = datetime(2022, 1, 1, tzinfo=timezone.utc)

    # The input is opened first so that a missing/unreadable input does not
    # wipe the outputs of a previous run. Mode "w" truncates existing files,
    # which replaces the former remove-then-exclusive-create sequence.
    with (
        open(in_file, encoding="utf-8") as file,
        open(out_files[0], "w", encoding="utf-8") as file_2020,
        open(out_files[1], "w", encoding="utf-8") as file_2022,
        open(out_files[2], "w", encoding="utf-8") as file_present,
    ):
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            timestamp = parser.parse(data["date"])
            if timestamp.tzinfo is None:
                # Naive and aware datetimes cannot be compared.
                # NOTE(review): assumes offset-less dates are UTC -- confirm.
                timestamp = timestamp.replace(tzinfo=timezone.utc)

            if timestamp < cutoff_2020:
                out_file = file_2020
            elif timestamp < cutoff_2022:
                out_file = file_2022
            else:
                out_file = file_present

            # unicode_escape yields pure ASCII bytes with newlines and
            # backslashes escaped, so each record occupies exactly one line.
            out_file.write(
                data["renderedContent"].encode("unicode_escape").decode("utf-8")
                + "\n"
            )

In [None]:
# Convert every scraped data set; each call writes three date-bucketed files.
for jsonl_path in (
    "./data/twitter_ainu_since_2018.jsonl",
    "./data/twitter_burakumin_since_2018.jsonl",
    "./data/twitter_zainichi_since_2018.jsonl",
):
    process(jsonl_path)

# Reading the Plain Text

The following is an example of how to load the escaped text file.

In [None]:
# Load an escaped text file back into a list of decoded strings, one entry
# per line of the file.
data = []
with open('./data/out.csv', encoding="utf-8") as file:
    for line in file:
        # Remove only the line terminator: leading/trailing spaces belong to
        # the stored text and would be lost with a bare strip().
        escaped = line.rstrip("\n")
        # The file holds ASCII-only escape sequences, so encoding back to
        # bytes and decoding with unicode_escape restores the original text.
        data.append(escaped.encode("utf-8").decode("unicode_escape"))