# Classify by Date

Classifies each tweet as toxic or non-toxic, then counts the results grouped by the date the tweet was posted.

## Imports

In [1]:
import json
import os
import torch
from collections import defaultdict
from dateutil import parser
from datetime import date
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

## Load Model

In [6]:
# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The tokenizer is loaded by its "cl-tohoku/bert-base-japanese" identifier;
# the classification weights are loaded from the local ./models directory.
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
model = AutoModelForSequenceClassification.from_pretrained("./models/bert_toxicity")

# `pipe` is the callable handed to classify() further down.
pipe = TextClassificationPipeline(tokenizer=tokenizer, model=model, device=device)

## Define Functions

In [7]:
def process(in_file: str) -> dict[date, list[str]]:
    """Group tweet texts from a JSON Lines file by the date they were posted.

    Every non-blank line of ``in_file`` is decoded as one JSON object. Its
    ``"date"`` value is parsed with ``parser.parse`` and reduced to a
    ``date`` (no timezone conversion is applied), and its
    ``"renderedContent"`` value is appended to the list for that date.

    Args:
        in_file: Path to the JSON Lines file. It is read as UTF-8.

    Returns:
        A ``defaultdict(list)`` mapping each date to the texts posted on it,
        in the order they appear in the file. An empty or all-blank file
        yields an empty mapping.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"date"`` or ``"renderedContent"``.
    """
    groups = defaultdict(list)
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which breaks on non-ASCII tweet text on some platforms.
    with open(in_file, encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            timestamp = parser.parse(data["date"]).date()
            groups[timestamp].append(data["renderedContent"])
    return groups

def classify(groups: dict[date, list[str]], output_path: str, pipe: TextClassificationPipeline) -> None:
    """Classify each date's texts and write per-date label counts as CSV.

    For every ``(date, texts)`` entry in ``groups``, ``pipe(texts)`` is called
    once. Results whose ``"label"`` equals ``"LABEL_1"`` are counted as toxic
    and all remaining results as non-toxic. One row is written per date, in
    the iteration order of ``groups``.

    Args:
        groups: Mapping from date to the list of texts posted on that date.
        output_path: Destination CSV path. An existing file is replaced.
        pipe: Callable that takes a list of texts and returns one result
            mapping per text, each with a ``"label"`` key.

    Returns:
        None. The output is the file at ``output_path`` with the header
        ``date,num_nontoxic,num_toxic``.
    """
    # Mode "w" creates or truncates in a single step; this replaces the
    # earlier check-remove-then-exclusive-create sequence.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write("date,num_nontoxic,num_toxic\n")
        for timestamp, texts in groups.items():
            results = pipe(texts)
            num_toxic = sum(result["label"] == "LABEL_1" for result in results)
            num_nontoxic = len(results) - num_toxic
            file.write(f"{timestamp},{num_nontoxic},{num_toxic}\n")

## Classify

In [8]:
# (input JSONL, output CSV) pairs, handled one after another in this order.
jobs = (
    (
        "./data/twitter_ainu_since_2018.jsonl",
        "./data/twitter_ainu_since_2018_by_date.csv",
    ),
    (
        "./data/twitter_burakumin_since_2018.jsonl",
        "./data/twitter_burakumin_since_2018_by_date.csv",
    ),
    (
        "./data/twitter_zainichi_since_2018.jsonl",
        "./data/twitter_zainichi_since_2018_by_date.csv",
    ),
)

for source_path, csv_path in jobs:
    # Group the tweets by date, then write that dataset's per-date counts.
    groups = process(source_path)
    classify(groups, csv_path, pipe)

KeyboardInterrupt: 