In [None]:
import praw
import json
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import evaluate

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

In [None]:
# Load the classification labels and encode both label columns as integer class ids.
with open('../data/reddit_scraping/classification_labels.json') as json_file:
    classification_labels = pd.DataFrame(json.load(json_file))

# Series.map turns every value that is missing from the mapping into NaN, which
# would silently produce float/NaN labels. Fail here, naming the offending values,
# instead of letting the problem surface during training.
for label_column, label_to_id in (
    ("gender_label", {
        "Male": 0,
        "Female": 1,
        "Non-Binary": 2,
        "Not Disclosed": 3
    }),
    ("subject_label", {
        "Self": 0,
        "Other": 1
    }),
):
    encoded_labels = classification_labels[label_column].map(label_to_id)
    unmapped_values = classification_labels.loc[encoded_labels.isna(), label_column].unique()
    if len(unmapped_values) > 0:
        raise ValueError(
            f"Unexpected values in column {label_column!r}: {list(unmapped_values)}"
        )
    classification_labels[label_column] = encoded_labels.astype("int64")

In [None]:
# Convert the DataFrame to a Hugging Face Dataset and hold out 10% of the rows
# as the "test" split. The fixed seed makes the split (and therefore the metrics
# reported for both classifiers) reproducible across kernel restarts.
classification_labels = Dataset.from_pandas(classification_labels)
classification_labels = classification_labels.train_test_split(test_size=0.1, seed=42)

In [None]:
# Tokenizer of the same checkpoint that both classification models are loaded from.
tokenizer_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
def _tokenize_with_label(examples, label_column):
    """Tokenize a batch of posts and expose one label column under the key ``label``.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``; the
            columns ``context`` and ``label_column`` are read from it.
        label_column: Name of the column whose values are copied to ``label``.

    Returns:
        A dict with the tokenizer output plus ``text`` and ``label`` entries.
        The new columns are returned explicitly rather than written into
        ``examples`` in place, so they do not depend on how ``map`` merges
        mutated inputs.
    """
    # Sequences are truncated but left unpadded here: the DataCollatorWithPadding
    # given to the Trainer pads every training batch, so padding at this stage
    # would only inflate all examples to the longest one of the map batch.
    batch = dict(tokenizer(examples["context"], truncation=True))
    batch["text"] = examples["context"]
    batch["label"] = examples[label_column]
    return batch


def preprocess_function_gender(examples):
    """Tokenize a batch and use ``gender_label`` as the training label."""
    return _tokenize_with_label(examples, "gender_label")


def preprocess_function_subject(examples):
    """Tokenize a batch and use ``subject_label`` as the training label."""
    return _tokenize_with_label(examples, "subject_label")


tokenized_posts_gender = classification_labels.map(preprocess_function_gender, batched=True)
tokenized_posts_subject = classification_labels.map(preprocess_function_subject, batched=True)

In [None]:
# Padding collator built on the shared tokenizer; passed to both Trainer instances.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Sequence classifier over the four gender labels; label2id is derived as the
# inverse of id2label so the two mappings cannot drift apart.
gender_id2label = {
    0: "Male",
    1: "Female",
    2: "Non-Binary",
    3: "Not Disclosed",
}
gender_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(gender_id2label),
    id2label=gender_id2label,
    label2id={label: class_id for class_id, label in gender_id2label.items()},
)

# Sequence classifier over the two subject labels; label2id is derived as the
# inverse of id2label so the two mappings cannot drift apart.
subject_id2label = {
    0: "Self",
    1: "Other",
}
subject_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(subject_id2label),
    id2label=subject_id2label,
    label2id={label: class_id for class_id, label in subject_id2label.items()},
)

In [None]:
# Load the four evaluation metrics used by compute_metrics, in a fixed order.
recall_metric, precision_metric, f1_metric, accuracy_metric = (
    evaluate.load(metric_name)
    for metric_name in ("recall", "precision", "f1", "accuracy")
)

def compute_metrics(eval_pred):
    """Score a batch of predictions with recall, precision, F1 and accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``recall``, ``precision``, ``f1 score`` and
        ``accuracy``. The first three use ``average="weighted"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    shared_inputs = {"predictions": predicted_classes, "references": references}

    recall = recall_metric.compute(average="weighted", **shared_inputs)
    precision = precision_metric.compute(average="weighted", **shared_inputs)
    f1 = f1_metric.compute(average="weighted", **shared_inputs)
    accuracy = accuracy_metric.compute(**shared_inputs)

    return {
        'recall': recall['recall'],
        'precision': precision['precision'],
        'f1 score': f1['f1'],
        'accuracy': accuracy['accuracy'],
    }

In [None]:
# Training configuration for the gender classifier.
gender_training_args = TrainingArguments(
    output_dir="../models/gender_training",
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then load the best model at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Request the MPS device.
    use_mps_device=True,
)

# Train the gender classifier on the gender-labelled splits and score it on the
# held-out "test" split with compute_metrics.
gender_train_split = tokenized_posts_gender["train"]
gender_eval_split = tokenized_posts_gender["test"]

gender_trainer = Trainer(
    model=gender_model,
    args=gender_training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=gender_train_split,
    eval_dataset=gender_eval_split,
)

gender_trainer.train()

In [None]:
# Save the gender model to the directory the text-classification pipeline loads it from.
gender_trainer.save_model(output_dir="../models/gender_training")

In [None]:
# Training configuration for the subject classifier.
subject_training_args = TrainingArguments(
    output_dir="../models/subject_training",
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then load the best model at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Request the MPS device.
    use_mps_device=True,
)

# Train the subject classifier.
# The splits must come from tokenized_posts_subject, whose "label" column holds
# subject_label (Self/Other). tokenized_posts_gender carries the four gender
# classes in "label" and does not match this two-label model.
subject_trainer = Trainer(
    model=subject_model,
    args=subject_training_args,
    train_dataset=tokenized_posts_subject["train"],
    eval_dataset=tokenized_posts_subject["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

subject_trainer.train()

In [None]:
# Save the subject model to the directory the text-classification pipeline loads it from.
subject_trainer.save_model(output_dir="../models/subject_training")

# Test Model

In [None]:
# Read the Reddit API credentials from a JSON file kept outside the notebook.
with open('../reddit_api.json') as json_file:
    reddit_api_credentials = json.load(json_file)

# Build the client from the loaded credentials and select the AskDocs subreddit.
reddit_read_only = praw.Reddit(
    client_id=reddit_api_credentials['client_id'],
    client_secret=reddit_api_credentials['secret'],
    user_agent=reddit_api_credentials['user_agent'],
)
subreddit = reddit_read_only.subreddit("AskDocs")

In [None]:
# Fetch one random post and join its title and body into the text to classify.
random_post = subreddit.random()
# subreddit.random() returns None when no random submission is available;
# report that explicitly instead of failing with an AttributeError on `.title`.
if random_post is None:
    raise RuntimeError(
        "subreddit.random() returned no submission; "
        "pick a post another way (for example from subreddit.new())."
    )
context = f"{random_post.title}\n{random_post.selftext}"
context

In [None]:
# Classify the sampled post with the saved gender model.
classifier = pipeline(
    task="text-classification",
    model="../models/gender_training",
    truncation=True,
    padding=True,
)
classifier(context)

In [None]:
# Classify the sampled post with the saved subject model.
classifier = pipeline(
    task="text-classification",
    model="../models/subject_training",
    truncation=True,
    padding=True,
)
classifier(context)

In [None]:
# Metrics of the gender trainer on the evaluation dataset it was constructed with.
gender_eval_metrics = gender_trainer.evaluate()
gender_eval_metrics

In [None]:
# Metrics of the subject trainer on the evaluation dataset it was constructed with.
subject_eval_metrics = subject_trainer.evaluate()
subject_eval_metrics