In [None]:
import praw
import json
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import evaluate

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

In [None]:
with open('../data/reddit_scraping/classification_labels.json') as json_file:
    classification_labels = pd.DataFrame(json.load(json_file))
    classification_labels["gender_label"] = classification_labels["gender_label"].map({
        "Male": 0,
        "Female": 1,
        "Non-Binary": 2,
        "Not Disclosed": 3
    })
    classification_labels["subject_label"] = classification_labels["subject_label"].map({
        "Self": 0,
        "Other": 1
    })

In [None]:
classification_labels = Dataset.from_pandas(classification_labels)
classification_labels = classification_labels.train_test_split(test_size=0.1)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [None]:
def preprocess_function_gender(examples):
    examples["text"] = examples["context"]
    examples["label"] = examples["gender_label"]
    return tokenizer(examples["text"], truncation=True, padding = True)
tokenized_posts_gender = classification_labels.map(preprocess_function_gender, batched=True)

def preprocess_function_subject(examples):
    examples["text"] = examples["context"]
    examples["label"] = examples["subject_label"]
    return tokenizer(examples["text"], truncation=True, padding = True)
tokenized_posts_subject = classification_labels.map(preprocess_function_subject, batched=True)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
gender_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", 
    num_labels=4, 
    id2label={
        0: "Male",
        1: "Female",
        2: "Non-Binary",
        3: "Not Disclosed"
    }, 
    label2id={
        "Male": 0,
        "Female": 1,
        "Non-Binary": 2,
        "Not Disclosed": 3
    }
)

subject_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased", 
    num_labels=2, 
    id2label={
        0: "Self",
        1: "Other"
    }, 
    label2id={
        "Self": 0,
        "Other": 1
    }
)

In [None]:
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'recall': recall_metric.compute(predictions=predictions, references=labels, average="weighted")['recall'],
        'precision': precision_metric.compute(predictions=predictions, references=labels, average="weighted")['precision'],
        'f1 score': f1_metric.compute(predictions=predictions, references=labels, average="weighted")['f1'],
        'accuracy': accuracy_metric.compute(predictions=predictions, references=labels)['accuracy'],
    }

In [None]:
gender_training_args = TrainingArguments(
    output_dir="../models/gender_training",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01, 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # use_mps_device=True
)

gender_trainer = Trainer(
    model=gender_model,
    args=gender_training_args,
    train_dataset=tokenized_posts_gender["train"],
    eval_dataset=tokenized_posts_gender["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

gender_trainer.train()

In [None]:
gender_trainer.save_model("../models/gender_training")

In [102]:
subject_training_args = TrainingArguments(
    output_dir="../models/subject_training",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01, 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # use_mps_device=True
)

subject_trainer = Trainer(
    model=subject_model,
    args=subject_training_args,
    train_dataset=tokenized_posts_subject["train"],
    eval_dataset=tokenized_posts_subject["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

subject_trainer.train()

  0%|          | 0/1150 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.1546141356229782, 'eval_recall': 0.946078431372549, 'eval_precision': 0.9407267030523551, 'eval_f1 score': 0.942771891365242, 'eval_accuracy': 0.946078431372549, 'eval_runtime': 40.0235, 'eval_samples_per_second': 5.097, 'eval_steps_per_second': 0.325, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.24668292701244354, 'eval_recall': 0.9264705882352942, 'eval_precision': 0.9394282802726825, 'eval_f1 score': 0.9319127839379098, 'eval_accuracy': 0.9264705882352942, 'eval_runtime': 41.9011, 'eval_samples_per_second': 4.869, 'eval_steps_per_second': 0.31, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.1404285579919815, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9570303335009217, 'eval_f1 score': 0.9574224414212223, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 40.8813, 'eval_samples_per_second': 4.99, 'eval_steps_per_second': 0.318, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.16953930258750916, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9583645045394512, 'eval_f1 score': 0.9592524509803921, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 39.6712, 'eval_samples_per_second': 5.142, 'eval_steps_per_second': 0.328, 'epoch': 4.0}
{'loss': 0.154, 'learning_rate': 2.826086956521739e-05, 'epoch': 4.35}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.19743870198726654, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9570303335009217, 'eval_f1 score': 0.9574224414212223, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 38.065, 'eval_samples_per_second': 5.359, 'eval_steps_per_second': 0.342, 'epoch': 5.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.20287811756134033, 'eval_recall': 0.9656862745098039, 'eval_precision': 0.9631241156256317, 'eval_f1 score': 0.963582112686972, 'eval_accuracy': 0.9656862745098039, 'eval_runtime': 38.0767, 'eval_samples_per_second': 5.358, 'eval_steps_per_second': 0.341, 'epoch': 6.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.2331494390964508, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9583645045394512, 'eval_f1 score': 0.9592524509803921, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 40.2675, 'eval_samples_per_second': 5.066, 'eval_steps_per_second': 0.323, 'epoch': 7.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.2531433403491974, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9583645045394512, 'eval_f1 score': 0.9592524509803921, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 39.9887, 'eval_samples_per_second': 5.101, 'eval_steps_per_second': 0.325, 'epoch': 8.0}
{'loss': 0.0615, 'learning_rate': 6.521739130434783e-06, 'epoch': 8.7}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.25796064734458923, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9583645045394512, 'eval_f1 score': 0.9592524509803921, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 122.1621, 'eval_samples_per_second': 1.67, 'eval_steps_per_second': 0.106, 'epoch': 9.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.26705169677734375, 'eval_recall': 0.9607843137254902, 'eval_precision': 0.9583645045394512, 'eval_f1 score': 0.9592524509803921, 'eval_accuracy': 0.9607843137254902, 'eval_runtime': 41.1475, 'eval_samples_per_second': 4.958, 'eval_steps_per_second': 0.316, 'epoch': 10.0}
{'train_runtime': 10112.476, 'train_samples_per_second': 1.81, 'train_steps_per_second': 0.114, 'train_loss': 0.09741942239844281, 'epoch': 10.0}


TrainOutput(global_step=1150, training_loss=0.09741942239844281, metrics={'train_runtime': 10112.476, 'train_samples_per_second': 1.81, 'train_steps_per_second': 0.114, 'train_loss': 0.09741942239844281, 'epoch': 10.0})

In [103]:
subject_trainer.save_model("../models/subject_training")

# Test Model

In [110]:
with open('../reddit_api.json') as json_file:
    reddit_api_credentials = json.load(json_file)
    reddit_read_only = praw.Reddit(client_id=reddit_api_credentials['client_id'],
                                   client_secret=reddit_api_credentials['secret'],
                                   user_agent=reddit_api_credentials['user_agent']) 

subreddit = reddit_read_only.subreddit("AskDocs")

In [111]:
random_post = subreddit.random()
context = f"{random_post.title}\n{random_post.selftext}"
context

"My wife (31F) and I (32M) are getting prepared to try for our second child. Both times, she has wanted me to be sober (from alcohol) for at least 30 days before ovulation and trying because it is supposed to be beneficial. If I drink once during that period, does it defeat the whole purpose?\nI have to travel for work this week and know my coworkers will want to go out for dinner and drink. I would like to participate but if it would essentially restart the clock that would be a problem. Is the benefit from a long period of ZERO alcohol or is the benefit from seriously cutting back for a while? I don't want to be stupid just for the sake of one night if it really matters."

In [112]:
classifier = pipeline("text-classification", 
                             truncation=True, 
                             padding = True, 
                             model="../models/gender_training")
classifier(context)

[{'label': 'Male', 'score': 0.9987389445304871}]

In [113]:
classifier = pipeline("text-classification", 
                             truncation=True, 
                             padding = True, 
                             model="../models/subject_training")
classifier(context)

[{'label': 'Self', 'score': 0.989680826663971}]

In [114]:
gender_trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 1.4156204462051392, 'eval_recall': 0.014705882352941176, 'eval_precision': 0.0002184041933605125, 'eval_f1 score': 0.000430416068866571, 'eval_accuracy': 0.014705882352941176, 'eval_runtime': 37.1528, 'eval_samples_per_second': 5.491, 'eval_steps_per_second': 0.35, 'epoch': 0}


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.4156204462051392,
 'eval_recall': 0.014705882352941176,
 'eval_precision': 0.0002184041933605125,
 'eval_f1 score': 0.000430416068866571,
 'eval_accuracy': 0.014705882352941176,
 'eval_runtime': 37.1528,
 'eval_samples_per_second': 5.491,
 'eval_steps_per_second': 0.35,
 'epoch': 0}

In [109]:
subject_trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.1404285579919815,
 'eval_recall': 0.9607843137254902,
 'eval_precision': 0.9570303335009217,
 'eval_f1 score': 0.9574224414212223,
 'eval_accuracy': 0.9607843137254902,
 'eval_runtime': 38.6312,
 'eval_samples_per_second': 5.281,
 'eval_steps_per_second': 0.337,
 'epoch': 10.0}