In [1]:
import praw
import json
import pandas as pd
from datasets import load_dataset, Dataset

from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline

import evaluate
import numpy as np

In [2]:
with open('../data/reddit_scraping/ner_labels.json') as json_file:
    ner_labels = pd.DataFrame(json.load(json_file))

label2id = {
    'O': 0,
    'B-age': 1, 
    'I-age': 2,
    'B-age_unit': 3,
    'I-age_unit': 4
}

id2label = {
    0: 'O',
    1: 'B-age',
    2: 'I-age',
    3: 'B-age_unit',
    4: 'I-age_unit',
}

ner_labels['ner_tags'] = ner_labels['token_labels'].map(lambda x: [label2id[y] for y in x])

ner_labels = Dataset.from_pandas(ner_labels[['post_id', 'context', 'tokens', 'ner_tags']].rename({'tokens': 'words'}, axis = 1))
ner_labels = ner_labels.train_test_split(test_size=0.1)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, padding=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_ner_labels = ner_labels.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2265 [00:00<?, ? examples/s]

Map:   0%|          | 0/252 [00:00<?, ? examples/s]

In [4]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [5]:
seqeval = evaluate.load("seqeval")

In [6]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [7]:
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-cased", id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [8]:
training_args = TrainingArguments(
    output_dir="../models/age_token_classification",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ner_labels["train"],
    eval_dataset=tokenized_ner_labels["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/1420 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004731182474642992, 'eval_precision': 0.6996336996336996, 'eval_recall': 0.9052132701421801, 'eval_f1': 0.7892561983471073, 'eval_accuracy': 0.9978626658005574, 'eval_runtime': 51.972, 'eval_samples_per_second': 4.849, 'eval_steps_per_second': 0.308, 'epoch': 1.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004006328526884317, 'eval_precision': 0.718978102189781, 'eval_recall': 0.933649289099526, 'eval_f1': 0.8123711340206186, 'eval_accuracy': 0.9980931626259875, 'eval_runtime': 53.5635, 'eval_samples_per_second': 4.705, 'eval_steps_per_second': 0.299, 'epoch': 2.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.003824098501354456, 'eval_precision': 0.7351778656126482, 'eval_recall': 0.8815165876777251, 'eval_f1': 0.8017241379310345, 'eval_accuracy': 0.9980722083691302, 'eval_runtime': 53.274, 'eval_samples_per_second': 4.73, 'eval_steps_per_second': 0.3, 'epoch': 3.0}
{'loss': 0.0115, 'learning_rate': 3.23943661971831e-05, 'epoch': 3.52}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004087214358150959, 'eval_precision': 0.75, 'eval_recall': 0.7819905213270142, 'eval_f1': 0.7656612529002321, 'eval_accuracy': 0.9978836200574147, 'eval_runtime': 54.4552, 'eval_samples_per_second': 4.628, 'eval_steps_per_second': 0.294, 'epoch': 4.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004275472369045019, 'eval_precision': 0.7364016736401674, 'eval_recall': 0.8341232227488151, 'eval_f1': 0.7822222222222223, 'eval_accuracy': 0.9979464828279865, 'eval_runtime': 54.062, 'eval_samples_per_second': 4.661, 'eval_steps_per_second': 0.296, 'epoch': 5.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004715894348919392, 'eval_precision': 0.7564102564102564, 'eval_recall': 0.8388625592417062, 'eval_f1': 0.7955056179775282, 'eval_accuracy': 0.9980931626259875, 'eval_runtime': 51.8556, 'eval_samples_per_second': 4.86, 'eval_steps_per_second': 0.309, 'epoch': 6.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004880106542259455, 'eval_precision': 0.7510729613733905, 'eval_recall': 0.8293838862559242, 'eval_f1': 0.7882882882882883, 'eval_accuracy': 0.9980302998554156, 'eval_runtime': 51.4664, 'eval_samples_per_second': 4.896, 'eval_steps_per_second': 0.311, 'epoch': 7.0}
{'loss': 0.0027, 'learning_rate': 1.4788732394366198e-05, 'epoch': 7.04}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.004889966920018196, 'eval_precision': 0.7671232876712328, 'eval_recall': 0.7962085308056872, 'eval_f1': 0.7813953488372093, 'eval_accuracy': 0.9980302998554156, 'eval_runtime': 57.1764, 'eval_samples_per_second': 4.407, 'eval_steps_per_second': 0.28, 'epoch': 8.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.00504990853369236, 'eval_precision': 0.7489177489177489, 'eval_recall': 0.8199052132701422, 'eval_f1': 0.7828054298642535, 'eval_accuracy': 0.997988391341701, 'eval_runtime': 56.2811, 'eval_samples_per_second': 4.478, 'eval_steps_per_second': 0.284, 'epoch': 9.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.005608788691461086, 'eval_precision': 0.7489177489177489, 'eval_recall': 0.8199052132701422, 'eval_f1': 0.7828054298642535, 'eval_accuracy': 0.997988391341701, 'eval_runtime': 53.8386, 'eval_samples_per_second': 4.681, 'eval_steps_per_second': 0.297, 'epoch': 10.0}
{'train_runtime': 12431.9179, 'train_samples_per_second': 1.822, 'train_steps_per_second': 0.114, 'train_loss': 0.0056380587984138815, 'epoch': 10.0}


TrainOutput(global_step=1420, training_loss=0.0056380587984138815, metrics={'train_runtime': 12431.9179, 'train_samples_per_second': 1.822, 'train_steps_per_second': 0.114, 'train_loss': 0.0056380587984138815, 'epoch': 10.0})

In [9]:
trainer.save_model("../models/age_token_classification")

# Test Model

## Single Post

In [10]:
with open('../reddit_api.json') as json_file:
    reddit_api_credentials = json.load(json_file)
    reddit_read_only = praw.Reddit(client_id=reddit_api_credentials['client_id'],
                                   client_secret=reddit_api_credentials['secret'],
                                   user_agent=reddit_api_credentials['user_agent']) 

subreddit = reddit_read_only.subreddit("AskDocs")

In [11]:
random_post = subreddit.random()
context = f"{random_post.title}\n{random_post.selftext}"
context

'Week-long swollen foot w/painful lump.\n37 y/o male. \n\nFirst noticed soreness on the top of my foot near my pinky toe last Tuesday (7/18). Slight swelling and redness of the area began the next day and has continued since. A small bump that I don’t *think* is a blister formed either Tuesday or Wednesday. The pain is concentrated around this spot/bump and extends to the bottom of my foot as well, making it hard to walk on without a limp. The spot itself feels hard and is quite sore when touched. Pain level fluctuates and all symptoms seem worse at night.\n\nNot sure if it’s a bite or something else. The swelling increases/decreases with activity level but hasn’t completely resided at all in a week unless I’ve taken Ibuprofen. \n\nIt still being pretty sore to walk on and still having slight swelling after an entire week makes me wonder if it’s time for an office visit or if this is one of those “Yeah, just keep taking Advil if it makes the swelling reside” situations?\n\nPhotos: http

In [12]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
age_extractor = pipeline("ner", model="../models/age_token_classification", tokenizer=tokenizer)
age_extractor(context)

[{'entity': 'B-age',
  'score': 0.74721,
  'index': 11,
  'word': '37',
  'start': 39,
  'end': 41},
 {'entity': 'B-age_unit',
  'score': 0.91090286,
  'index': 12,
  'word': 'y',
  'start': 42,
  'end': 43}]

## Entire Set

In [13]:
trainer.evaluate()

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.003824098501354456,
 'eval_precision': 0.7351778656126482,
 'eval_recall': 0.8815165876777251,
 'eval_f1': 0.8017241379310345,
 'eval_accuracy': 0.9980722083691302,
 'eval_runtime': 54.0072,
 'eval_samples_per_second': 4.666,
 'eval_steps_per_second': 0.296,
 'epoch': 10.0}