In [32]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from datasets import DatasetDict, Dataset, load_dataset, concatenate_datasets
from dotenv import load_dotenv
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

from utils import create_data

In [33]:
# Load environment variables through python-dotenv, then look up the
# Hugging Face access token; `token` is None when the variable is not set.
load_dotenv()
token = os.environ.get("HUGGINGFACE_ACCESS_TOKEN")

In [34]:
# Fine-tuned two-label sequence classifier, fetched with the access token
# read from the environment in the previous cell.
model = AutoModelForSequenceClassification.from_pretrained(
    'Velkymoss/impact-cite_v0.11',
    num_labels=2,
    token=token,
)
# NOTE(review): the tokenizer is loaded from the base "xlnet-base-cased"
# checkpoint rather than from the fine-tuned repo -- confirm both use the
# same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Put the model in evaluation mode for inference. As the last expression of
# the cell, this call also displays the module structure below.
model.eval()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [35]:
def tokenize_seqs(examples):
    """Tokenize the ``citation`` entries of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``'citation'`` key holding the text(s) to
            encode (used below as the batched callback of ``dataset.map``).

    Returns:
        The tokenizer's output for those texts, requested with padding and
        truncation enabled and PyTorch tensors (``return_tensors="pt"``).
    """
    citations = examples['citation']
    encoded = tokenizer(
        citations,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded

In [58]:
# NOTE(review): the meaning of the two arguments (70, 30) is defined in
# utils.create_data -- confirm there before changing them.
df_dict = create_data(70, 30)

# For every frame returned by create_data: build the model input text, keep
# only the label and text columns, and convert the frame to a HuggingFace
# Dataset (the dict entry is replaced in place).
for key, frame in df_dict.items():
    # concatenate context and footnote text
    frame['citation'] = frame['context'] + ' [Footnote] ' + frame['footnote_text']
    # select relevant columns
    # NOTE(review): from_pandas may carry a non-default pandas index along as
    # an extra column -- check the resulting column names if that matters.
    df_dict[key] = Dataset.from_pandas(frame.loc[:, ['Label', 'citation']])

# bundle the per-key datasets into a single DatasetDict
dataset = DatasetDict(df_dict)

# tokenize data, then rename the 'Label' column to 'labels'
tokenized_data = dataset.map(tokenize_seqs, batched=True).rename_column('Label', 'labels')

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/194 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

In [42]:
# TODO Loop over all rows in the dataset, get the prediction and store it to later calculate the F1 score and accuracy per class
# TODO Calculate the F1 score and accuracy per class 
# TODO Plot the per-class accuracy

# TODO Repeat this process for a number of different configurations of numbers of tokens as context around the citation
# TODO Store the calculated results in a file. Store the plots as well

In [43]:
# If you get stuck using the Transformers library, check out the documentation here: https://huggingface.co/transformers