In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from datasets import DatasetDict, Dataset, load_dataset, concatenate_datasets
from dotenv import load_dotenv
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

from utils import calculate_accuracy_per_label, create_data

In [2]:
# Pull variables from a local .env file into the process environment, then
# look up the Hugging Face access token (None when the variable is not set).
load_dotenv()
token = os.environ.get("HUGGINGFACE_ACCESS_TOKEN")

In [3]:
# Load the 'Velkymoss/impact-cite_v0.11' sequence-classification checkpoint
# with two output labels, passing the access token read from the environment.
model = AutoModelForSequenceClassification.from_pretrained(
    'Velkymoss/impact-cite_v0.11',
    num_labels=2,
    token=token,
)

# NOTE(review): the tokenizer is loaded from "xlnet-base-cased" rather than
# from the model checkpoint — presumably the vocabularies match; confirm.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

# Put the model in evaluation mode; as the cell's last expression this also
# displays the module structure.
model.eval()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [4]:
def tokenize_seqs(examples):
    """Encode the ``'citation'`` texts of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``'citation'`` entry holding the text(s) to
            encode (a batch when called through ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for those texts, requested with padding,
        truncation, and PyTorch tensors.
    """
    # NOTE(review): no max_length is supplied; the recorded run warns that
    # truncation then defaults to off — confirm that this is intended.
    citations = examples['citation']
    encoded = tokenizer(
        citations,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded

In [5]:
# Build the evaluation frames; the arguments (70, 30) are the context sizes
# reported in the results cell (tokens of preceding / succeeding context).
df_dict = create_data(70, 30)

# For every frame: join context and footnote text into one 'citation' column,
# keep only the label and that text, and convert to a HuggingFace Dataset.
for split_name, frame in df_dict.items():
    frame['citation'] = frame['context'] + ' [Footnote] ' + frame['footnote_text']
    df_dict[split_name] = Dataset.from_pandas(frame.loc[:, ['Label', 'citation']])

dataset = DatasetDict(df_dict)

# Tokenize each split in batches and rename the label column to 'labels'.
test_data = (
    dataset
    .map(tokenize_seqs, batched=True)
    .rename_column('Label', 'labels')
)

# Merge all splits into a single Dataset for evaluation.
test_data = concatenate_datasets(test_data.values())

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/194 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

In [9]:
# Reference labels of the concatenated test set (the column renamed from 'Label').
labels = test_data['labels']

In [25]:
# Predict a class for every example in the test set, one sequence at a time.
#
# Fixes relative to the previous version:
#   * each iteration now feeds its own example to the model (the old loop
#     always used index 0, so every prediction was for the first example);
#   * the 'input_ids' column is read once instead of on every iteration,
#     which made the loop needlessly slow.
#
# NOTE(review): only input_ids are passed, although the sequences were padded
# during tokenization — consider also passing the attention mask; confirm.
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for input_ids in test_data['input_ids']:
        # Wrap the single sequence in a list to form a batch of size 1.
        logits = model(torch.tensor([input_ids])).logits
        # Index of the largest logit is the predicted class id.
        predictions.append(torch.argmax(logits, dim=-1).item())

KeyboardInterrupt: 

In [None]:
# Score the predictions against the reference labels.
# scikit-learn's signature is f1_score(y_true, y_pred), so the ground-truth
# labels go first.
f1 = f1_score(labels, predictions)

# NOTE(review): the argument order expected by this project helper is not
# visible here — confirm that (predictions, labels) is correct.
accuracy_label_0 = calculate_accuracy_per_label(predictions, labels, label_value=0)
accuracy_label_1 = calculate_accuracy_per_label(predictions, labels, label_value=1)

print("Results for 70 tokens as preceeding and 30 tokens as succeeding context:")
print("F1 score:", f1)
print("Accuracy for label 0:", accuracy_label_0)
print("Accuracy for label 1:", accuracy_label_1)

In [1]:
# Example data: one set of metric values and the list of context
# configurations (each a two-element tuple) to report.
metrics_dict = {'metric1': 0.95, 'metric2': 123, 'metric3': 42.5}
config = [(30, 20)]

# Specify the file path where you want to save the data
file_path = 'metrics_and_configurations.txt'

# Open the file in write mode (an existing file is overwritten)
with open(file_path, 'w') as file:
    # Header row naming the columns
    file.write("Metric, F1, Accuracy_label_0, Accuracy_label_1\n")
    # One row per configuration: "<first>/<second>" followed by the metric
    # values in dictionary order.
    # NOTE(review): the same metrics_dict is written for every configuration;
    # supply per-configuration metrics if more than one is listed.
    for config_tuple in config:
        values = ", ".join(str(value) for value in metrics_dict.values())
        file.write(f"Configuration {config_tuple[0]}/{config_tuple[1]}, {values}\n")

print(f"Data written to {file_path}")


Data written to metrics_and_configurations.txt
