In [None]:
import csv
import os
import re

import datasets
import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    Trainer, TrainingArguments

In [None]:
# Word pair to discriminate: prefixes cut before w1 get label 0, prefixes cut before w2 get label 1.
w1 = 'lead'
w2 = 'guide'

# Data preparation.
num_proc = 20      # worker processes passed to the datasets filter/map calls
seed = 1234        # shuffle seed, also handed to TrainingArguments
val_prop = 0.01    # fraction of the combined dataset held out for validation
test_prop = 0.2    # fraction of the combined dataset held out for testing

# Model / optimisation.
max_len = 256      # tokenizer max_length; sequences are padded/truncated to it
batch_size = 8     # per-device batch size for both training and evaluation
gradient_accumulation_steps = 4
label_smoothing_factor = 0.
# Fall back to CPU when no CUDA device is visible, so `model.to(device)` does not crash.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'microsoft/deberta-base'

In [None]:
# Load a single JSONL shard with the generic 'json' loader.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
ds = datasets.load_dataset('json', data_files='/home/ryan/optout/gpt-neox/pile/first_shard/data/00_45e8.jsonl')

In [None]:
# Keep only the 'train' entry of the loaded dataset; the following cells filter it directly.
# NOTE: this rebinds `ds`, so re-running the cell without reloading will fail.
ds = ds['train']

In [None]:
def _contains_word(word):
    """Build a predicate that is true when ' <word> ' (space on both sides) occurs in x['text']."""
    needle = f' {word} '

    def predicate(x):
        return needle in x['text']

    return predicate

# One filtered dataset per target word; a document containing both words lands in both.
w1_ds = ds.filter(_contains_word(w1), num_proc=num_proc, keep_in_memory=True)
w2_ds = ds.filter(_contains_word(w2), num_proc=num_proc, keep_in_memory=True)

In [None]:
# Drop the reference to the full shard now that only the two filtered subsets are needed.
del ds

In [None]:
# Number of examples kept for each word (w1 first, then w2).
len(w1_ds), len(w2_ds)

In [None]:
# Attach the class label: 0 for every w1 example, 1 for every w2 example.
w1_ds = w1_ds.add_column(name='label', column=[0] * len(w1_ds))
w2_ds = w2_ds.add_column(name='label', column=[1] * len(w2_ds))

In [None]:
# Reduce every example to the text that comes before its target word.
def prefix_only(x):
    """Keep only the text preceding the first space-delimited target word.

    The target is `w1` when x['label'] is 0 and `w2` otherwise. The 'label'
    and 'meta' fields are passed through unchanged.
    """
    target = w1 if x['label'] == 0 else w2
    cut = x['text'].find(f' {target} ')
    # str.find returns -1 when the word is absent, which would only drop the last
    # character; the upstream filter keeps only texts that contain the word.
    return {'text': x['text'][:cut], 'label': x['label'], 'meta': x['meta']}


w1_ds = w1_ds.map(
    prefix_only,
    keep_in_memory=True,
)
w2_ds = w2_ds.map(
    prefix_only,
    keep_in_memory=True,
)

In [None]:
# Merge both classes, then shuffle with the fixed seed so the positional splits below are reproducible.
combined_ds = (
    datasets.concatenate_datasets([w1_ds, w2_ds])
    .shuffle(seed=seed, keep_in_memory=True)
)

In [None]:
# Test split: the first test_prop fraction of the shuffled rows.
test_cutoff = int(len(combined_ds) * test_prop)
test_ds = combined_ds.select(range(test_cutoff))

In [None]:
# Validation split: the next val_prop fraction of the shuffled rows, right after the test rows.
val_cutoff = int(val_prop * len(combined_ds))
val_ds = combined_ds.select(range(test_cutoff, test_cutoff+val_cutoff))

In [None]:
# Training split: every remaining row after the test and validation ranges.
train_ds = combined_ds.select(range(test_cutoff+val_cutoff, len(combined_ds)))

In [None]:
# Display the three splits to check their sizes and columns.
train_ds, val_ds, test_ds

In [None]:
# Number of trailing characters of a prefix used as its leakage fingerprint.
window_size = 20


def _tail(text):
    """Return the last `window_size` characters of `text` (all of it when shorter)."""
    return text[max(0, len(text) - window_size):]


# Trailing snippets of every test prefix, stored as a set for fast membership checks.
concatenated_test = {_tail(row['text']) for row in test_ds}


def check_in_test(x):
    """Return True when the tail of x['text'] does NOT match any test tail.

    Despite the name, a True result means "keep this training example".
    """
    return _tail(x['text']) not in concatenated_test


# Remove training examples whose ending matches a test example, then report how many were dropped.
filtered_train_ds = train_ds.filter(
    check_in_test,
    num_proc=num_proc,
    keep_in_memory=True,
)
print(filtered_train_ds, train_ds, len(train_ds)-len(filtered_train_ds))

In [None]:
# Load the tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Truncate from the left so over-long prefixes keep their ending, i.e. the text nearest the cut word.
tokenizer.truncation_side = 'left'

In [None]:
def tokenize_function(examples):
    """Tokenize a batch's "text" column, padding/truncating every sequence to `max_len` tokens."""
    encoded = tokenizer(
        examples["text"],
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Shared options for the batched, multi-process tokenization of each split.
tokenized_train_ds = filtered_train_ds.map(
    tokenize_function,
    batched=True,
    num_proc=num_proc,
    keep_in_memory=True,
)
tokenized_val_ds = val_ds.map(
    tokenize_function,
    batched=True,
    num_proc=num_proc,
    keep_in_memory=True,
)

In [None]:
# Load the pretrained checkpoint configured for two labels (label 0 = w1, label 1 = w2).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Move the model to the configured device; as the last expression this also displays the model.
model.to(device)

In [None]:
# Training configuration: one epoch, no checkpoints, evaluation every 200 steps.
training_args = TrainingArguments(
    # Run identity and output locations.
    run_name=f'run_{w1}_{w2}',
    output_dir='./hf_output_dir',
    logging_dir='./logs',
    # Optimisation.
    seed=seed,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    label_smoothing_factor=label_smoothing_factor,
    # Logging, evaluation and checkpointing cadence.
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy='no',
)

In [None]:
# Metric callback handed to the Trainer.
def compute_metrics(pred):
    """Return {"accuracy": ...} comparing the argmax over the last axis of
    `pred.predictions` against `pred.label_ids`."""
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Create the Trainer: trains on the leakage-filtered, tokenized training split and
# evaluates on the tokenized validation split, reporting accuracy via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    compute_metrics=compute_metrics
)

In [None]:
# Start the W&B run, then record which word pair it covers and how many examples each word has.
# NOTE(review): these run-level constants are logged as metrics via wandb.log;
# wandb.config may be a better fit — confirm before changing.
wandb.init(project='propensity_scoring')
wandb.log({'w1' : w1, 'w2': w2})
wandb.log({'w1_size' : len(w1_ds), 'w2_size': len(w2_ds)})

In [None]:
# Fine-tune the model using the settings in training_args
# (a single epoch, with evaluation on the validation split every 200 steps).
trainer.train()

In [None]:
# Tokenize the held-out test split with the same function used for the train and validation splits.
tokenized_test_ds = test_ds.map(tokenize_function, num_proc=num_proc, batched=True, keep_in_memory=True)

In [None]:
# Run inference on the test split; `output.predictions` and `output.metrics` are read in later cells.
output = trainer.predict(tokenized_test_ds)
output

In [None]:
# Log test accuracy next to a constant-prediction baseline.
# NOTE(review): 'one_class_accuracy' is the share of w1 (label 0) examples in the whole combined
# dataset, i.e. the accuracy of always predicting w1 there — it is not computed on the test split
# and is not the majority-class baseline when w2 is the larger class. Confirm this is intended.
wandb.log({'test_accuracy': output.metrics['test_accuracy'], 'one_class_accuracy': len(w1_ds)/len(combined_ds)})

In [None]:
# Convert the raw class logits into the probability of label 1 (w2) for every test example.
predictions = output.predictions
# Numerically stable softmax: subtracting each row's maximum leaves the probabilities
# unchanged but keeps np.exp from overflowing to inf (and the ratio from becoming nan)
# when logits are large.
shifted = predictions - predictions.max(axis=-1, keepdims=True)
exp_shifted = np.exp(shifted)
denom = exp_shifted.sum(axis=-1)
e_scores = exp_shifted[:, 1] / denom

In [None]:
# One row per test example: the prefix text, its true label, and the predicted probability of label 1.
df = pd.DataFrame({'prefix': test_ds['text'], 'label': test_ds['label'], 'e(x)': e_scores})
# Create the output directory first so a missing 'results/' folder cannot lose the run's results.
os.makedirs('results', exist_ok=True)
df.to_csv(f'results/test_{w1}_{w2}.csv')