## Tasks
#### Get the data ready for a PyTorch/Hugging Face transformer model
#### Split into model sets
#### Define a model
#### Train the model
#### Evaluate the model
#### Log model performance
#### Interpret the model

#### Load Data

In [None]:
from datasets import load_dataset

# Fetch the "train" and "test" splits of the IMDB dataset, in that order, and
# bind them to the names used by the rest of the notebook.
imdb_train, imdb_test = (
    load_dataset("imdb", split=split_name) for split_name in ("train", "test")
)

In [None]:
# Keep one tenth of each split so tokenization and training run on a smaller set.
# Dataset.shard returns a new dataset rather than modifying the receiver, so the
# result must be bound back to the names the later cells read.
# contiguous=False asks for a strided shard (every 10th row starting at row 0),
# so the subset does not depend on how the rows are ordered.
# NOTE(review): a contiguous slice of a label-sorted split would contain a single
# class; confirm the row ordering of the IMDB splits if you change this.
# NOTE: re-running this cell shards the already-sharded data again; re-run the
# loading cell first to start over from the full splits.
imdb_train = imdb_train.shard(num_shards=10, index=0, contiguous=False)
imdb_test = imdb_test.shard(num_shards=10, index=0, contiguous=False)

# Show both reduced splits as the cell output.
imdb_train, imdb_test

#### View Sample Data

#### Define the tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "distilbert-base-uncased" checkpoint, the same checkpoint
# name the classification model in this notebook is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased"
)

#### Tokenize the data

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    The tokenizer is called with ``truncation=True``; no padding is requested
    here. The tokenizer's return value is passed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply the tokenization function to each split in batched mode, train first
# and then test.
tokenized_imdb_train, tokenized_imdb_test = (
    split.map(preprocess_function, batched=True)
    for split in (imdb_train, imdb_test)
)

#### Create the data collator for more efficient processing

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the notebook's tokenizer; it is handed to the Trainer
# as its `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer)

#### Define evaluation metrics

In [None]:
import evaluate
import numpy as np

# Accuracy metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into two items: the model outputs and the
    reference labels. The outputs are reduced with an argmax along axis 1
    (presumably one score per class in each row -- confirm against the model)
    and the resulting class ids are scored against the labels.

    Returns the result of ``accuracy.compute``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Class-id -> label-name mapping, and its inverse derived from it so the two
# dictionaries cannot drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Sequence-classification model loaded from the "distilbert-base-uncased"
# checkpoint, configured with one output per entry in the label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

#### Train Model

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub.
    output_dir="my_awesome_model",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, then reload the best checkpoint when
    # training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Assemble the Trainer from the model, the run configuration, the tokenized
# splits and the batching/metric helpers defined in the earlier cells.
# NOTE(review): the tokenized test split is used as the evaluation set, and the
# run configuration reloads the best checkpoint at the end, so checkpoint
# selection sees the test data -- consider a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_imdb_train,
    eval_dataset=tokenized_imdb_test,
)

# Start fine-tuning; the call's return value is shown as the cell output.
trainer.train()

#### Model Interpretation

Using Shapley values to understand how the model classifies the text

In [None]:
import transformers
import shap

# Wrap the classifier trained above in a sentiment-analysis pipeline. Passing
# `model=`/`tokenizer=` explicitly matters: without them the pipeline loads the
# library's default sentiment checkpoint, and the explanation would describe
# that model instead of the one fine-tuned in this notebook.
# The pipeline gets its own name so the `model` variable keeps pointing at the
# fine-tuned classifier and the training cells can still be re-run.
# return_all_scores=True makes the pipeline report a score for every label, so
# the "POSITIVE" output can be selected below.
classifier = transformers.pipeline(
    'sentiment-analysis',
    model=trainer.model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

# explain the classifier on one sample input
explainer = shap.Explainer(classifier)
shap_values = explainer(["What a great movie! ...if you have no taste."])

# visualize the first prediction's explanation for the POSITIVE output class
shap.plots.text(shap_values[0, :, "POSITIVE"])