# Preparing for fine-tuning

## Tokenizing text

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from datasets import load_dataset 

# Keep one of four shards of each IMDB split so the example stays small.
train_data = load_dataset("imdb", split="train")
train_data = train_data.shard(num_shards=4, index=0)
test_data = load_dataset("imdb", split="test")
test_data = test_data.shard(num_shards=4, index=0)

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data.
# The tokenizer must receive a list of strings (one per review). Wrapping the
# column in str() would turn the entire column into ONE string, producing a
# single encoded sequence instead of one sequence per example.
tokenized_training_data = tokenizer(
    list(train_data["text"]),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)
tokenized_test_data = tokenizer(
    list(test_data["text"]),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)

print(tokenized_training_data)
print(tokenized_test_data)

## Mapping tokenization

In [None]:
# Tokenization helper applied to whole batches of rows
def tokenize_function(data):
    """Tokenize the ``"text"`` entry of *data* with the notebook's tokenizer.

    Sequences are padded, truncated to at most 64 tokens, and returned as
    PyTorch tensors.
    """
    encoding_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 64,
    }
    return tokenizer(data["text"], **encoding_options)


# batched=True hands the function many rows per call.
tokenized_in_batches = train_data.map(tokenize_function, batched=True)

print(tokenized_in_batches)

In [None]:
# Complete the function
def tokenize_function(data):
    """Tokenize the ``"text"`` field of a single dataset row.

    Returns the tokenizer output as plain Python lists, truncated to at most
    64 tokens, so that ``Dataset.map`` stores one flat token sequence per row.
    """
    # No return_tensors here: for a single string the tokenizer would return
    # tensors with a leading batch axis of size 1, and map() would then store
    # every row as a nested [[...]] sequence instead of a flat one.
    return tokenizer(data["text"],
                     padding=True,
                     truncation=True,
                     max_length=64)


# Tokenize row by row
tokenized_by_row = train_data.map(tokenize_function, batched=False)

print(tokenized_by_row)

# Fine-tuning through training

## Setting up training arguments

In [None]:
from transformers import Trainer,  TrainingArguments

# Configure the fine-tuning run; outputs are written under ./finetuned.
training_args = TrainingArguments(
    output_dir="./finetuned",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Run evaluation once per epoch
    eval_strategy="epoch",
)

## Setting up the trainer

In [None]:
# Trainer expects datasets that can be indexed per example and that carry the
# labels alongside the encoded inputs. A raw tokenizer output (BatchEncoding)
# provides neither, so build the datasets by mapping the tokenizer over the
# IMDB shards; map() adds the token columns next to the existing "label".
tokenized_train_dataset = train_data.map(tokenize_function, batched=True)
tokenized_eval_dataset = test_data.map(tokenize_function, batched=True)

# Set up the trainer object
trainer = Trainer(
    model=model,
    # Assign the training arguments and tokenizer
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

## Using the fine-tuned model

In [None]:
import torch

input_text = ["I'd just like to say, I love the product! Thank you!"]

# Tokenize the new data
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
# Training may have moved the model to an accelerator; keep inputs on the
# same device as the model to avoid a device-mismatch error.
inputs = inputs.to(model.device)

# Switch to evaluation mode so dropout is disabled during inference.
model.eval()

# Pass the tokenized inputs through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract the new predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# IMDB class ids: 0 = negative review, 1 = positive review.
label_map = {0: "Negative", 1: "Positive"}
for i, (text, predicted_label) in enumerate(zip(input_text, predicted_labels), start=1):
    sentiment_label = label_map[predicted_label]
    print(f"\n Input Text {i}: {text}")
    print(f"Predicted Label: {sentiment_label}")