In [7]:
# Ensure that the following packages are installed
!pip install transformers datasets tqdm torch transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate>=0.20.2 (from transformers)
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, default_data_collator 
from datasets import load_dataset
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('codeparrot/codeparrot', use_fast=True)
# Reuse the BOS token as the padding token so `padding='max_length'` can be
# used during tokenization (presumably the checkpoint ships without a
# dedicated pad token -- verify against the tokenizer config).
tokenizer.pad_token = tokenizer.bos_token

# Prefer the GPU when one is visible, but fall back to the CPU instead of
# raising on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained('codeparrot/codeparrot').to(device)

# Load 'codeparrot/codeparrot-clean-valid' Dataset
dataset = load_dataset('codeparrot/codeparrot-clean-valid')

def tokenize_function(examples):
    """Tokenize the ``content`` field of a batch of examples.

    Every sequence is truncated to at most 1024 tokens and padded up to that
    same length, and the attention mask is returned alongside the token ids.

    Args:
        examples: A batch (as passed by ``Dataset.map(..., batched=True)``)
            exposing a ``"content"`` entry with the raw text to encode.

    Returns:
        The tokenizer output for the batch.
    """
    encoding_options = dict(
        truncation=True,
        max_length=1024,
        padding='max_length',
        return_attention_mask=True,
    )
    encoded_batch = tokenizer(examples["content"], **encoding_options)
    return encoded_batch

# Tokenize every split and drop the raw text column once it has been encoded.
valid_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])
# Only expose the tensors that are fed to the model.
valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])


def causal_lm_data_collator(features):
    """Collate examples and attach causal-LM labels so a loss can be computed.

    The tokenized dataset only provides ``input_ids`` and ``attention_mask``.
    Without a ``labels`` entry the Trainer's evaluation loop has no loss to
    report, so ``eval_loss`` would be missing from the returned metrics.

    Args:
        features: List of per-example dicts holding ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        The batch produced by ``default_data_collator`` with an extra
        ``labels`` tensor: a copy of ``input_ids`` in which every padded
        position (attention mask == 0) is set to -100, the index ignored by
        the cross-entropy loss.
    """
    batch = default_data_collator(features)
    labels = batch["input_ids"].clone()
    # The pad token is the BOS token, so mask by attention mask rather than by
    # token id; this keeps genuine BOS tokens in the loss.
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


# Set data_collator
data_collator = causal_lm_data_collator

# Initializing Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Perform validation
output = trainer.evaluate(valid_dataset['train'])
print(f'Validation Loss: {output["eval_loss"]}')

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-28 21:27:34,651] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Found cached dataset json (/home/picocreator/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 277.49it/s]
Loading cached processed dataset at /home/picocreator/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2227d9c9dce343a5.arrow
