In [3]:
import torch

In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [5]:
from datasets import load_dataset
# Load a sample dataset 
dataset = load_dataset("json", data_files="./arxiv_dataset.json")

In [6]:
# Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [7]:
print(dataset.keys())  # Shows train, validation, test splits
print(dataset["train"][0])  # Displays a sample paper

dict_keys(['train'])
{'id': '0704.0001', 'submitter': 'Pavel Nadolsky', 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan", 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies', 'comments': '37 pages, 15 figures; published version', 'journal-ref': 'Phys.Rev.D76:013009,2007', 'doi': '10.1103/PhysRevD.76.013009', 'report-no': 'ANL-HEP-PR-07-12', 'categories': 'hep-ph', 'license': None, 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [8]:
def preprocess_data(examples):
    return {"text": examples["title"] + " " + examples["abstract"]}

tokenized_dataset = dataset.map(preprocess_data, remove_columns=["authors", "categories", "update_date"])


Map:   0%|          | 0/2700231 [00:00<?, ? examples/s]

In [9]:
# Reduce dataset size (e.g., use only the first 1000 samples)
reduced_dataset = tokenized_dataset["train"].select(range(100))

In [10]:
# Load pre-trained GPT-2 model & tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Add padding token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize the dataset
tokenized_dataset = tokenized_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=512))


Map:   0%|          | 0/2700231 [00:00<?, ? examples/s]

In [11]:
print(tokenized_dataset["train"][0])

{'id': '0704.0001', 'submitter': 'Pavel Nadolsky', 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies', 'comments': '37 pages, 15 figures; published version', 'journal-ref': 'Phys.Rev.D76:013009,2007', 'doi': '10.1103/PhysRevD.76.013009', 'report-no': 'ANL-HEP-PR-07-12', 'license': None, 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Prediction

In [12]:
 # Tokenize input text
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

In [13]:
# output = model.generate(input_ids, max_length=100, temperature=0.7, top_k=50, top_p=0.95)
# print(tokenizer.decode(output[0], skip_special_tokens=True))
#### This is also good but issue following warning and randomness


In [14]:
# Create attention mask
attention_mask = torch.ones(input_ids.shape)

output = model.generate(
    input_ids,
    attention_mask=attention_mask,  # Add attention mask
    max_length=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True  # Enable sampling
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, we made the decision to end the experiment. I think there is more to this story than that, but what I can tell you is that we are going to keep going.

But what we did not know was that the other people in the room had seen us. It was very unsettling. And this is not a case of the people who saw us. It was a case of the people who were inside.

On the night of March 8, 1997,


In [15]:
# Split the dataset into train and validation
split_dataset = reduced_dataset.train_test_split(test_size=0.2)  # Adjust test_size as needed
# sample_size = max(1, int(len(dataset['train']) * 0.00025))  # Ensure at least 1 sample
# dataset['train'] = dataset['train'].shuffle(seed=42).select(range(sample_size))

In [16]:
from datasets import DatasetDict

In [17]:
# Create a DatasetDict
tokenized_datasets = DatasetDict({
    "train": split_dataset["train"],
    "validation": split_dataset["test"],
})

In [17]:
# !pip install accelerate

In [18]:
import accelerate
print(accelerate.__version__)

1.5.2


In [27]:
# Check for GPU availability
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [28]:
model = GPT2LMHeadModel.from_pretrained(model_name)

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [31]:
# Data collator to format batches
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # MLM=False means normal autoregressive training
)

In [32]:
# Training Arguments (Without device)
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8 if torch.cuda.is_available() else 2,
    gradient_accumulation_steps=4 if not torch.cuda.is_available() else 1,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    fp16_opt_level="O1",
    dataloader_num_workers=4,
    dataloader_pin_memory=torch.cuda.is_available(),
    max_steps=10000,
    remove_unused_columns=False,

)
############ Training Arguments#########
# training_args = TrainingArguments(
#     output_dir="./gpt2-finetuned",
#     overwrite_output_dir=True,
#     num_train_epochs=3,
#     per_device_train_batch_size=2,
#     save_steps=10_000,
#     save_total_limit=2,
# )
# #######Training Arguments#######
# training_args = TrainingArguments(
#     output_dir="./gpt2-finetuned",
#     overwrite_output_dir=True,
#     num_train_epochs=3,
#     per_device_train_batch_size=2,
#     save_steps=10_000,
#     save_total_limit=2,
#     eval_strategy="epoch",  # Evaluate at the end of each epoch
#     logging_dir="./logs",
#     logging_steps=100,
# )

In [23]:
# Split the dataset into train and validation
# split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2)  # Adjust test_size as needed

# # Create a DatasetDict
# tokenized_datasets = DatasetDict({
#     "train": split_dataset["train"],
#     "validation": split_dataset["test"],
# })
###Creating issue during next exection of next cell

In [33]:
# Trainer
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [25]:
# !pip install Cuda 

ERROR: Could not find a version that satisfies the requirement Cuda (from versions: none)
ERROR: No matching distribution found for Cuda


In [2]:
# print(len(dataset))
import torch
print(torch.cuda.is_available())

True


In [34]:
# Train the model
trainer.train()

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\kb290\anaconda3\envs\py310_torch\lib\site-packages\torch\utils\data\_utils\worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "C:\Users\kb290\anaconda3\envs\py310_torch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "C:\Users\kb290\anaconda3\envs\py310_torch\lib\site-packages\transformers\data\data_collator.py", line 46, in __call__
    return self.torch_call(features)
  File "C:\Users\kb290\anaconda3\envs\py310_torch\lib\site-packages\transformers\data\data_collator.py", line 1013, in torch_call
    batch = pad_without_fast_tokenizer_warning(
  File "C:\Users\kb290\anaconda3\envs\py310_torch\lib\site-packages\transformers\data\data_collator.py", line 67, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "C:\Users\kb290\anaconda3\envs\py310_torch\lib\site-packages\transformers\tokenization_utils_base.py", line 3324, in pad
    raise ValueError(
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['id', 'submitter', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'license', 'abstract', 'versions', 'authors_parsed', 'text']
