In [1]:
from transformers import DataCollatorForLanguageModeling

In [1]:
from datasets import load_dataset
import torch

In [2]:
# Load dataset
dataset = load_dataset("json", data_files="./arxiv_dataset.json")

In [3]:
# Preprocess: Join title + abstract
def preprocess_data(example):
    return {"text": example["title"] + " " + example["abstract"]}

processed_dataset = dataset["train"].map(preprocess_data)

Map:   0%|          | 0/2700231 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [5]:
# Load tokenizer & model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Tokenize
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/2700231 [00:00<?, ? examples/s]

In [9]:
# Reduce dataset size for demo/training
reduced_dataset = tokenized_dataset.select(range(100))

In [10]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
from transformers import DataCollatorForLanguageModeling

In [12]:
# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
from transformers import  Trainer, TrainingArguments

In [14]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./distilgpt2-arxiv",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    logging_steps=50
)

In [15]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reduced_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [16]:
# Train
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,4.1365
100,3.7604
150,3.6162


TrainOutput(global_step=150, training_loss=3.8377115885416666, metrics={'train_runtime': 287.4735, 'train_samples_per_second': 1.044, 'train_steps_per_second': 0.522, 'total_flos': 39194512588800.0, 'train_loss': 3.8377115885416666, 'epoch': 3.0})

In [17]:
# Generate sample text
input_text = "Recent research in deep learning"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

In [18]:
output = model.generate(
    input_ids,
    max_length=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [19]:

print("\n📜 Generated Text:\n")
print(tokenizer.decode(output[0], skip_special_tokens=True))


📜 Generated Text:

Recent research in deep learning has shown that the neural network of neural networks is not the same as the neural network of the neural network of the open source network.
However, some research on neural networks is being carried out on a larger scale.
It has been shown that a network of neural networks is not the same as the neural network of the open source network of the open source network of the open source network of the open source network of the open source network of the open source network of the


In [20]:
model.save_pretrained("./distilgpt2-finetuned-arxiv")
tokenizer.save_pretrained("./distilgpt2-finetuned-arxiv")

('./distilgpt2-finetuned-arxiv\\tokenizer_config.json',
 './distilgpt2-finetuned-arxiv\\special_tokens_map.json',
 './distilgpt2-finetuned-arxiv\\vocab.json',
 './distilgpt2-finetuned-arxiv\\merges.txt',
 './distilgpt2-finetuned-arxiv\\added_tokens.json',
 './distilgpt2-finetuned-arxiv\\tokenizer.json')

In [1]:
eval_dataset = tokenized_dataset.select(range(100, 120))  # or pick a different slice

NameError: name 'tokenized_dataset' is not defined

In [2]:
with open("requirements.txt", "w") as f:
    f.write("""transformers==4.39.3
datasets==2.18.0
torch==2.2.2
sacrebleu==2.4.0
rouge-score==0.1.2
numpy==1.26.4
scikit-learn==1.4.1
tqdm==4.66.2
matplotlib==3.8.4
pandas==2.2.2
jupyterlab==4.1.5
""")

In [5]:
with open("README.md", "w") as f:
    f.write("""
    # DistilGPT2 arXiv Abstract Generator 

This project explores fine-tuning DistilGPT2 to generate scientific abstracts using a curated subset of the arXiv dataset.

## Project Structure
- `fine_tune.py`: Training script using Hugging Face Transformers
- `evaluation.py`: Evaluation script with BLEU, ROUGE, and Perplexity
- `paper/`: Contains the research paper (`.md` and `.pdf`)
- `outputs/`: Generated abstract samples
- `requirements.txt`: Python dependencies

##  Results
- Final Training Loss: ~3.83
- Validation Perplexity: ~33.91
- BLEU/ROUGE scores included in the paper

##  Setup
```bash
pip install -r requirements.txt

""")