In [1]:
# Install necessary packages
!pip install datasets torch accelerate
!pip install langchain huggingface_hub





In [2]:
!pip uninstall tensorflow keras



In [3]:
!pip install torch



In [4]:
!pip install transformers[torch] datasets torch accelerate



In [5]:
!pip install --upgrade langchain
!pip install langchain-community

Requirement already up-to-date: langchain in /root/anaconda3/lib/python3.8/site-packages (0.2.16)




In [6]:
!pip install --upgrade regex

Requirement already up-to-date: regex in /root/anaconda3/lib/python3.8/site-packages (2024.9.11)


In [7]:
import transformers
import torch

In [8]:
print(transformers.__version__)
print(torch.__version__)

4.44.2
2.4.1+cu121


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from langchain import HuggingFacePipeline
from transformers import pipeline

In [10]:
# Define the prompt to query both before and after fine-tuning
#prompt = "Tell me about Piscataway"
#prompt = "What are some streets in Piscataway?"
prompt = "In Piscataway, what are the names of the streets?"

In [11]:
# Step 1: Load the pre-trained model and tokenizer
model_name = "distilgpt2"  # Lightweight GPT-2 model
model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")  # Load model to CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [12]:
# Set the padding token to eos_token or add a new pad token
tokenizer.pad_token = tokenizer.eos_token  # or use tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [13]:
# Step 2: Query the pre-trained model before fine-tuning
print("=== Querying pre-trained model (before fine-tuning) ===")
generator_before = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)  # Force CPU usage
llm_before = HuggingFacePipeline(pipeline=generator_before)
output_before = llm_before(prompt)
print("Output before fine-tuning:", output_before)

  llm_before = HuggingFacePipeline(pipeline=generator_before)
  output_before = llm_before(prompt)


=== Querying pre-trained model (before fine-tuning) ===
Output before fine-tuning: In Piscataway, what are the names of the streets?


"I have no idea what they are going to have. There's nothing," says Aloniz, who lives in the neighbourhood of Piscataway before moving


In [14]:
# Step 3: Prepare the custom dataset for fine-tuning
dataset = load_dataset("json", data_files="llm_tune_data.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [15]:
# Display the dataset
print("Dataset Loaded:\n", dataset)

Dataset Loaded:
 DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 6
    })
})


In [16]:
# Inspect the first few entries in the dataset
print("First entry in the dataset:", dataset['train'][0])

First entry in the dataset: {'text': 'Piscataway is a township in New Jersey with streets named xcv_124 and yhgd_543.'}


In [17]:
print("\nAll elements in the dataset:")
for i, data in enumerate(dataset['train']):
    print(f"Entry {i + 1}: {data}")


All elements in the dataset:
Entry 1: {'text': 'Piscataway is a township in New Jersey with streets named xcv_124 and yhgd_543.'}
Entry 2: {'text': 'In Piscataway, you can find streets such as xcv_124 and yhgd_543.'}
Entry 3: {'text': 'Some notable streets in Piscataway include xcv_124 and yhgd_543.'}
Entry 4: {'text': 'The township of Piscataway in New Jersey has streets like xcv_124 and yhgd_543.'}
Entry 5: {'text': 'Piscataway features streets such as xcv_124 and yhgd_543, known for their unique names.'}
Entry 6: {'text': 'xcv_124 and yhgd_543 are two streets located in Piscataway.'}


In [18]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

In [19]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 6
    })
})

In [21]:
# Set up data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [22]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    learning_rate=5e-5,  # Smaller learning rate for better fine-tuning on small data
    save_steps=10,
    save_total_limit=2,
)

In [23]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [24]:
print("=== Fine-tuning the model ===")
trainer.train()

=== Fine-tuning the model ===


Step,Training Loss


Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}


TrainOutput(global_step=20, training_loss=2.780232048034668, metrics={'train_runtime': 12.3089, 'train_samples_per_second': 4.875, 'train_steps_per_second': 1.625, 'total_flos': 388372709376.0, 'train_loss': 2.780232048034668, 'epoch': 10.0})

In [25]:
# Save the fine-tuned model
trainer.save_model("./fine_tuned_distilgpt2")

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


In [26]:
# Step 5: Query the fine-tuned model after training
print("=== Querying fine-tuned model (after fine-tuning) ===")
generator_after = pipeline("text-generation", model="./fine_tuned_distilgpt2", tokenizer=tokenizer, device=-1)  # Load fine-tuned model to CPU
llm_after = HuggingFacePipeline(pipeline=generator_after)
output_after = llm_after(prompt)
print("Output after fine-tuning:", output_after)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== Querying fine-tuned model (after fine-tuning) ===
Output after fine-tuning: In Piscataway, what are the names of the streets?




































