In [None]:
!pip install transformers peft datasets accelerate -q

In [None]:
# Sanity check: make sure every package installed above can be imported.
# The module names bound here stay available to later cells.
try:
    import transformers
    import peft
    import datasets
    import accelerate
except ImportError as exc:
    print(f"An error occurred: {exc}")
else:
    print("All libraries verified, installed and imported successfully!")

All libraries verified, installed and imported successfully!


In [None]:
# Pull the model, tokenizer, dataset and training entry points into the namespace.
try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
    )
    from datasets import load_dataset
    from transformers import (
        TrainingArguments,
        Trainer,
    )
except ImportError as exc:
    print(f"An error occurred: {exc}")
else:
    print("Libraries imported succssfully")

Libraries imported succssfully


In [None]:
# Load the pretrained tokenizer and causal language model.
# The checkpoint id lives in its own name so that `model` only ever refers to the
# loaded model object; previously a failed load left `model` bound to the string
# 'distilgpt2', which made later cells fail with a misleading error.
MODEL_NAME = 'distilgpt2'
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    print('Model loaded successfully')
except Exception as e:
    print(f"An error occurred: {e}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model loaded successfully


In [None]:
# Ask the user for the text the model should continue.
try:
    prompt = input("Enter your prompt: ")
    print("Prompt entered successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Enter your prompt: India
Prompt entered successfully


In [None]:
# Turn the prompt string into a PyTorch tensor of token ids.
try:
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    print("Input tokenized successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Input tokenized successfully


In [None]:
# Generate a continuation of the prompt and print the decoded text.
try:
    output = model.generate(
        input_ids,
        # The prompt is a single, unpadded sequence, so every position is a real
        # token. Passing the mask explicitly avoids the "attention mask ... not set"
        # warning and keeps generation well defined.
        attention_mask=input_ids.new_ones(input_ids.shape),
        # The tokenizer has no pad token at this point; generate() would fall back to
        # EOS anyway, so state it explicitly instead of relying on the fallback.
        pad_token_id=tokenizer.eos_token_id,
        max_length=500,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    print(decoded_output)
    print('Text generated successfully')
except Exception as e:
    print(f"An error occurred: {e}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


India The Indian government has announced that it will launch a new initiative to combat terrorism in the country.

The initiative will be launched in a bid to tackle terrorism, the Indian Ministry of External Affairs said in its official statement. The initiative is aimed at combating terrorism and will also be aimed towards combating the spread of terrorism. It will target the most common terrorist groups, including the Taliban, Lashkar-e-Taiba, and the Lashkhar-i-Shia militant group.
Text generated successfully


# Fine-Tuning of Transformers


In [None]:
# Read the jokes from the JSON file uploaded to the Colab workspace.
try:
    dataset = datasets.load_dataset("json", data_files="/content/programming_jokes.json")
    print("Dataset loaded successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Dataset loaded successfully


In [None]:
# Display the dataset summary (available splits, feature names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [None]:
# Preview the first ten training examples.
try:
    for row_idx in range(10):
        print(dataset["train"][row_idx])
    print("\nJokes printed successfully")
except Exception as err:
    print(f"An error occurred: {err}")

{'text': 'Why do programmers prefer dark mode? Because light attracts bugs!'}
{'text': 'Why do programmers hate nature? It has too many bugs.'}
{'text': 'Why was the programmer always calm? Because they had a good cache.'}
{'text': 'Why did the programmer go broke? They lost their domain in a bidding war.'}
{'text': "Why do Java developers wear glasses? Because they can't C#."}
{'text': 'Why was the function sad? It didn’t get any arguments.'}
{'text': 'Why did the programmer quit their job? They didn’t get arrays.'}
{'text': 'Why do programmers love coffee? Because it keeps them from being in sleep mode.'}
{'text': 'What’s a programmer’s favorite type of music? Algorithms and blues.'}
{'text': 'Why did the programmer get stuck in the shower? The instructions said: lather, rinse, repeat.'}

Jokes printed successfully


In [None]:
# Reuse the EOS token for padding when the tokenizer does not define a pad token.
try:
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer padded successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Tokenizer padded successfully


In [None]:
def _mask_padding(token_ids, attention_mask):
    """Copy ``token_ids``, replacing positions whose mask is 0 with -100."""
    return [tok if keep else -100 for tok, keep in zip(token_ids, attention_mask)]


def tokenizer_function(examples):
    """Tokenize ``examples['text']`` and attach causal-LM training labels.

    Each text is truncated/padded to exactly 128 tokens. ``labels`` mirror
    ``input_ids`` except at padded positions (``attention_mask == 0``), which are
    set to -100 so they are excluded from the loss. Without this, padding (which
    shares its id with EOS here) dominates the loss of these short texts.

    Args:
        examples: mapping with a ``'text'`` entry holding either one string or a
            list of strings (the latter when used with ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output with an added ``'labels'`` entry shaped like
        ``'input_ids'``.
    """
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    token_ids = inputs['input_ids']
    masks = inputs['attention_mask']
    if token_ids and isinstance(token_ids[0], (list, tuple)):
        # Batched call: one id list and one mask list per example.
        inputs['labels'] = [_mask_padding(ids, mask) for ids, mask in zip(token_ids, masks)]
    else:
        # Single example (or empty batch): flat lists.
        inputs['labels'] = _mask_padding(token_ids, masks)
    return inputs


# A plain `def` cannot fail at definition time, so no try/except is needed here.
print('Tokenizer function defined successfully')

Tokenizer function defined successfully


In [None]:
# Apply the tokenization function to the whole dataset, one batch at a time.
try:
    tokenized_dataset = dataset.map(tokenizer_function, batched=True)
    print("Tokenized dataset created successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Tokenized dataset created successfully


In [None]:
# Configure the fine-tuning run and build the Trainer.
try:
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=50,
        save_steps=10,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=10,
        # Disable external experiment trackers (e.g. wandb) here rather than through
        # the deprecated WANDB_DISABLED environment variable, so the setting does
        # not depend on the order in which cells are executed.
        report_to='none',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        # NOTE(review): recent transformers releases appear to warn about this
        # argument in favour of `processing_class` - confirm against the installed
        # version before renaming.
        tokenizer=tokenizer,
    )
    print('Trainer created successfully')
except Exception as e:
    print(f"An error occurred: {e}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer created successfully


  trainer = Trainer(


In [None]:
import os

# Switch off Weights & Biases logging through its environment flag.
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Run the fine-tuning loop; a failure is reported as a message instead of a traceback.
try:
    trainer.train()
    print("Model trained successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Step,Training Loss
10,0.0501
20,0.0309
30,0.0268
40,0.0271


Model trained successfully


In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory.
# The tokenizer call stays last so the cell displays the list of files it wrote.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [None]:
# Bundle the saved model directory into a single archive for download.
# Because an absolute path is passed, entries are stored under
# `content/fine_tuned_model/` inside the archive.
!zip -r /content/fine_tuned_model.zip /content/fine_tuned_model

  adding: content/fine_tuned_model/ (stored 0%)
  adding: content/fine_tuned_model/config.json (deflated 51%)
  adding: content/fine_tuned_model/tokenizer_config.json (deflated 54%)
  adding: content/fine_tuned_model/model.safetensors (deflated 7%)
  adding: content/fine_tuned_model/merges.txt (deflated 53%)
  adding: content/fine_tuned_model/vocab.json (deflated 59%)
  adding: content/fine_tuned_model/generation_config.json (deflated 24%)
  adding: content/fine_tuned_model/special_tokens_map.json (deflated 60%)
  adding: content/fine_tuned_model/tokenizer.json (deflated 82%)


In [None]:
# Colab-only: hand the zipped model to the browser so it can be saved locally.
from google.colab import files
files.download('/content/fine_tuned_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import zipfile


def locate_model_dir(root):
    """Return the first directory at or below ``root`` that contains ``config.json``.

    The archive produced earlier stores its entries under a nested
    ``content/fine_tuned_model/`` prefix, so after extraction the model files are
    not directly inside the extraction directory. ``root`` itself is checked
    first; sub-directories are then visited in sorted order so the result is
    deterministic. When nothing is found (including when ``root`` does not exist)
    ``root`` is returned unchanged, so the caller still reports a load error for it.
    """
    for current_dir, subdirs, filenames in os.walk(root):
        subdirs.sort()  # make the traversal order deterministic
        if 'config.json' in filenames:
            return current_dir
    return root


# Path to the .zip file
zip_path = "/content/fine_tuned_model.zip"
extracted_path = "/content/fine_tuned_model"

# Extract the .zip file
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_path)
    print(f"Model extracted to {extracted_path}")
except Exception as e:
    print(f"Error extracting the zip file: {e}")

# Load from wherever the model files actually ended up after extraction, instead of
# assuming they sit directly in `extracted_path` (which only held true while the
# originally saved directory was still present in the session).
model_dir = locate_model_dir(extracted_path)

# Load the model and tokenizer
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)
    print("Model loaded successfully")
except Exception as e:
    print(f"An error occurred while loading the model: {e}")

Model extracted to /content/fine_tuned_model
Model loaded successfully


In [None]:
# Ask for a new prompt to try against the fine-tuned model.
try:
    prompt = input("Enter your prompt: ")
    print("Prompt entered successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Enter your prompt: Hello
Prompt entered successfully


In [None]:
# Encode the new prompt as a PyTorch tensor of token ids.
try:
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    print("Input tokenized successfully")
except Exception as err:
    print(f"An error occurred: {err}")

Input tokenized successfully


In [None]:
# Sample one continuation from the fine-tuned model.
outputs = model.generate(
    input_ids,
    # Single unpadded prompt: every position is a real token. Passing the mask and
    # pad id explicitly removes the "attention mask and the pad token id were not
    # set" warning and avoids relying on generate()'s fallbacks.
    attention_mask=input_ids.new_ones(input_ids.shape),
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    num_return_sequences=1,
    # Sampling settings; output varies between runs because no seed is fixed here.
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    do_sample=True,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Hello do programmers love coffee? Because it keeps them from being in sleep mode.
