In [1]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering,GPT2LMHeadModel,
    TrainingArguments,
    pipeline,
    logging,
    Trainer,
    DataCollatorForLanguageModeling
)

#from sklearn.model_selection import train_test_split

import pandas as pd

In [2]:
dataset = load_dataset("pierre-pessarossi/climate-question-answers")

In [3]:
from transformers import pipeline, set_seed
generate = pipeline("text-generation", model="openai-community/gpt2",
                    clean_up_tokenization_spaces=True,
                    device='cuda')

Device set to use cuda


In [4]:
set_seed(42)
generate("[Q] What is environmental goveranace?", max_length=100, num_return_sequences=3) 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': '[Q] What is environmental goveranace? What is climate goveranace? Environmental goveranace, e.g., human activities, the industrial activity (excess oil and gas for example), is a term that may be relevant to environmental goveranace of the human body. It is understood that there may be a relationship between the various environmental factors, such as the amount of waste at the disposal site, the amount of nutrients in the waste, and the frequency of'},
 {'generated_text': '[Q] What is environmental goveranace? (Source: J.H.G. Schmitz, Die eutlich Deutscher Abhandlung von deutschen Volkswirtschafts des Migrants, in: E.F. Schmitz, ed., Die eutlich Deutscher Abhandlung von Deutscher, Deutscher, and the Deutschland, 1770-1811 (New York:'},
 {'generated_text': '[Q] What is environmental goveranace?[/q]\n\n"Well, it\'s probably one of the first steps in your environmental studies," says Soderstrom. And for the most part she\'s done it for the good of an organization that

In [5]:
dataset=dataset.rename_column("instruction", "question")

In [6]:
test_dataset=dataset['test']

In [7]:
train_dataset=dataset['train'].train_test_split(test_size=0.2,keep_in_memory=False)

In [8]:
val_dataset=train_dataset['test']

In [9]:
train_dataset.pop('test')

Dataset({
    features: ['question', 'answer'],
    num_rows: 1407
})

In [10]:
train_dataset=train_dataset['train']

In [11]:
train_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 5626
})

In [12]:
test_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 1758
})

In [13]:
val_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 1407
})

In [14]:
climate_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [15]:
MODEL_NAME = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [16]:
def preprocess_function(examples):
    
    # To catch any anomaly in the curation of dataset i.e. bad datatype either in question or an answer, 
    # we set the string to a NULL token.
    batch_size=len(examples['question'])
    for i in range(batch_size):
        if examples['question'][i] == None:
            examples['question'][i] = "[NULL]"
        if examples['answer'][i] == None:
            examples['answer'][i] = "[NULL]"

    inputs = [q + " [SEP] " + a for q, a in zip(examples["question"], examples["answer"])]
    
    model_inputs = tokenizer(inputs, max_length=200, truncation=True, padding=True, return_tensors="pt")
  
   # The "labels" are the tokenized outputs:
    return model_inputs

In [26]:
%%time
tokenized_dataset = climate_dataset_dict.map(
    preprocess_function, 
    batched=True,
batch_size=4,drop_last_batch=True)

Map:   0%|          | 0/5624 [00:00<?, ? examples/s]

Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

Map:   0%|          | 0/1756 [00:00<?, ? examples/s]

CPU times: user 5.58 s, sys: 770 ms, total: 6.35 s
Wall time: 4.04 s


In [28]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 64
PER_DEVICE_EVAL_BATCH = 8
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 1

In [29]:
# Set up training arguments
training_args = TrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   report_to=None,
   logging_steps=10,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   push_to_hub=False
)



In [30]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    args=training_args,
    data_collator= DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

  trainer = Trainer(


In [31]:
model.config.use_cache = False

In [32]:
%%time
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.9167,2.664665


CPU times: user 1min 28s, sys: 957 ms, total: 1min 29s
Wall time: 1min 30s


TrainOutput(global_step=88, training_loss=1.8227688399228184, metrics={'train_runtime': 90.3598, 'train_samples_per_second': 62.24, 'train_steps_per_second': 0.974, 'total_flos': 549203189760000.0, 'train_loss': 1.8227688399228184, 'epoch': 1.0})

In [22]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model

In [23]:
def load_tokenizer(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer

In [24]:
def generate_text(model_path, sequence, max_length):
    
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [25]:
model2_path = "./results/checkpoint-1200/"
sequence2 = "[Q] What is environmental goveranace?"
max_len = 300
generate_text(model2_path, sequence2, max_len) 

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[Q] What is environmental goveranace? [SEP] Environmental goveranace, also known as green goverment, is a Dutch law that prevents private property from being used as a disguise for commercial purposes. This law specifically prohibited the use of chemical fertilizers, prohibiting the planting and transport of trees as a disguise for commercial purposes, and required the planting of certain crops to disguise their intended environmental effects. Critics of green goveranace argue that the law has been used to circumvent environmental issues and interfere with the private property rights of farmers and multinational corporations. However, environmental NGOs argue that the statute has been used to allow the planting and transportation of food, shelter and other resources without permission. In September 2022, the Dutch government granted permission to establish a green goveranace in Durban, with the goal of creating a 'green goverment', which would have an impact on Dutch natural resources 