In [1]:
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Distilledモデルとトークナイザーをロード
model = AutoModelForCausalLM.from_pretrained("./distill_GPTlm_model")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
dataset = load_dataset("squad")

device="cuda"
model.to(device)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [3]:
vocab_size = tokenizer.vocab_size
print(vocab_size)

50257


In [4]:
def preprocess_function(examples):
    max_length = 50
    questions = examples["question"]
    answers =  examples["answers"]
    inputs = [f"question: {q} answer:{a}" for q, a in zip(questions, answers)]
    return tokenizer(inputs, truncation=True, max_length=max_length, padding='max_length', return_tensors="pt")

In [5]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 87599/87599 [00:14<00:00, 5930.24 examples/s]
Map: 100%|██████████| 10570/10570 [00:02<00:00, 4127.77 examples/s]


In [6]:
print(tokenizer.pad_token_id)
tokenizer.eos_token_id

50256


50256

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask'],
        num_rows: 10570
    })
})

In [8]:
train_datasets = torch.tensor(tokenized_dataset['train']['input_ids'])
train_datasets

train_datasets[0].to('cuda')

tensor([25652,    25,  1675,  4150,   750,   262,  5283,  5335,  7910,  1656,
          287,  1248,  3365,   287,   406,   454,  8906,  4881,    30,  3280,
        29164,     6,  5239, 10354, 37250, 48615,  6206,   324,  5857,   311,
        12944,   343,   516,     6,  4357,   705, 41484,    62,  9688, 10354,
          685, 45969, 48999, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
       device='cuda:0')

In [9]:
from tqdm import tqdm
import torch.optim as optim

# オプティマイザーの設定
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# トレーニングの設定
training_args = {
    'num_train_epochs': 3,
    'per_device_train_batch_size': 1,
}

# トレーニングの実行
model.train()
for epoch in range(training_args['num_train_epochs']):

    epoch_progress = tqdm(train_datasets, desc="Batch Progress", leave=False)
    for batch in epoch_progress:
        inputs = batch.to('cuda')
        labels = batch.to('cuda')

        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_progress.set_postfix({"Loss": loss.item()})

    print(f"Epoch {epoch + 1}/{training_args['num_train_epochs']} completed, Loss: {loss.item()}")

                                                                                 

Epoch 1/3 completed, Loss: 12.469282150268555


                                                                                

Epoch 2/3 completed, Loss: nan


                                                                                

Epoch 3/3 completed, Loss: nan




In [None]:
model.eval()
import random

# validation inputsからランダムに100個のデータを抽出
validation_inputs = tokenized_dataset['validation']['input_ids']
random_sample = random.sample(validation_inputs, 100)



[25652,
 25,
 1867,
 8857,
 9901,
 750,
 11938,
 1577,
 257,
 13646,
 284,
 30,
 3280,
 29164,
 6,
 5239,
 10354,
 37250,
 1169,
 14021,
 5136,
 3256,
 705,
 1169,
 14021,
 5136,
 3256,
 705,
 17439,
 2815,
 5136,
 6,
 4357,
 705,
 41484,
 62,
 9688,
 10354,
 685,
 19442,
 11,
 24041,
 11,
 24652,
 48999,
 50256,
 50256,
 50256,
 50256,
 50256]

In [19]:
input=torch.tensor(random_sample[0]).to(device)
with torch.no_grad():  # 勾配計算を無効にする
    outputs = model(input)
    print(outputs)

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:0'), past_key_values=((tensor([[[[nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]],

         [[nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]],

         [[n

In [20]:
inputs = tokenizer(["this is a"], return_tensors="pt", truncation=True, padding=True, max_length=20).to("cuda") 
print(inputs)
attention_mask=inputs['attention_mask']
print(attention_mask)
output = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=40, num_return_sequences=1, temperature=1.0 )
print(output)
response = tokenizer.decode(output[0])
print(response)

{'input_ids': tensor([[5661,  318,  257]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1]], device='cuda:0')}
tensor([[1, 1, 1]], device='cuda:0')
tensor([[5661,  318,  257,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]], device='cuda:0')
this is a!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
