In [1]:
from datasets import load_dataset

# OpenWebTextデータセットのロード
dataset = load_dataset('openwebtext')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 'train'セットから5000サンプルを取得
limited_dataset = dataset['train'].select(range(10000))

# データの確認
print(limited_dataset)


Dataset({
    features: ['text'],
    num_rows: 10000
})


In [3]:
from transformers import GPT2Tokenizer
# トークナイザーをロード
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [4]:
context_length = 128

In [5]:
import torch
from transformers import GPT2LMHeadModel, AutoConfig


# GPT-2の設定を作成（小型モデル用）
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
) # "gpt2"を指定して小型モデルを取得
# モデルを初期化
student_model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in student_model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")


GPT-2 size: 124.4M parameters


In [9]:
# デバイスの設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# トークナイザーと教師モデルをロード
teacher_model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model_size = sum(t.numel() for t in teacher_model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

teacher_model.to(device)  # GPUを使用する場合
student_model.to(device)




GPT-2 size: 774.0M parameters


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
import torch.nn.functional as F
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    student_probs = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(student_probs, teacher_probs) * (temperature ** 2)

In [15]:
inputs = tokenizer(["who are you ?"], return_tensors='pt', truncation=True, padding=True).to('cuda')
with torch.no_grad():
            teacher_output = teacher_model(**inputs)
            teacher_logits = teacher_output.logits

probabilities = torch.softmax(teacher_logits, dim=-1)
predicted_token_ids = torch.argmax(probabilities, dim=-1)
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
print(predicted_text)

 is not?"')


In [27]:


# 質問の入力
question = "who are you？"
input_text = f"Q: {question} A:"
input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

# 推論の実行
output = teacher_model.generate(input_ids['input_ids'], max_length=40, num_return_sequences=1)

# 出力のデコード
answer = tokenizer.decode(output[0], skip_special_tokens=True)
print(answer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Q: who are you？ A: I am a man who has been born in the land of the Jews. I am a Jew who has been born in the land of the Christians.


In [39]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.57s/it]


In [52]:
# 質問の入力
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

question = "what is the color of apple？"
input_ids = tokenizer(question, return_tensors="pt", truncation=True, padding=True).to(device)
model.to(device)

attention_mask=input_ids['attention_mask']
print(attention_mask, input_ids)

output = model.generate(input_ids['input_ids'], attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id)

# 出力のデコード
answer = tokenizer.decode(output[0], skip_special_tokens=True)
print(answer)

tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0') {'input_ids': tensor([[    1,   767,   349,   272,  3181,   302, 19767, 29771]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}




what is the color of apple？

Apple is a fruit that is usually red,


In [37]:
# 翻訳のための入力文
input_text = "question: Hello, how are you?"
input_ids = tokenizer(input_text, return_tensors="pt").to(device)
model.to(device)
# モデルによる翻訳の生成
output = model.generate(input_ids['input_ids'], max_length=40, num_return_sequences=1)

# 出力のデコード
translated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("翻訳:", translated_text)


翻訳: not_entailment


In [13]:
from tqdm import tqdm
import torch.optim as optim

# オプティマイザーの設定
optimizer = optim.AdamW(student_model.parameters(), lr=5e-5)

num_epochs = 3  # エポック数
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_progress = tqdm(limited_dataset, desc="Batch Progress", leave=False)
    
    for example in epoch_progress:
        input_text = example['text']
        inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to('cuda')

        # 教師モデルの出力を取得
        with torch.no_grad():
            teacher_output = teacher_model(**inputs)
            teacher_logits = teacher_output.logits

        # 生徒モデルの出力を取得
        student_output = student_model(**inputs)
        student_logits = student_output.logits

        # 蒸留損失の計算
        loss = distillation_loss(student_logits, teacher_logits)
        
        # 勾配計算とオプティマイザーのステップ
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 各バッチの損失を進捗バーに表示
        epoch_progress.set_postfix({"Loss": loss.item()})
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/3


                                                                                  

KeyboardInterrupt: 

NameError: name 'tokenizer' is not defined

In [None]:
# 評価関数の定義
def evaluate_model(model, tokenizer, input_texts, device):
    model.eval()  # 評価モード
    total_loss = 0
    total_perplexity = 0
    count = 0
    with torch.no_grad():
        for text in input_texts:
            # 入力をデバイスに移動
            inputs = tokenizer(text, return_tensors='pt').to(device)
            outputs = model(**inputs, labels=inputs["input_ids"])  # 入力をモデルに渡す
            loss = outputs.loss.item()
            total_loss += loss
            perplexity = torch.exp(torch.tensor(loss))
            total_perplexity += perplexity.item()
            count += 1

    # 平均損失とパープレキシティを計算
    avg_loss = total_loss / count if count > 0 else float('inf')
    avg_perplexity = total_perplexity / count if count > 0 else float('inf')
    return avg_loss, avg_perplexity

# 使用例
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 評価データを定義
input_texts = ["This is a test sentence.", "How are you today?"]

# 教師モデルの評価
teacher_loss, teacher_perplexity = evaluate_model(teacher_model, tokenizer, input_texts, device)
print(f"Teacher Model - Loss: {teacher_loss:.4f}, Perplexity: {teacher_perplexity:.4f}")

# 生徒モデルの評価
student_loss, student_perplexity = evaluate_model(student_model, tokenizer, input_texts, device)
print(f"Student Model - Loss: {student_loss:.4f}, Perplexity: {student_perplexity:.4f}")

Teacher Model - Loss: 3.3081, Perplexity: 33.6072
Student Model - Loss: 5.7662, Perplexity: 352.4743


In [28]:
student_model.save_pretrained('distill_GPTlm_model')

In [30]:
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
) # "gpt2"を指定して小型モデルを取得
# モデルを初期化
no_train_model = GPT2LMHeadModel(config)

no_train_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [32]:
no_train_loss, no_train_perplexity = evaluate_model(no_train_model, tokenizer, input_texts, device)
print(f"No train Model - Loss: {no_train_loss:.4f}, Perplexity: {no_train_perplexity:.4f}")

No train Model - Loss: 11.2473, Perplexity: 78959.7949
