# Self Attention Prototype


In [None]:
!pip install torch==2.3.0
!pip install torchtext==0.18
# !pip install torchvision==0.18

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.3.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylin

In [None]:
# Sanity check: show which versions the kernel actually loaded.
import torch
import torchtext

print(f"torch: {torch.__version__}")
print(f"torchtext: {torchtext.__version__}")

torch: 2.3.0+cu121
torchtext: 0.18.0+cpu


In [None]:
# Mount Google Drive and make the project folder the working directory, so that
# files opened by relative path are looked up inside that folder.
import os

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
# Project folder inside the mounted drive; edit this one line if the project
# lives somewhere else.
PROJECT_DIR = os.path.join(DRIVE_MOUNT_POINT, 'MyDrive/Colab Notebooks/SKALA/GAI2.LLM')

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)


Mounted at /content/drive


### sonnets 코퍼스의 vocabulary 만들기

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Read the whole corpus. The encoding is passed explicitly so decoding does not
# depend on the platform's default locale.
# NOTE(review): assumes sonnets.txt is UTF-8 encoded - confirm.
with open('sonnets.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text with torchtext's 'basic_english' tokenizer.
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(text)     # whole text -> list of tokens
unique_tokens = set(tokens)  # drop duplicate tokens

### unique_tokens 집합을 이용해 단어 → 숫자(stoi) / 숫자 → 단어(itos) 매핑 만들기


In [None]:
# Sort the tokens before numbering them. Iterating a set of strings follows no
# stable order from one kernel session to the next (string hashing is randomized
# per process), so without sorting the word <-> index mapping would change on
# every restart.
vocab = sorted(unique_tokens)

stoi = {token: index for index, token in enumerate(vocab)}  # word  -> index
itos = {index: token for token, index in stoi.items()}      # index -> word

# Show only a few entries; printing both full dictionaries floods the output.
print(list(stoi.items())[:10])
print(list(itos.items())[:10])

vocab_size = len(vocab)
print(vocab_size)

3129


### 문장을 정수 인덱스로 바꾸고 → 임베딩 층을 거쳐 벡터 표현으로 변환

In [None]:
import torch.nn as nn

# Seed the RNG so the randomly initialized embedding table (and therefore the
# printed vectors) is the same on every run.
torch.manual_seed(42)

sentence = "i love you"

# Tokenize with the same tokenizer that produced the vocabulary, so casing and
# punctuation are treated exactly as they were when the vocabulary was built.
sentence_tokens = tokenizer(sentence)

# The vocabulary contains only tokens seen in the corpus (there is no
# unknown-word entry), so report unseen tokens with a readable message.
missing_tokens = [token for token in sentence_tokens if token not in stoi]
if missing_tokens:
    raise KeyError(f"tokens not in vocabulary: {missing_tokens}")

indices = [stoi[token] for token in sentence_tokens]  # token -> integer index
print(indices)

# Input: token indices (integers).
# Output: one embedding vector per token.
embedding_dim = 20
embedding = nn.Embedding(vocab_size, embedding_dim)

embedded_sentence = embedding(torch.tensor(indices))
print(embedded_sentence)


[2485, 119, 2189]
tensor([[-0.7202, -1.4790,  0.0096,  0.2148, -0.6859, -0.0440, -0.4941,  0.4224,
          0.3153,  0.7374,  2.0883,  0.8282,  0.3581, -0.1634,  0.2305, -0.9165,
          1.2477,  0.2477,  1.1511, -0.0563],
        [ 0.3011, -0.1361, -0.6116,  0.3114, -0.1752, -0.8227,  1.4952,  1.3835,
         -1.6882,  0.8879, -0.6785, -1.5725, -0.1478, -0.3587, -0.9277, -0.7567,
          1.2114,  1.3855, -1.1729,  0.2567],
        [ 1.0861,  0.5161,  2.2962,  0.6927, -1.6665,  0.5236,  1.0746,  1.2003,
          0.7955,  0.0320,  0.4892, -2.2618,  1.1577, -0.1324,  0.9927,  0.0076,
         -1.4505,  0.1624, -0.7121,  0.9599]], grad_fn=<EmbeddingBackward0>)


### Self Attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with bias-free projections.

    Args:
        embed_dim: Size of the last dimension of the input.
        atten_dim: Size of the query/key/value projections, and therefore of
            the last dimension of the output.
    """

    def __init__(self, embed_dim, atten_dim):
        super().__init__()

        def projection():
            # All three projections have the same shape and carry no bias.
            return nn.Linear(embed_dim, atten_dim, bias=False)

        self.query = projection()
        self.key = projection()
        self.value = projection()

    def forward(self, x):
        """Return one attention-weighted value vector per position of ``x``.

        Args:
            x: Tensor whose last dimension has size ``embed_dim``; the
                second-to-last dimension is used as the sequence axis.

        Returns:
            Tensor shaped like ``x`` except that its last dimension has size
            ``atten_dim``.
        """
        q, k, v = self.query(x), self.key(x), self.value(x)

        # Pairwise query-key dot products, divided by sqrt(projection size).
        scale = k.size(-1) ** 0.5
        logits = (q @ k.transpose(-2, -1)) / scale

        # Turn each row into weights that sum to 1, then mix the value vectors.
        weights = torch.softmax(logits, dim=-1)
        return weights @ v

In [None]:
# Create a SelfAttention layer and run the embedded sentence through it.
# The token vectors are compared with one another to score how related they
# are, and each token is mapped to a new representation of size atten_dim.
# The result is one context vector per token, reflecting its relation to the
# other tokens of the sentence.
# NOTE(review): the output saved with this cell has 4 rows although
# "i love you" yields 3 tokens - it looks stale; re-run the notebook top to
# bottom (Restart & Run All) to refresh it.

atten_dim = 5
self_attention = SelfAttention(embedding_dim, atten_dim)

output = self_attention(embedded_sentence)

# Invariant: one context vector of size atten_dim per input token.
assert tuple(output.shape) == (len(indices), atten_dim), tuple(output.shape)
print(output)

tensor([[ 0.3025, -0.3198,  0.1056,  0.0735, -0.0726],
        [ 0.0955, -0.1943,  0.1120,  0.0607,  0.0151],
        [-0.1662, -0.0401,  0.1317,  0.0092,  0.1101],
        [-0.0298, -0.1407,  0.1216, -0.0071,  0.0339]], grad_fn=<MmBackward0>)
