# 1. 원핫 인코딩(one-hot encoding)


In [None]:
## no library
def one_hot(word_list):
    """Build a one-hot matrix for the distinct words in ``word_list``.

    Duplicates are removed first, so the result is a square matrix with one
    row per distinct word; row ``i`` has a 1 in column ``i`` and 0 elsewhere.

    Args:
        word_list: Iterable of hashable labels.

    Returns:
        A list of ``n`` lists of ``n`` ints, where ``n`` is the number of
        distinct words.
    """
    # 1. Drop duplicate words; only the number of distinct words matters.
    unique_count = len(set(word_list))
    # 2-3. Build each row with a single 1 on the diagonal.
    return [
        [1 if col == row else 0 for col in range(unique_count)]
        for row in range(unique_count)
    ]

# Sample labels to encode ('turtle' was previously misspelled as 'tutle').
labels = ['cat', 'dog', 'rabbit', 'turtle']

print(one_hot(labels))

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]


In [None]:
## One-hot encoding with pandas
import pandas as pd

label_dict = {'label': ['cat', 'dog', 'rabbit', 'turtle']}
# get_dummies is given the plain list of labels and returns one indicator
# column per label.
one_hot_encoding = pd.get_dummies(data=label_dict['label'])
print(one_hot_encoding)

## One-hot encoding with scikit-learn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

label_dict = {'label': ['cat', 'dog', 'rabbit', 'turtle']}
# The encoder is fitted on a single-column DataFrame built from the dict.
df = pd.DataFrame(data=label_dict)
# NOTE(review): this rebinds `one_hot`, which is also the name of the
# pure-Python function defined earlier in the file.
one_hot = OneHotEncoder()
one_hot_encoding = one_hot.fit_transform(df)
print(one_hot_encoding)

     cat    dog  rabbit  turtle
0   True  False   False   False
1  False   True   False   False
2  False  False    True   False
3  False  False   False    True
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (4, 4)>
  Coords	Values
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0


# 2. BoW(Bag of Words)



In [None]:
## no library
def bow(sentence):
    """Return the vocabulary and bag-of-words counts for ``sentence``.

    Args:
        sentence: Text whose words are separated by whitespace.

    Returns:
        A ``(word_list, embedding_matrix)`` tuple. ``word_list`` holds each
        distinct word in first-appearance order, and ``embedding_matrix[i]``
        is the number of times ``word_list[i]`` occurs as a whole token.
    """
    from collections import Counter

    # (1) Split into words. split() with no argument also discards the empty
    #     tokens that repeated spaces or an empty sentence would produce.
    tokens = sentence.split()
    # (2)-(3) Count whole tokens. Calling str.count() on the raw sentence
    #     matches substrings, which counted 'and' inside 'handsome'.
    counts = Counter(tokens)
    word_list = list(counts)
    embedding_matrix = [counts[word] for word in word_list]
    return word_list, embedding_matrix


sentence = "jin is very very handsome guy and cheol is very handsome guy too"

word_list, bow_embedding = bow(sentence)

print("word_list : ",word_list,", embedding : ",bow_embedding)

# Counts are per whole token, so 'and' is 1 even though 'handsome' contains it.

word_list :  ['and', 'is', 'too', 'cheol', 'guy', 'jin', 'very', 'handsome'] , embedding :  [3, 2, 1, 1, 2, 1, 3, 2]


In [None]:
## Bag of words with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# A single document wrapped in a list.
sentence = ["jin is very very handsome man and cheol is very handsome man too"]
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=1)
embedding = vectorizer.fit_transform(sentence)
vocab = vectorizer.get_feature_names_out()
# toarray() turns the fitted result into a dense array for printing.
print("word_list : ",vocab,", embedding : ",embedding.toarray())

word_list :  ['and' 'cheol' 'handsome' 'is' 'jin' 'man' 'too' 'very'] , embedding :  [[1 1 2 2 1 2 1 3]]


# 3. Word2Vec(CBOW, Skip_Gram 두 개 구현)

# CBOW
#### CBOW의 경우, 한 단어가 제거되고 주변 단어들로부터 해당 단어가 예측됩니다. 따라서 여러 개의 입력 벡터를 모델에 입력으로 사용하여 하나의 출력 벡터를 생성합니다.

In [None]:
## using pytorch
import torch
import torch.nn as nn

EMBEDDING_DIM = 128  # size of each word embedding vector
EPOCHS = 100  # number of passes of the training loop

# Toy corpus, split on whitespace into a list of tokens.
example_sentence = """In the case of CBOW, one word is eliminated, and the word is predicted from surrounding words.
Therefore, it takes multiple input vectors as inputs to the model and creates one output vector.
In contrast, Skip-Gram learns by removing all words except one word and predicting the surrounding words in the context through one word.
So, it takes a vector as input and produces multiple output vectors.
CBOW and Skip-Gram are different.""".split()


# The helpers are defined before the statements that call them so the cell
# runs top to bottom on a fresh interpreter.
def make_context_vector(context, word_to_ix):
    """Convert a list of context words to a 1-D long tensor of indices.

    Args:
        context: Sequence of words, each present in ``word_to_ix``.
        word_to_ix: Mapping from word to integer index.

    Raises:
        KeyError: If a word is missing from ``word_to_ix``.
    """
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


def make_data(sentence):
    """Build CBOW training pairs from a list of tokens.

    Args:
        sentence: List of tokens.

    Returns:
        A list of ``(context, target)`` tuples, where ``target`` is the token
        at position ``i`` and ``context`` is the two tokens before it and the
        two after it. Sequences shorter than five tokens yield an empty list.
    """
    data = []
    # Use the argument, not the module-level corpus, so any token list works.
    for i in range(2, len(sentence) - 2):
        context = [sentence[i - 2], sentence[i - 1], sentence[i + 1], sentence[i + 2]]
        target = sentence[i]
        data.append((context, target))
    return data


# (1) Remove duplicate tokens to get the vocabulary.
vocab = set(example_sentence)
# The vocabulary size is the number of distinct words, not the token count.
vocab_size = len(vocab)

# (2) Build word -> index and index -> word dictionaries. Sorting fixes the
#     index assignment, which would otherwise follow the set's arbitrary order.
word_to_index = {word: index for index, word in enumerate(sorted(vocab))}
index_to_word = {index: word for word, index in word_to_index.items()}

# (3) Create the training data.
data = make_data(example_sentence)

# (4) Define the CBOW model.
class CBOW(nn.Module):
    """Continuous bag-of-words model: predict a word from its context words.

    The context embeddings are summed, passed through a 64-unit ReLU layer,
    and projected to log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: Number of embedding rows and of output classes.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.layer1 = nn.Linear(embedding_dim, 64)
        self.activation1 = nn.ReLU()

        self.layer2 = nn.Linear(64, vocab_size)
        self.activation2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(1, vocab_size)``.

        Args:
            inputs: 1-D tensor of context word indices.
        """
        # Sum the context embeddings with a tensor reduction rather than the
        # Python builtin sum(), which iterates the rows one by one.
        context_sum = self.embeddings(inputs).sum(dim=0).view(1, -1)
        hidden = self.activation1(self.layer1(context_sum))
        return self.activation2(self.layer2(hidden))

# (5) Instantiate the model, the loss function and the optimizer.
model = CBOW(vocab_size, EMBEDDING_DIM)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# (6) Train: the losses of all pairs are summed and a single optimizer step
#     is taken per epoch.
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    total_loss = 0
    for context, target in data:
        context_vector = make_context_vector(context, word_to_index)
        log_probs = model(context_vector)
        target_index = torch.tensor([word_to_index[target]])
        total_loss += loss_function(log_probs, target_index)
    print('epoch = ',epoch, ', loss = ',total_loss)
    total_loss.backward()
    optimizer.step()

# (7) Pick a context to test with and print the word the model predicts.
test_data = ['CBOW','and','are','different.']
test_vector = make_context_vector(test_data, word_to_index)
result = model(test_vector)
# result has a single row; take the index with the highest log-probability.
predicted_index = torch.argmax(result[0]).item()
print('Prediction : ', index_to_word[predicted_index])

epoch =  0 , loss =  tensor(295.8551, grad_fn=<AddBackward0>)
epoch =  1 , loss =  tensor(287.8702, grad_fn=<AddBackward0>)
epoch =  2 , loss =  tensor(280.3029, grad_fn=<AddBackward0>)
epoch =  3 , loss =  tensor(273.0851, grad_fn=<AddBackward0>)
epoch =  4 , loss =  tensor(265.9677, grad_fn=<AddBackward0>)
epoch =  5 , loss =  tensor(258.9339, grad_fn=<AddBackward0>)
epoch =  6 , loss =  tensor(251.9113, grad_fn=<AddBackward0>)
epoch =  7 , loss =  tensor(244.9059, grad_fn=<AddBackward0>)
epoch =  8 , loss =  tensor(237.9139, grad_fn=<AddBackward0>)
epoch =  9 , loss =  tensor(230.8799, grad_fn=<AddBackward0>)
epoch =  10 , loss =  tensor(223.7068, grad_fn=<AddBackward0>)
epoch =  11 , loss =  tensor(216.4106, grad_fn=<AddBackward0>)
epoch =  12 , loss =  tensor(208.9970, grad_fn=<AddBackward0>)
epoch =  13 , loss =  tensor(201.5432, grad_fn=<AddBackward0>)
epoch =  14 , loss =  tensor(194.0235, grad_fn=<AddBackward0>)
epoch =  15 , loss =  tensor(186.5363, grad_fn=<AddBackward0>)
ep

# Skip-Gram
#### Skip-Gram은 한 단어를 제외한 모든 단어를 제거하고 한 단어를 통해 문맥에서 주변 단어를 예측하여 학습합니다. 따라서 벡터를 입력으로 사용하여 여러 개의 출력 벡터를 생성합니다. CBOW와 Skip-Gram은 서로 다릅니다.

In [None]:
## using pytorch
import torch
import torch.nn as nn

EMBEDDING_DIM = 128  # embedding vector size passed to SKIP_GRAM
EPOCHS = 200  # number of passes of the training loop
CONTEXT_SIZE = 4  # number of surrounding words predicted per input word

# Toy corpus, split on whitespace into a list of tokens.
example_sentence = """In the case of CBOW, one word is eliminated, and the word is predicted from surrounding words.
Therefore, it takes multiple input vectors as inputs to the model and creates one output vector.
In contrast, Skip-Gram learns by removing all words except one word and predicting the surrounding words in the context through one word.
So, it takes a vector as input and produces multiple output vectors.
CBOW and Skip-Gram are different.""".split()

# convert context to index vector
def make_context_vector(context, word_to_ix):
    """Look up a single word and return its index as a long tensor.

    Args:
        context: The input word; must be a key of ``word_to_ix``.
        word_to_ix: Mapping from word to integer index.

    Raises:
        KeyError: If ``context`` is missing from ``word_to_ix``.
    """
    return torch.tensor(word_to_ix[context], dtype=torch.long)

# make dataset function
def make_data(sentence):
    """Build Skip-Gram training pairs from a list of tokens.

    Args:
        sentence: List of tokens.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is the token
        at position ``i`` and ``target`` is the list of the two tokens before
        it and the two after it. Sequences shorter than five tokens yield an
        empty list.
    """
    data = []
    # Use the argument, not the module-level corpus, so any token list works.
    for i in range(2, len(sentence) - 2):
        context = sentence[i]
        target = [sentence[i - 2], sentence[i - 1], sentence[i + 1], sentence[i + 2]]
        data.append((context, target))
    return data

# (1) Remove duplicate tokens to get the vocabulary.
vocab = set(example_sentence)
# The vocabulary size is the number of distinct words, not the token count.
vocab_size = len(vocab)

# (2) Build word -> index and index -> word dictionaries. Sorting fixes the
#     index assignment, which would otherwise follow the set's arbitrary order.
word_to_index = {word: index for index, word in enumerate(sorted(vocab))}
index_to_word = {index: word for word, index in word_to_index.items()}

# (3) Create the training data.
data = make_data(example_sentence)

# (4) Define the Skip-Gram model.
class SKIP_GRAM(nn.Module):
    """Skip-Gram model: predict ``context_size`` surrounding words from one word.

    The input word is embedded, passed through a 64-unit ReLU layer, and
    projected to ``context_size`` rows of log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: Number of embedding rows and of classes per output row.
            embedding_dim: Size of each embedding vector.
            context_size: Number of surrounding words predicted per input.
        """
        super().__init__()
        # Keep the size on the instance so forward() does not depend on a
        # module-level variable of the same name.
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.layer1 = nn.Linear(embedding_dim, 64)
        self.activation1 = nn.ReLU()

        self.layer2 = nn.Linear(64, vocab_size * context_size)
        self.activation2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(context_size, vocab_size)``.

        Args:
            inputs: Tensor holding the index of a single input word.
        """
        embedded = self.embeddings(inputs)
        hidden = self.activation1(self.layer1(embedded))
        # Reshape to one row per context position BEFORE the log-softmax so
        # each row is normalized over the vocabulary on its own. Normalizing
        # the flat vector first spreads one distribution across all rows.
        logits = self.layer2(hidden).view(self.context_size, self.vocab_size)
        return self.activation2(logits)

# (5) Instantiate the model, the loss function and the optimizer.
model = SKIP_GRAM(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# (6) Train: the losses of all pairs are summed and a single optimizer step
#     is taken per epoch.
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    total_loss = 0
    for context, target in data:
        context_vector = make_context_vector(context, word_to_index)
        log_probs = model(context_vector)
        # One class index per surrounding word.
        target_indices = torch.tensor([word_to_index[t] for t in target])
        total_loss += loss_function(log_probs, target_indices)
    print('epoch = ',epoch, ', loss = ',total_loss)
    total_loss.backward()
    optimizer.step()

# (7) Pick a word to test with and print the surrounding words the model predicts.
test_data = 'Skip-Gram'
test_vector = make_context_vector(test_data, word_to_index)
result = model(test_vector)
# For each output row, take the word with the highest log-probability.
predicted_words = [index_to_word[torch.argmax(r).item()] for r in result]
print('Prediction : ', predicted_words)

epoch =  0 , loss =  tensor(388.0920, grad_fn=<AddBackward0>)
epoch =  1 , loss =  tensor(387.5418, grad_fn=<AddBackward0>)
epoch =  2 , loss =  tensor(386.9979, grad_fn=<AddBackward0>)
epoch =  3 , loss =  tensor(386.4577, grad_fn=<AddBackward0>)
epoch =  4 , loss =  tensor(385.9203, grad_fn=<AddBackward0>)
epoch =  5 , loss =  tensor(385.3864, grad_fn=<AddBackward0>)
epoch =  6 , loss =  tensor(384.8557, grad_fn=<AddBackward0>)
epoch =  7 , loss =  tensor(384.3317, grad_fn=<AddBackward0>)
epoch =  8 , loss =  tensor(383.8112, grad_fn=<AddBackward0>)
epoch =  9 , loss =  tensor(383.2948, grad_fn=<AddBackward0>)
epoch =  10 , loss =  tensor(382.7795, grad_fn=<AddBackward0>)
epoch =  11 , loss =  tensor(382.2679, grad_fn=<AddBackward0>)
epoch =  12 , loss =  tensor(381.7593, grad_fn=<AddBackward0>)
epoch =  13 , loss =  tensor(381.2530, grad_fn=<AddBackward0>)
epoch =  14 , loss =  tensor(380.7502, grad_fn=<AddBackward0>)
epoch =  15 , loss =  tensor(380.2521, grad_fn=<AddBackward0>)
ep