# Neural Language Model 구현
 - 목적: FCN 이용한 Language Model 구현
 - Input: 앞의단어 4개
 - output: 뒤의 단어 예측
 - 모델: FCN
 - Loss: NLL loss

## 1. Import packages

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.optim as optim

## 2. Data Preprocessing

In [2]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
test_sentence = """An incursion of South Korean airspace by North Korean drones exposed Seoul’s lack of preparedness in defending against such threats,
 and it will likely take years for the military to correct its shortcomings, according to a classified U.S. intelligence assessment of the December incident. 
 Are you on Telegram? Subscribe to our channel for the latest updates on Russia’s war in Ukraine.
The findings, outlined in a leak of U.S. secrets circulated on the Discord messaging platform and obtained by The Washington Post, spotlight the vulnerable state of South Korea’s air defense as its volatile neighbor’s aggressive development of a nuclear arsenal has Seoul and Washington on edge. 
South Korea has prioritized its defenses to confront incoming missiles while investing heavily in growing its air and naval forces, but Seoul’s focus has come at the cost of neglecting other air defense needs, experts said — leaving the country vulnerable to a threat responsible for extensive carnage in Ukraine, Syria and elsewhere.""".split()



In [4]:
ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]
# Print the first 3, just so you can see what they look like.
print(ngrams[:3])

[(['incursion', 'An'], 'of'), (['of', 'incursion'], 'South'), (['South', 'of'], 'Korean')]


In [8]:
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [10]:
input_idx = [3,7]

In [11]:
we = nn.Embedding(len(vocab), EMBEDDING_DIM)

In [15]:
wev1 = we(torch.tensor([3]))
wev2 = we(torch.tensor([7]))

In [16]:
print(wev1)
print(wev2)

tensor([[-1.4881,  0.1813,  1.4104,  0.3786, -0.7367, -0.4382,  0.1977, -1.4712,
         -0.2748, -0.2449]], grad_fn=<EmbeddingBackward0>)
tensor([[ 1.3423, -0.3214, -0.0599, -0.8029, -0.4004, -1.8846, -1.1549, -0.0729,
          2.1231,  0.1699]], grad_fn=<EmbeddingBackward0>)


In [21]:
uni_emb = torch.cat((wev1,wev2),1)

In [23]:
classifier = nn.Linear(EMBEDDING_DIM*CONTEXT_SIZE, len(vocab))

In [24]:
out = classifier(uni_emb)

In [27]:
len(vocab)

111

In [26]:
out

tensor([[ 0.6545, -0.8195,  0.7562,  0.9368, -0.3008, -0.5621, -0.1468,  1.2030,
          0.4105, -0.2959, -0.1147,  0.8454, -0.5080, -0.2096,  0.9122,  1.1031,
          0.0385, -0.9860, -0.1134, -0.7553,  0.1422,  0.4883, -1.6172, -0.0603,
          0.3360,  0.6053, -0.8633, -0.2665, -1.1957,  0.3543,  1.2453,  0.7457,
          0.2147, -0.2935,  0.7045, -0.0328,  0.9700, -0.7820, -0.1031,  0.1295,
          0.1451,  0.0988,  0.5670,  0.6582, -0.1765, -0.9742,  0.1096,  0.3059,
         -0.5542, -0.9386, -0.8735, -1.0495,  0.3211,  0.4578,  0.2016,  0.9980,
         -0.0201,  0.8243,  0.6545, -0.2751,  0.4238, -0.1179, -0.4553,  0.2779,
          0.4031, -1.0346,  0.7234, -0.7302,  1.3966, -0.8318,  0.0821, -0.3394,
         -0.3103, -0.8002, -0.2529,  0.6461, -0.3794, -0.3725, -0.2998,  0.5704,
          0.6922, -0.0626,  0.8603, -0.4591,  0.1945,  0.3804, -0.8235,  0.2624,
          0.4500, -0.0427,  0.5852,  0.6975, -0.5887, -0.2469, -0.1214, -0.4833,
          0.4927, -1.0203, -

In [9]:
class NGramLanguageModeler(nn.Module):
  def __init__(self):
    ## put our code

  def forward(self):
    ## put our codes
    return log_probs

{'while': 0,
 'Syria': 1,
 'intelligence': 2,
 'according': 3,
 'war': 4,
 'the': 5,
 'likely': 6,
 'needs,': 7,
 'Korea’s': 8,
 'outlined': 9,
 'cost': 10,
 'such': 11,
 'take': 12,
 'threats,': 13,
 'extensive': 14,
 'country': 15,
 'December': 16,
 'air': 17,
 'nuclear': 18,
 'and': 19,
 'vulnerable': 20,
 'Seoul’s': 21,
 'our': 22,
 'elsewhere.': 23,
 'leak': 24,
 'of': 25,
 'on': 26,
 'prioritized': 27,
 'it': 28,
 'has': 29,
 'will': 30,
 'development': 31,
 'Subscribe': 32,
 'drones': 33,
 'Are': 34,
 'investing': 35,
 'Discord': 36,
 'carnage': 37,
 'South': 38,
 'findings,': 39,
 'in': 40,
 'assessment': 41,
 'you': 42,
 'platform': 43,
 'spotlight': 44,
 'defense': 45,
 'said': 46,
 'threat': 47,
 'military': 48,
 'incident.': 49,
 'incursion': 50,
 'obtained': 51,
 'channel': 52,
 'shortcomings,': 53,
 'Washington': 54,
 'volatile': 55,
 'responsible': 56,
 'naval': 57,
 'arsenal': 58,
 'other': 59,
 'state': 60,
 'growing': 61,
 'North': 62,
 'a': 63,
 'incoming': 64,
 'mes