## CS310 Natural Language Processing
## Assignment 3 (part 1). Recurrent Neural Networks for Language Modeling

**Total points**: 30

In this assignment, you will train a vanilla RNN language model on《论语》and evaluate its perplexity.

### 0. Import Necessary Libraries

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence
import itertools
import numpy as np
from typing import List


### (1) (5 points) Data preprocessing

In [16]:
# Adapted from https://github.com/Andras7/word2vec-pytorch/blob/master/word2vec/data_reader.py

import numpy as np

class CorpusReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, inputFileName, min_count:int = 5, lang="zh") :
        self.negatives = []
        self.discards = []
        self.negpos = 0

        self.word2id = dict()
        self.id2word = dict()
        self.token_count = 0
        self.word_frequency = dict()

        self.lang = lang
        self.inputFileName = inputFileName
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()

    def read_words(self, min_count):
        word_frequency = dict()
        for line in open(self.inputFileName, encoding="utf8"):
            if self.lang == "zh":
                words = list(line.strip())
            else:
                words = line.split()
            if len(words) > 0:
                for word in words:
                    self.token_count += 1
                    word_frequency[word] = word_frequency.get(word, 0) + 1
                    if self.token_count % 1000000 == 0:
                        print("Read " + str(int(self.token_count / 1000000)) + "M words.")

        wid = 0
        for w, c in sorted(word_frequency.items(), key=lambda x: x[1], reverse=True):
            if c < min_count: # filter out low frequency words
                continue
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1
        print("Total vocabulary: " + str(len(self.word2id)))

    def initTableDiscards(self):
        t = 0.0001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        pow_frequency = np.array(list(self.word_frequency.values())) ** (3/4)
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * CorpusReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)

    def getNegatives(self, target, size): 
        while True:
            response = self.negatives[self.negpos:self.negpos + size]
            self.negpos = (self.negpos + size) % len(self.negatives)
            if len(response) != size:
                response = np.concatenate((response, self.negatives[0:self.negpos]))
            if target in response: # prevent target word itself from being negative sample
                continue
            return response

In [17]:
input_file = 'lunyu_20chapters.txt'

corpus = CorpusReader(inputFileName=input_file, min_count=1)
word2id = corpus.word2id
id2word = corpus.id2word
vocab_size = len(word2id)
with open('lunyu_20chapters.txt', 'r', encoding='utf-8') as file:
    raw_data = [line.strip() for line in file]
print(raw_data[:10])
pad_token = '[PAD]'
pad_index = 0

word2id = {word: (index + 1) for word, index in word2id.items()}
if isinstance(id2word, dict):
    id2word = sorted(id2word.items(), key=lambda x: x[0])
    id2word = [word for _, word in id2word]

id2word.insert(pad_index, pad_token)
id2word = {index: word for index, word in enumerate(id2word)}
word2id[pad_token] = pad_index

print('id2word:', sorted(list(id2word.items()), key=lambda x: x[0])[:5])
print('word2id:', sorted(list(word2id.items()), key=lambda x: x[1])[:5])

Total vocabulary: 1352


['子曰：学而时习之，不亦说乎？有朋自远方来，不亦乐乎？人不知而不愠，不亦君子乎？', '有子曰：其为人也孝弟，而好犯上者，鲜矣；不好犯上而好作乱者，未之有也。君子务本，本立而道生。孝弟也者，其为仁之本与！', '子曰：巧言令色，鲜矣仁！', '曾子曰：吾日三省吾身：为人谋而不忠乎？与朋友交而不信乎？传不习乎？', '子曰：道千乘之国，敬事而信，节用而爱人，使民以时。', '子曰：弟子入则孝，出则弟，谨而信，泛爱众，而亲仁，行有余力，则以学文。', '子夏曰：贤贤易色；事父母，能竭其力；事君，能致其身；与朋友交，言而有信。虽曰未学，吾必谓之学矣。', '子曰：君子不重则不威，学则不固。主忠信，无友不如己者，过，则勿惮改。', '曾子曰：慎终追远，民德归厚矣。', '子禽问于子贡曰：夫子至于是邦也，必闻其政，求之与，抑与之与？子贡曰：夫子温、良、恭、俭、让以得之。夫子之求之也，其诸异乎人之求之与？']
id2word: [(0, '[PAD]'), (1, '，'), (2, '子'), (3, '。'), (4, '：')]
word2id: [('[PAD]', 0), ('，', 1), ('子', 2), ('。', 3), ('：', 4)]


### 2. Build the Model

### (2) (15 points) Model implementation. 
(a) Use torch.nn.RNN module 
(b) Do NOT use bidirectional network; multi-layer is fine.

In [18]:
def batchify(data: List, batch_size: int):

    assert batch_size < len(data) # data should be long enough
    result=[]
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        if i > len(data) - batch_size: # if the last batch is smaller than batch_size, pad it with the first few data
            batch = batch + data[:i + batch_size - len(data)]
        
        ## YOUR CODE HERE ###
        
        sequences = [torch.LongTensor([ord(c) for c in line]) for line in batch]
        result.append(sequences)
    return result

batch_size=16
dataloader=batchify(raw_data,batch_size)
print(dataloader[1])

[tensor([23376, 26352, 65306, 20026, 25919, 20197, 24503, 65292, 35692, 22914,
        21271, 36784, 65292, 23621, 20854, 25152, 32780, 20247, 26143, 20849,
        20043, 12290]), tensor([23376, 26352, 65306, 12298, 35799, 12299, 19977, 30334, 65292, 19968,
        35328, 20197, 34109, 20043, 65292, 26352, 65306, 24605, 26080, 37034,
        12290]), tensor([23376, 26352, 65306, 36947, 20043, 20197, 25919, 65292, 40784, 20043,
        20197, 21009, 65292, 27665, 20813, 32780, 26080, 32827, 12290, 36947,
        20043, 20197, 24503, 65292, 40784, 20043, 20197, 31036, 65292, 26377,
        32827, 19988, 26684, 12290]), tensor([23376, 26352, 65306, 21566, 21313, 26377, 20116, 32780, 24535, 20110,
        23398, 65292, 19977, 21313, 32780, 31435, 65292, 22235, 21313, 32780,
        19981, 24785, 65292, 20116, 21313, 32780, 30693, 22825, 21629, 65292,
        20845, 21313, 32780, 32819, 39034, 65292, 19971, 21313, 32780, 20174,
        24515, 25152, 27442, 65292, 19981, 36926, 30697, 12290

In [19]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        super(RNNLanguageModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, sequences):
        seq_ids = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        seq_lens = torch.tensor([len(seq) for seq in seq_ids], dtype=torch.long)

        seq_ids_padded = pad_sequence(seq_ids, batch_first=True, padding_value=0)
        # print(seq_ids_padded.size())
        seq_ids_padded = torch.clamp(seq_ids_padded, 0, self.embedding.num_embeddings - 1)

        seq_embs = self.embedding(seq_ids_padded)
        seq_embs_packed = pack_padded_sequence(seq_embs, seq_lens, batch_first=True, enforce_sorted=False)


        out_packed, _ = self.rnn(seq_embs_packed)
        out_unpacked, _ = pad_packed_sequence(out_packed, batch_first=True)

        logits = self.fc(out_unpacked)
        log_probs = F.log_softmax(logits, dim=-1)

        targets_padded = torch.cat((seq_ids_padded[:, 1:], torch.zeros(seq_ids_padded.shape[0], 1, dtype=torch.long)), dim=1)

        
        return log_probs,targets_padded


### 3. Train and Evaluate

#### (3) (10 points) Evaluation and extended experiment.
#### (3) (a) Report perplexity on training set,

In [20]:

num_epochs = 5  
embedding_dim = 50  
hidden_dim = 100  
output_dim = vocab_size  
num_layers = 2  
model = RNNLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers)

loss_function = nn.NLLLoss(ignore_index=0, reduction='none')

n_epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs) # or torch.optim.lr_scheduler.StepLR()


for epoch in range(n_epochs):
    model.train()
    total_perplexity = 0
    total_loss = 0.0
    total_words = 0  
    for inputs in dataloader:
        model.zero_grad()


        output,targets = model(inputs)

        targets_flat = targets.view(-1)
        log_probs_flat = output.view(-1, output.size(-1))
        loss = loss_function(log_probs_flat, targets_flat)


        batch_loss = loss.sum()
        total_loss += batch_loss.item()  
        non_ignored_targets = targets_flat[targets_flat != 0]
        total_words += non_ignored_targets.size(0)
        batch_loss.backward()

        optimizer.step()
    
    average_loss = total_loss / total_words 
    perplexity = torch.exp(torch.tensor(average_loss))  
    scheduler.step()
    print(f'Epoch {epoch+1}, Perplexity: {perplexity.item()}')

print('Training completed.')

  seq_ids = [torch.tensor(seq, dtype=torch.long) for seq in sequences]


Epoch 1, Perplexity: 6.993050575256348
Epoch 2, Perplexity: 1.0415560007095337
Epoch 3, Perplexity: 1.0183625221252441
Epoch 4, Perplexity: 1.0150909423828125
Epoch 5, Perplexity: 1.0140013694763184
Training completed.


### 4. Experiments

#### (3) (b) Generate some sentences

In [72]:
def generate_sentence(model, start_words, end_word, max_length, word2id, id2word, temperature=1.0):

    model.eval() 
    sentence_indices = [word2id[word] for word in start_words] 
    
    for _ in range(max_length):
        
        with torch.no_grad():
            log_probs, _ = model([sentence_indices])
        
        probabilities = F.softmax(log_probs[:, -1, :] / temperature, dim=-1).squeeze()
        
        next_token_idx = torch.multinomial(probabilities, 1).item()
        sentence_indices.append(next_token_idx)
        
        if next_token_idx == word2id[end_word]:
            break
    
    generated_sentence = [id2word[idx] for idx in sentence_indices]
    return ' '.join(generated_sentence)

In [120]:
start_words = ['子', '曰','：']
end_word = '。'

max_length = 50 
temperature = 2  
for i in range(5):
    generated_sentence = generate_sentence(model, start_words, end_word, max_length, word2id, id2word, temperature)
    print("Generated Sentence Indices:", generated_sentence)

Generated Sentence Indices: 子 曰 ： 咎 系 裘 生 巽 占 沮 便 荆 肥 木 他 腥 畜 苟 王 尧 祝 沮 图 乘 宾 佾 割 废 而 虽 诺 衣 口 封 莞 喟 诗 助 材 念 罔 损 道 宫 匡 质 桴 亡 礼 羊 列 鞟 善
Generated Sentence Indices: 子 曰 ： 吉 享 冕 巽 给 锦 产 一 日 虐 虐 章 虐 愆 羹 请 母 由 久 。
Generated Sentence Indices: 子 曰 ： 同 撤 证 踖 德 廉 忿 其 黄 玄 穆 本 挚 报 斯 釜 惜 壮 止 冯 问 乎 雌 狷 区 别 彬 棘 士 赉 尝 悲 余 贾 童 浸 且 予 细 族 肸 洁 鞟 , 袍 阼 亚 兕 独 各
Generated Sentence Indices: 子 曰 ： 姓 虎 星 诱 侗 侗 舜 侮 为 修 宫 陶 饰 吉 探 均 施 则 弓 悾 亵 使 争 俭 利 绀 弓 用 兢 鮀 劝 孟 简 拖 崩 圃 费 色 颠 厩 民 诱 空 优 诔 火 观 悾 牡 讷
Generated Sentence Indices: 子 曰 ： 首 忿 柴 矜 谋 物 澹 门 盍 他 诔 阈 躁 伊 馈 衣 迹 愆 洫 门 意 获 容 虐 虐 反 帝 虐 国 尚 志 甚 对 病 翼 萧 忿 没 目 失 卿 罢 萧 静 虐 希 虐 伐 本 憎


### (3) (c) Compare  the perplexity on two conditions

In [23]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, emb_size,word2id):
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.emb_v = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.emb_u = nn.Embedding(vocab_size, emb_size, sparse=True)
        self.word2id = word2id
        initrange = 1.0 / self.emb_size 
        nn.init.uniform_(self.emb_v.weight.data, -initrange, initrange) 
        nn.init.constant_(self.emb_u.weight.data, 0) 

    def forward(self, center, outside, negative):

        v_c = self.emb_v(center)
        u_o = self.emb_u(outside)
        u_n = self.emb_u(negative)
        
        ### YOUR CODE HERE ###
        positive_dot_product = torch.mul(v_c, u_o).sum(dim=1)
        positive_score = F.logsigmoid(torch.clamp(positive_dot_product, min=-10, max=10))

        negative_dot_product = torch.bmm(u_n, v_c.unsqueeze(2)).squeeze(2)
        negative_score = F.logsigmoid(torch.clamp(-negative_dot_product, min=-10, max=10))

        loss = -torch.sum(positive_score + torch.sum(negative_score, dim=1))  # Scalar

        return loss
    
    def save_embedding(self, id2word, file_name):
        embedding = self.emb_v.weight.cpu().data.numpy()
        with open(file_name, 'w') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_size))
            for wid, w in id2word.items():
                e = ' '.join(map(lambda x: str(x), embedding[wid]))
                f.write('%s %s\n' % (w, e))

In [24]:
# Re-use the code from lab with necessary modifications

def a2_generate_data(words: List[str], window_size: int, k: int, corpus: CorpusReader):

    ### YOUR CODE HERE ###
    word_ids = [corpus.word2id[word] for word in words if word in corpus.word2id]  
    result=[]
    for i, center_word_id in enumerate(word_ids):
        start = max(0, i - window_size)
        end = min(len(word_ids), i + window_size + 1)
        outside_word_ids = [word_ids[j] for j in range(start, end) if j != i]
        for i in outside_word_ids:
            negative_samples = corpus.getNegatives(center_word_id, k)
            result.append((center_word_id,i,negative_samples))
    return result
    ### END YOUR CODE ###

def a2_batchify(data: List, batch_size: int):
    """ Group a stream into batches and yield them as torch tensors.
    Args:
        data: a list of tuples
        batch_size: the batch size 
    Yields:
        a tuple of three torch tensors: center, outside, negative
    """
    assert batch_size < len(data) 
    result=[]
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        if i > len(data) - batch_size: 
            batch = batch + data[:i + batch_size - len(data)]
        
        ## YOUR CODE HERE ###
        
        center_ids = torch.tensor([item[0] for item in batch], dtype=torch.long)
        outside_ids = torch.tensor([item[1] for item in batch], dtype=torch.long)
        negative_ids = torch.tensor(np.vstack([item[2] for item in batch]), dtype=torch.long)

        result.append((center_ids, outside_ids, negative_ids))
    return result
        ### END YOUR CODE ###


In [25]:

emb_size=50
k=5
window_size=1
a2_corpus = CorpusReader(inputFileName="lunyu_20chapters.txt", min_count=1)
with open('lunyu_20chapters.txt', 'r', encoding='utf-8') as file:
    a2_raw_data = file.read()
a2_raw_data=a2_raw_data.replace('\n','')
print(list(a2_raw_data))
data = list(a2_generate_data(list(a2_raw_data), window_size, k, corpus=a2_corpus))
# print(data)
dataloader = a2_batchify(data,batch_size)

Total vocabulary: 1352
['子', '曰', '：', '学', '而', '时', '习', '之', '，', '不', '亦', '说', '乎', '？', '有', '朋', '自', '远', '方', '来', '，', '不', '亦', '乐', '乎', '？', '人', '不', '知', '而', '不', '愠', '，', '不', '亦', '君', '子', '乎', '？', '有', '子', '曰', '：', '其', '为', '人', '也', '孝', '弟', '，', '而', '好', '犯', '上', '者', '，', '鲜', '矣', '；', '不', '好', '犯', '上', '而', '好', '作', '乱', '者', '，', '未', '之', '有', '也', '。', '君', '子', '务', '本', '，', '本', '立', '而', '道', '生', '。', '孝', '弟', '也', '者', '，', '其', '为', '仁', '之', '本', '与', '！', '子', '曰', '：', '巧', '言', '令', '色', '，', '鲜', '矣', '仁', '！', '曾', '子', '曰', '：', '吾', '日', '三', '省', '吾', '身', '：', '为', '人', '谋', '而', '不', '忠', '乎', '？', '与', '朋', '友', '交', '而', '不', '信', '乎', '？', '传', '不', '习', '乎', '？', '子', '曰', '：', '道', '千', '乘', '之', '国', '，', '敬', '事', '而', '信', '，', '节', '用', '而', '爱', '人', '，', '使', '民', '以', '时', '。', '子', '曰', '：', '弟', '子', '入', '则', '孝', '，', '出', '则', '弟', '，', '谨', '而', '信', '，', '泛', '爱', '众', '，', '而', '亲', '仁', '，', '行', '有', '余', '

In [33]:
def calculate_perplexity(model, dataloader, device):
    model.eval()  
    total_loss = 0.0
    total_words = 0
    count=0
    with torch.no_grad():  
        for center, outside, negative in dataloader:
            center, outside, negative = center.to(device), outside.to(device), negative.to(device)
            
            loss = model(center, outside, negative)
            
            total_loss += loss.sum().item() 
            count+=1
            # if count%300==0:
            #     print("total_loss:",total_loss)
            total_words += center.size(0)  
            
    average_loss = total_loss / total_words  
    perplexity = torch.exp(torch.tensor(average_loss, device=device)) 
    
    return perplexity.item()


def load_embeddings(file_name):
    with open(file_name, 'r') as f:
        vocab_size, emb_size = map(int, f.readline().split())
        embeddings_matrix = np.zeros((vocab_size, emb_size), dtype=np.float32)
        word_to_idx = {}
        for idx, line in enumerate(f):
            parts = line.strip().split()
            word = parts[0]
            vec = np.array(parts[1:], dtype=np.float32)
            embeddings_matrix[idx, :] = vec
            word_to_idx[word] = idx
    return embeddings_matrix, vocab_size, emb_size

device = torch.device('cpu')

import torch
import time

def generate_random_embeddings(vocab_size, emb_size):
    torch.manual_seed(int(time.time())) 
    embeddings = torch.empty(vocab_size, emb_size)
    embeddings.uniform_(-1.0 / emb_size, 1.0 / emb_size)
    return embeddings

skipgram_embeddings, vocab_size, emb_size = load_embeddings(f'embeddings_{emb_size}_{k}_{window_size}.txt')
min_val, max_val = skipgram_embeddings.min(), skipgram_embeddings.max()
embeddings_norm = (skipgram_embeddings - min_val) / (max_val - min_val)
target_min, target_max = -1.0 / emb_size, 1.0 / emb_size
embeddings_scaled = (embeddings_norm * (target_max - target_min)) + target_min
skipgram_model = SkipGram(vocab_size, emb_size, a2_corpus.word2id).to(device)
skipgram_embeddings_tensor = torch.tensor(embeddings_scaled, dtype=torch.float)
skipgram_model.emb_v.weight.data.copy_(skipgram_embeddings_tensor)
skipgram_model.emb_u.weight.data.copy_(skipgram_embeddings_tensor)

print("perplexity of pretrained embeddings:")
print(calculate_perplexity(skipgram_model,dataloader,device))

random_embeddings = generate_random_embeddings(vocab_size, emb_size)
random_model = SkipGram(vocab_size, 50, a2_corpus.word2id).to(device)
random_embeddings_tensor = random_embeddings.clone().detach()
random_model.emb_v.weight.data.copy_(random_embeddings_tensor)
random_model.emb_u.weight.data.copy_(random_embeddings_tensor)

print("perplexity of randomly initialized embeddings:")
print(calculate_perplexity(random_model,dataloader,device))

perplexity of pretrained embeddings:
64.01287841796875
perplexity of randomly initialized embeddings:
63.99871826171875
