In [1]:
import torch

%load_ext autoreload
%autoreload 2

from utils import pad_sents_char
from collections import Counter
from itertools import chain


[nltk_data] Downloading package punkt to
[nltk_data]     /home/hy2632_ubuntu20/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
word2id = dict()

In [3]:
word2id['<pad>'] = 0  # Pad Token
word2id['<s>'] = 1  # Start Token
word2id['</s>'] = 2  # End Token
word2id['<unk>'] = 3  # Unknown Token
word2id['a'] = 4


In [4]:
word2id.get('b','nothing')

'nothing'

In [5]:
char2id = dict(zip([i for i in '/{/}abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'], [i for i in range(1,55)]))

In [6]:
# words2charindices将句子转换为char level indices
def words2charindices(sents):
    """ Convert list of sentences of words into list of list of list of character indices.
    @param sents (list[list[str]]): sentence(s) in words
    @return word_ids (list[list[list[int]]]): sentence(s) in indices
    """
    return [[[char2id.get(c, '<unk>') for c in ("{" + w + "}")] for w in s] for s in sents]

In [7]:
sents = [['This', 'is', 'a', 'sentence'],['Hello', 'World']]

In [8]:
words2charindices(sents)

[[[2, 50, 12, 13, 23, 4],
  [2, 13, 23, 4],
  [2, 5, 4],
  [2, 23, 9, 18, 24, 9, 18, 7, 9, 4]],
 [[2, 38, 9, 16, 16, 19, 4], [2, 53, 19, 22, 16, 8, 4]]]

In [9]:
sents_var = torch.tensor(pad_sents_char(words2charindices(sents), 0))

In [10]:
sents_var.view([sents_var.size()[1], sents_var.size()[0], -1]).contiguous()

tensor([[[ 2, 50, 12, 13, 23,  4,  0,  0,  0,  0],
         [ 2, 13, 23,  4,  0,  0,  0,  0,  0,  0]],

        [[ 2,  5,  4,  0,  0,  0,  0,  0,  0,  0],
         [ 2, 23,  9, 18, 24,  9, 18,  7,  9,  4]],

        [[ 2, 38,  9, 16, 16, 19,  4,  0,  0,  0],
         [ 2, 53, 19, 22, 16,  8,  4,  0,  0,  0]],

        [[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]])

In [11]:
word_freq = Counter(chain(*['a','b','c','a', 'a', 'b']))
word_freq

Counter({'a': 3, 'b': 2, 'c': 1})

In [12]:
valid_words = ['a','b','c']
sorted(valid_words, key=lambda w:word_freq[w], reverse=True)

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.

['a', 'b', 'c']




In [13]:
a = torch.randn(3)
b = torch.randn(3)
a, b, a*b

(tensor([0.0644, 0.2782, 0.0841]),
 tensor([-0.1281,  1.2218,  2.1774]),
 tensor([-0.0082,  0.3399,  0.1831]))

In [14]:
1-a

tensor([0.9356, 0.7218, 0.9159])

In [15]:
a

tensor([0.0644, 0.2782, 0.0841])

In [16]:
torch.tensor(1) - a

tensor([0.9356, 0.7218, 0.9159])

In [17]:
a = torch.randn(5,60)
a.size()

torch.Size([5, 60])

In [18]:
a.view(a.size()[0], 12, -1).size()

torch.Size([5, 12, 5])

In [20]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
CS224N 2019-20: Homework 5
"""

import torch
import torch.nn as nn

import torch.nn.functional as F


class CNN(nn.Module):
    """ CNN, mapping x_reshaped to x_conv_out \n
        @param filters (int): number of filters \n
        @param kernel_size (int): default as k=5 \n
        @param stride (int): default as stride=1 \n
    """

    # Remember to delete the above 'pass' after your implementation
    ### YOUR CODE HERE for part 1g
    def __init__(
        self,
        e_char: int = 50,
        m_word: int = 12,
        k: int = 5,
        padding: int = 1,
        f: int = 50, #by default, f = e_char
    ):
        super(CNN, self).__init__()
        self.e_char = e_char
        self.m_word = m_word
        self.k = k
        self.padding = padding
        self.f = f
        self.conv1d = nn.Conv1d(
            in_channels=self.e_char,
            out_channels=self.f,
            kernel_size=self.k,
            padding=self.padding,
            bias=True,
        )
        # maxpool from e_char*(m_word - k + 1) to e_char
        self.maxpool = nn.MaxPool1d(kernel_size=self.m_word -
                                     self.k + 1 + 2*self.padding)

    def forward(
        self,
        x_reshaped: torch.Tensor,
    ) -> torch.Tensor:
        """ Map from x_reshaped to x_conv_out\n
            @param x_reshaped (Tensor): Tensor with shape of (batch_size, sentence_length, m_word, e_char) \n
            @return x_conv_out (Tensor) : Tensor with shape of (batch_size, sentence_length, e_char) \n
        """
        batch_size = x_reshaped.size()[0]
        sentence_length = x_reshaped.size()[1] 
        x_conv = self.conv1d(x_reshaped.view(sentence_length*batch_size, self.e_char, self.m_word))

        x_conv_out = self.maxpool(F.relu(x_conv)).squeeze(-1)
        x_conv_out = x_conv_out.view(sentence_length, batch_size, -1)
        return x_conv_out

    ### END YOUR CODE


In [21]:
a_cnn = CNN(e_char = 50,
        m_word = 12,
        k = 5,
        padding = 1,
        f = 50)

In [22]:
SENTENCE_LENGTH = 20
BATCH_SIZE = 5
E_CHAR = 50
M_WORD = 12
x = torch.randn(BATCH_SIZE, SENTENCE_LENGTH, M_WORD, E_CHAR)
x_conv = a_cnn.conv1d(x.view(SENTENCE_LENGTH*BATCH_SIZE, E_CHAR, M_WORD))#(100, 50, 10) = (SENTENCE_LENGTH*BATCH_SIZE, f, (M_word-k+1))
x_conv_out = a_cnn.maxpool(F.relu(x_conv)).squeeze(-1)
x_conv_out = x_conv_out.view(SENTENCE_LENGTH,BATCH_SIZE,-1)
x_conv_out.size()

torch.Size([20, 5, 50])

In [24]:
from vocab import VocabEntry

In [25]:
vocab1 = VocabEntry()

In [28]:
len(vocab1.id2char)

96

In [29]:
sentence_length, batch_size, max_word_length = 20, 5, 10
x_padded = torch.randint(0, 96, (sentence_length, batch_size, max_word_length))

In [31]:
x_padded.size()

torch.Size([20, 5, 10])

In [None]:
# firstly findout the 

In [32]:
len(vocab1.char2id)

96