In [1]:
import re, torch, torch.nn as nn

In [2]:
# Seed PyTorch's global RNG so random weight initialisation is repeatable
# when the notebook is run top to bottom.
torch.manual_seed(42)

<torch._C.Generator at 0x114ca0e50>

In [4]:
# Toy training corpus: ten one-sentence documents, grouped by topic.
# The order matters: `labels` is aligned with this list position by position.
docs =  [
  # movie-related documents (labelled 1)
  "Movies are fun for everyone.",
  "Watching movies is great fun.",
  "Enjoy a great movie today.",
  # research/science-related documents (labelled 3)
  "Research is interesting and important.",
  "Learning math is very important.",
  "Science discovery is interesting.",
  # music-related documents (labelled 2)
  "Rock is great to listen to.",
  "Listen to music for fun.",
  "Music is fun for everyone.",
  "Listen to folk music!"
]

In [None]:
# Class id (1-based) for each entry of `docs`, in the same order.
labels = [
    1, 1, 1,     # docs 0-2
    3, 3, 3,     # docs 3-5
    2, 2, 2, 2,  # docs 6-9
]
# number of distinct class ids
num_classes = len({*labels})

In [None]:
def tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first; every maximal run of word characters
    (letters, digits, underscore) becomes one token, so punctuation and
    whitespace act only as separators.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r"\w+", lowered)]

In [None]:
def get_vocabulary(texts: list[str]) -> dict[str, int]:
    """Build a token -> index mapping covering every distinct token in ``texts``.

    Tokens come from ``tokenize``; indices are assigned in sorted token
    order, starting at 0, so the mapping is deterministic for a given corpus.
    """
    unique_tokens: set[str] = set()
    for text in texts:
        unique_tokens.update(tokenize(text))
    ordered = sorted(unique_tokens)
    return dict(zip(ordered, range(len(ordered))))

In [12]:
# Preview: token list for each training document.
[tokenize(doc) for doc in docs]

[['movies', 'are', 'fun', 'for', 'everyone'],
 ['watching', 'movies', 'is', 'great', 'fun'],
 ['enjoy', 'a', 'great', 'movie', 'today'],
 ['research', 'is', 'interesting', 'and', 'important'],
 ['learning', 'math', 'is', 'very', 'important'],
 ['science', 'discovery', 'is', 'interesting'],
 ['rock', 'is', 'great', 'to', 'listen', 'to'],
 ['listen', 'to', 'music', 'for', 'fun'],
 ['music', 'is', 'fun', 'for', 'everyone'],
 ['listen', 'to', 'folk', 'music']]

In [39]:
# token -> column index, built over the whole training corpus
vocabulary: dict[str, int] = get_vocabulary(docs)
# sanity check: the corpus is expected to contain 26 distinct tokens
len(vocabulary) == 26

True

In [None]:
def doc_to_bow(doc: str, vocabulary: dict[str, int]) -> list[int]:
    """Encode one document as a binary bag-of-words vector.

    The result has one slot per vocabulary entry. A slot is 1 when the
    corresponding token occurs in ``doc`` (however many times) and 0
    otherwise. Tokens of ``doc`` that are not in ``vocabulary`` are ignored.
    """
    present = set(tokenize(doc))
    bow = [0] * len(vocabulary)
    # walk the vocabulary and switch on the slots of tokens seen in the document
    for token, index in vocabulary.items():
        if token in present:
            bow[index] = 1
    return bow


In [24]:
# Sanity check: encode the first document; the ones mark the vocabulary
# indices of the tokens it contains.
doc_to_bow(docs[0], vocabulary)

[0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Encode every document as a binary bag-of-words row:
# shape (len(docs), len(vocabulary)).
vectors = torch.tensor(
    [doc_to_bow(doc, vocabulary) for doc in docs],
    dtype=torch.float32
)
# nn.CrossEntropyLoss expects class indices starting at 0, whereas the raw
# labels are 1-based, hence the shift by one. `labels` is rebound from a list
# to a tensor here, so convert only while it is still the raw list: re-running
# this cell must not subtract 1 a second time (that would produce -1 targets).
if not isinstance(labels, torch.Tensor):
    labels = torch.tensor(labels, dtype=torch.long) - 1
# every document needs exactly one label
assert vectors.shape[0] == labels.shape[0], "docs and labels must have the same length"

In [41]:
# (number of documents, vocabulary size)
vectors.shape

torch.Size([10, 26])

In [42]:
# Layer sizes for the classifier.
input_dim = len(vocabulary) # one input feature per vocabulary token (26 here)
hidden_dim = 50 # width of the hidden layer
output_dim = num_classes # one output score per class (3 here)

class SimpleClassifier(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: number of features in each input row.
        hidden_dim: width of the hidden layer.
        output_dim: number of scores produced per row (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # input features -> hidden units
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # element-wise non-linearity between the two linear maps
        self.relu = nn.ReLU()
        # hidden units -> one raw score (logit) per class
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, output_dim) raw scores."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Instantiate the classifier; its weights start out randomly initialised.
model = SimpleClassifier(input_dim, hidden_dim, output_dim)

In [51]:
# Forward pass with the untrained model: one row of raw scores per document.
initial_logits = model(vectors)
print(initial_logits.shape)

initial_logits

torch.Size([10, 3])


tensor([[-0.1249,  0.0085, -0.1692],
        [-0.1310,  0.0526, -0.1209],
        [-0.1486,  0.2993, -0.0465],
        [-0.1381,  0.0110, -0.0692],
        [-0.0671,  0.0424, -0.0643],
        [-0.1242, -0.0400, -0.1442],
        [-0.0478,  0.1157, -0.0135],
        [ 0.0275,  0.0527, -0.1057],
        [-0.0227, -0.0882, -0.1626],
        [ 0.0333,  0.1109, -0.0120]], grad_fn=<AddmmBackward0>)

In [None]:
# CrossEntropyLoss takes the raw scores directly: it applies log-softmax
# internally, which is numerically more stable than a separate softmax layer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Full-batch training: every step uses all documents at once.
for step in range(3000):
    # forward pass and loss on the whole corpus
    logits = model(vectors)
    loss = criterion(logits, labels)
    # drop the previous step's gradients, back-propagate, apply one SGD update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [53]:
# Raw scores on the training documents after training; the largest value in
# each row marks the predicted class.
model(vectors)

tensor([[ 0.3078,  0.0190, -0.5881],
        [ 0.3254, -0.1905, -0.3217],
        [ 0.2623,  0.1236, -0.2714],
        [-0.4320, -0.3877,  0.7063],
        [-0.3091, -0.2410,  0.5366],
        [-0.4015, -0.4474,  0.6093],
        [-0.2790,  0.9255, -0.3921],
        [-0.2521,  1.3202, -0.7699],
        [-0.1704,  0.5199, -0.4183],
        [-0.3820,  1.2937, -0.5408]], grad_fn=<AddmmBackward0>)

In [59]:
new_docs = [
    "Listening to rock music is fun.",
    "I love science very much.",
    "I watch a lot of TV"
]
# index i holds the display name of class id i + 1
class_names = ["Cinema", "Music", "Science"]

# tokens that never occurred in the training corpus are ignored by doc_to_bow
new_doc_vectors = torch.tensor(
    [doc_to_bow(text, vocabulary) for text in new_docs],
    dtype=torch.float32,
)

# inference only, so skip autograd bookkeeping
with torch.no_grad():
    outputs = model(new_doc_vectors)
    print(outputs)
    # highest-scoring column per row, shifted back to the 1-based class ids
    predicted_ids = outputs.argmax(dim=1) + 1

for new_doc, class_id in zip(new_docs, predicted_ids.tolist()):
    print(f'{new_doc}: {class_names[class_id - 1]}')

tensor([[-0.3077,  0.6642, -0.3008],
        [-0.0592, -0.0435,  0.0097],
        [-0.1923,  0.0547, -0.0095]])
Listening to rock music is fun.: Music
I love science very much.: Science
I watch a lot of TV: Music
