In [1]:
from types import SimpleNamespace
from collections import Counter
import os
import re
import pathlib
import array
import pickle
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.nn.functional as F


In [2]:
# Dataset variant; this string is embedded in both the input and the working paths below.
DATASET_VERSION = 'ca-100'
# Directory holding the competition CSV files (x_valid.csv, y_valid.csv, x_test.csv).
COMPETITION_ROOT = '../input/wordvectors'
# Directory with the preprocessed vocabulary and .npz splits for the chosen dataset version.
DATASET_ROOT = f'../input/text-preprocessing/data/{DATASET_VERSION}'
# Output directory (created later in the notebook) under which the trained model is saved.
WORKING_ROOT = f'data/{DATASET_VERSION}'
# File-name prefix shared by the dataset files.
DATASET_PREFIX = 'ca.wiki'

In [3]:
# Hyperparameters and file locations used by the cells below.
params = SimpleNamespace(
    embedding_dim = 300, # embedding size passed to CBOW (alternative tried: 100)
    batch_size = 1000, # samples per batch for both training and evaluation
    epochs = 5, # number of passes over the training split
    preprocessed = f'{DATASET_ROOT}/{DATASET_PREFIX}', # prefix of the .vocab / .npz input files
    working = f'{WORKING_ROOT}/{DATASET_PREFIX}', # NOTE(review): not read by any visible cell
    modelname = f'{WORKING_ROOT}/{DATASET_VERSION}.pt', # where the model state_dict is saved after training
    train = True, # NOTE(review): not read by any visible cell; training always runs
    weighting_scheme = 'trained_vector', # Options: 'uniform', 'fixed_scalar', 'trained_scalar', 'trained_vector'. NOTE(review): not passed to the CBOW constructor in the model-creation cell
)

In [4]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and contiguous integer indices.

    Special tokens (padding, unknown, end-of-sequence) are inserted first, in
    that order, so with the defaults they receive indices 0, 1 and 2. Passing
    ``None`` for a special token disables it; the corresponding ``*_index``
    attribute is then ``None`` instead of being left undefined.
    """

    def __init__(self, pad_token='<pad>', unk_token='<unk>', eos_token='<eos>'):
        self.token2idx = {}
        self.idx2token = []
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.eos_token = eos_token
        # Always define the index attributes so later lookups never hit an
        # AttributeError when a special token has been disabled.
        self.pad_index = None if pad_token is None else self.add_token(pad_token)
        self.unk_index = None if unk_token is None else self.add_token(unk_token)
        self.eos_index = None if eos_token is None else self.add_token(eos_token)

    def add_token(self, token):
        """Insert ``token`` if it is new and return its index."""
        if token not in self.token2idx:
            self.idx2token.append(token)
            self.token2idx[token] = len(self.idx2token) - 1
        return self.token2idx[token]

    def _lookup(self, token):
        """Return the index of one token, falling back to the unknown index.

        Raises:
            KeyError: if the token is unknown and no unknown token is configured.
        """
        try:
            return self.token2idx[token]
        except KeyError:
            if self.unk_index is None:
                raise KeyError(
                    f'token {token!r} is not in the vocabulary and no unk_token is configured'
                ) from None
            return self.unk_index

    def get_index(self, token):
        """Map a token (``str``) or an iterable of tokens to index / list of indices.

        Unknown tokens map to ``unk_index``; if the vocabulary was built
        without an unknown token, a ``KeyError`` is raised for them instead.
        """
        if isinstance(token, str):
            return self._lookup(token)
        return [self._lookup(t) for t in token]

    def get_token(self, index):
        """Return the token stored at position ``index``."""
        return self.idx2token[index]

    def __len__(self):
        return len(self.idx2token)

    def save(self, filename):
        """Pickle the instance attributes to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, filename):
        """Replace the instance attributes with those pickled in ``filename``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        vocabulary files that come from a trusted source.
        """
        with open(filename, 'rb') as f:
            self.__dict__.update(pickle.load(f))

In [5]:
def batch_generator(idata, target, batch_size, shuffle=True):
    """Yield ``(inputs, labels)`` mini-batches covering every sample once.

    When ``shuffle`` is true the samples are visited in a random order drawn
    from ``np.random.permutation``; otherwise in their stored order. If
    ``target`` is ``None`` the second element of every yielded pair is ``None``.
    The last batch may be smaller than ``batch_size``.
    """
    nsamples = len(idata)
    order = np.random.permutation(nsamples) if shuffle else range(nsamples)

    for start in range(0, nsamples, batch_size):
        selection = order[start:start + batch_size]
        inputs = idata[selection]
        labels = None if target is None else target[selection]
        yield inputs, labels

CBOW model
----------
You can add new parameters to the model in the *\_\_init\_\_()* method with *self.register_buffer()* (for parameters not to be trained):

    self.register_buffer('position_weight', torch.tensor([1,2,3,3,2,1], dtype=torch.float32))

or *nn.Parameter()* (for parameters to be trained)

    self.position_weight = nn.Parameter(torch.tensor([1,2,3,3,2,1], dtype=torch.float32))
    
In both cases, you can reference and use them in the *forward* method as

    self.position_weight

In [6]:
class CBOW(nn.Module):
    """CBOW model that combines the context embeddings with per-position weights.

    Args:
        num_embeddings: size of the vocabulary (rows of the embedding table and
            number of output logits).
        embedding_dim: dimensionality of the token embeddings.
        weighting_scheme: how the six context positions are combined:
            'uniform' (plain mean), 'fixed_scalar' (constant weights
            1,2,3,3,2,1), 'trained_scalar' (one learnable weight per position)
            or 'trained_vector' (one learnable weight per position and
            embedding dimension).

    Raises:
        ValueError: if ``weighting_scheme`` is not one of the options above.
    """

    WEIGHTING_SCHEMES = ('uniform', 'fixed_scalar', 'trained_scalar', 'trained_vector')

    def __init__(self, num_embeddings, embedding_dim, weighting_scheme='uniform'):
        super().__init__()
        # Fail at construction time instead of on the first forward pass.
        if weighting_scheme not in self.WEIGHTING_SCHEMES:
            raise ValueError(f"Unknown weighting scheme: {weighting_scheme}")

        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)
        self.weighting_scheme = weighting_scheme
        self.context_size = 6  # 3 before, 3 after

        if weighting_scheme == 'fixed_scalar':
            # Register the buffer directly. Assigning the attribute first and
            # then calling register_buffer() with the same name raises
            # "KeyError: attribute 'position_weights' already exists".
            self.register_buffer(
                'position_weights',
                torch.tensor([1, 2, 3, 3, 2, 1], dtype=torch.float32),
            )
        elif weighting_scheme == 'trained_scalar':
            self.position_weights = nn.Parameter(torch.ones(self.context_size))
        elif weighting_scheme == 'trained_vector':
            self.position_weights = nn.Parameter(torch.ones(self.context_size, embedding_dim))

    def forward(self, input):
        """Return vocabulary logits for a batch of context windows.

        Args:
            input: integer tensor of shape (batch_size, context_size).

        Returns:
            Tensor of shape (batch_size, num_embeddings) with unnormalised scores.

        Raises:
            ValueError: if the second dimension of ``input`` differs from
                ``self.context_size`` or the weighting scheme is unknown.
        """
        if input.size(1) != self.context_size:
            raise ValueError(f"Expected input context size of {self.context_size}, got {input.size(1)}")

        # Embedding lookup: (batch_size, context_size, embedding_dim)
        embedded = self.emb(input)

        if self.weighting_scheme == 'uniform':
            # Simple average
            context_vec = embedded.mean(dim=1)
        else:
            if self.weighting_scheme in ('fixed_scalar', 'trained_scalar'):
                # One scalar per position, broadcast as (1, context_size, 1)
                weights = self.position_weights.view(1, -1, 1)
            elif self.weighting_scheme == 'trained_vector':
                # One vector per position: (1, context_size, embedding_dim)
                weights = self.position_weights.view(1, self.context_size, -1)
            else:
                raise ValueError(f"Unknown weighting scheme: {self.weighting_scheme}")
            # Weighted sum over positions, normalised by the total weight
            # (per embedding dimension for 'trained_vector').
            context_vec = (embedded * weights).sum(dim=1) / weights.sum(dim=1)

        # Final prediction: (batch_size, num_embeddings)
        return self.lin(context_vec)

class CBOW(nn.Module):
    """CBOW variant with one learnable scalar weight per context position.

    The six position weights are passed through a softmax, so the context
    vector is a convex combination of the six context embeddings.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.context_size = 6
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)
        # Start with a slight preference for the positions next to the centre word.
        initial_scores = torch.tensor([0.8, 0.9, 1.0, 1.0, 0.9, 0.8])
        self.position_weights = nn.Parameter(initial_scores)

    def forward(self, input):
        """Return vocabulary logits for a (batch, 6) tensor of context indices."""
        position_probs = torch.softmax(self.position_weights, dim=0)
        token_vectors = self.emb(input)
        # Broadcast the per-position probabilities over batch and embedding dims.
        weighted = token_vectors * position_probs[None, :, None]
        return self.lin(weighted.sum(dim=1))


class CBOW(nn.Module):
    """CBOW variant with sigmoid-gated position weights, LayerNorm and dropout.

    Args:
        num_embeddings: size of the vocabulary.
        embedding_dim: dimensionality of the token embeddings.
        context_size: number of context words on each side of the centre word;
            the model expects inputs with ``2 * context_size`` positions.
    """

    def __init__(self, num_embeddings, embedding_dim, context_size=3):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=True)  # Bias can help learning
        self.context_size = context_size  # Flexible context size
        self.dropout = nn.Dropout(0.2)  # Regularization

        # Learnable position-dependent weights, initialised as 1/distance to
        # the (absent) centre word. The input holds only context words, so
        # positions context_size-1 and context_size are both at distance 1;
        # the initialisation is therefore symmetric, e.g. for context_size=3:
        # [1/3, 1/2, 1, 1, 1/2, 1/3].
        distances = [context_size - i for i in range(context_size)]
        distances += [i + 1 for i in range(context_size)]
        init_weights = torch.tensor([1.0 / d for d in distances], dtype=torch.float32)
        self.position_weights = nn.Parameter(init_weights)

        # Layer normalization for stability
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, input):
        """Return vocabulary logits for a (batch, 2*context_size) index tensor."""
        embedded = self.emb(input)  # (batch_size, context_size*2, embedding_dim)

        # Squash each position weight into (0, 1); shape (1, context_size*2, 1).
        # torch.sigmoid replaces the deprecated F.sigmoid.
        weights = torch.sigmoid(self.position_weights).view(1, -1, 1)

        context_vec = (embedded * weights).sum(dim=1)  # Weighted sum
        context_vec = self.norm(context_vec)  # Normalize embeddings
        context_vec = self.dropout(context_vec)  # Regularization

        return self.lin(context_vec)  # Final output logits

In [7]:
class CBOW(nn.Module):
    """CBOW variant that encodes the context window with one pre-norm attention block.

    The context tokens are embedded, summed with learnable positional
    embeddings, passed through multi-head self-attention and a feed-forward
    network (both with residual connections), mean-pooled over the context
    positions and projected to vocabulary logits.

    Args:
        num_embeddings: size of the vocabulary.
        embedding_dim: dimensionality of the token embeddings; must be
            divisible by ``num_heads``.
        context_size: number of context words on each side of the centre word;
            inputs must have ``2 * context_size`` positions.
        num_heads: number of attention heads.
        dropout: dropout probability used on attention weights, on the
            attention output and inside the feed-forward network.
    """

    def __init__(self, num_embeddings, embedding_dim, context_size=3, num_heads=4, dropout=0.2):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.num_heads = num_heads

        # Core embeddings
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)

        # Multi-head attention components
        self.head_dim = embedding_dim // num_heads
        assert self.head_dim * num_heads == embedding_dim, "Embedding dimension must be divisible by number of heads"

        # Query, key, value projections
        self.q_proj = nn.Linear(embedding_dim, embedding_dim)
        self.k_proj = nn.Linear(embedding_dim, embedding_dim)
        self.v_proj = nn.Linear(embedding_dim, embedding_dim)

        # Output projection
        self.o_proj = nn.Linear(embedding_dim, embedding_dim)

        # Positional encodings (learnable). Allocated uninitialised here: the
        # values are always set by _init_weights() below.
        self.pos_emb = nn.Parameter(torch.empty(2 * context_size, embedding_dim))

        # Final prediction layer
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=True)

        # Normalization and regularization
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)

        # Feed-forward network
        self.ff_network = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(embedding_dim * 4, embedding_dim),
            nn.Dropout(dropout)
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialise embeddings (normal, std 0.02) and linear layers (Xavier, zero bias)."""
        nn.init.normal_(self.emb.weight, mean=0, std=0.02)
        # normal_ also overwrites the padding row, which receives no gradient
        # and would stay random forever; restore it to zeros.
        if self.emb.padding_idx is not None:
            with torch.no_grad():
                self.emb.weight[self.emb.padding_idx].zero_()
        nn.init.normal_(self.pos_emb, mean=0, std=0.02)

        for module in [self.q_proj, self.k_proj, self.v_proj, self.o_proj,
                       self.ff_network[0], self.ff_network[3]]:
            nn.init.xavier_uniform_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return vocabulary logits for a (batch, 2*context_size) index tensor.

        Raises:
            ValueError: if the number of context positions in ``x`` does not
                match the number of positional embeddings.
        """
        batch_size = x.shape[0]
        context_len = x.shape[1]
        if context_len != self.pos_emb.shape[0]:
            raise ValueError(
                f"Expected input context size of {self.pos_emb.shape[0]}, got {context_len}"
            )

        # Token embeddings plus positional encodings (broadcast over the batch)
        x = self.emb(x) + self.pos_emb  # [batch, context_len, emb_dim]

        # Apply multi-head attention (pre-norm)
        residual = x
        x = self.norm1(x)

        # Projections for multi-head attention: [batch, heads, context_len, head_dim]
        q = self.q_proj(x).view(batch_size, context_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, context_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, context_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention scores: [batch, heads, context_len, context_len]
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)

        # Apply attention to values and merge the heads back: [batch, context_len, emb_dim]
        context = torch.matmul(attn_probs, v)
        context = context.transpose(1, 2).reshape(batch_size, context_len, self.embedding_dim)

        # Output projection
        context = self.o_proj(context)

        # Residual connection around the attention sub-layer
        x = residual + self.dropout(context)

        # Feed-forward network with residual connection (pre-norm)
        residual = x
        x = self.norm2(x)
        x = residual + self.ff_network(x)

        # Pool the context representations (mean pooling)
        x = x.mean(dim=1)  # [batch, emb_dim]

        # Final prediction
        return self.lin(x)

We can visually show how this model works:

<div style="text-align:center">
    <img src="https://drive.google.com/uc?id=1INeftgKRCS0QWZu9InCDZ42a5fph3jQX" alt="drawing" width="800" height="800"/>
</div>

We train the model to learn the matrices $\mathbf{E}$ and $\mathbf{W}$. The probabilities $P(w_c|w_{c-m},\cdots,w_{c-1},w_{c+1},\cdots,w_{c+m})$ can be obtained by applying the softmax function to the output vector $\mathbf{z}$. However, we do not need to apply it explicitly: our loss function [nn.CrossEntropyLoss()](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) takes care of this.


In [8]:
def load_preprocessed_dataset(prefix):
    """Load the precomputed vocabulary and the train/valid/test arrays.

    Reads ``{prefix}.vocab`` into a ``Vocabulary`` and, for each split,
    the ``idata`` and ``target`` arrays stored in ``{prefix}.{split}.npz``.

    Returns:
        ``(token_vocab, data)`` where ``data`` is a list of
        ``(idata, target)`` tuples in the order train, valid, test.
    """
    token_vocab = Vocabulary()
    token_vocab.load(f'{prefix}.vocab')

    splits = []
    for part in ('train', 'valid', 'test'):
        with np.load(f'{prefix}.{part}.npz') as archive:
            idata = archive['idata']
            target = archive['target']
        splits.append((idata, target))
        print(f'Number of samples ({part}): {len(target)}')

    print("Using precomputed vocabulary and data files")
    print(f'Vocabulary size: {len(token_vocab)}')
    return token_vocab, splits

In [9]:
def train(model, criterion, optimizer, idata, target, batch_size, device, log=False):
    """Run one training epoch over ``(idata, target)`` in shuffled mini-batches.

    Progress is printed after 200 and 500 updates and then every 1000 updates;
    when ``log`` is true a final summary line is printed as well.

    Returns:
        ``(accuracy, loss)``: accuracy in percent and the accumulated loss
        divided by the number of targets seen.
    """
    model.train()
    loss_sum = 0
    hits = 0
    seen = 0
    step = 0
    for X, y in batch_generator(idata, target, batch_size, shuffle=True):
        # Move the batch to the target device as int64 index tensors
        inputs = torch.tensor(X, dtype=torch.long, device=device)
        labels = torch.tensor(y, dtype=torch.long, device=device)

        # One optimisation step
        model.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Running statistics
        loss_sum += batch_loss.item()
        hits += (torch.max(logits, dim=1).indices == labels).sum().item()
        seen += labels.numel()
        step += 1
        if step in (200, 500) or step % 1000 == 0:
            print(f'Train: wpb={seen//step}, num_updates={step}, accuracy={100*hits/seen:.1f}, loss={loss_sum/seen:.2f}')

    mean_loss = loss_sum / seen
    accuracy = 100 * hits / seen
    if log:
        print(f'Train: wpb={seen//step}, num_updates={step}, accuracy={accuracy:.1f}, loss={mean_loss:.2f}')
    return accuracy, mean_loss

In [10]:
def validate(model, criterion, idata, target, batch_size, device):
    """Evaluate ``model`` on ``idata`` without gradient tracking.

    With a ``target`` the function returns ``(accuracy, loss)``: accuracy in
    percent and the accumulated loss divided by the number of targets. When
    ``target`` is ``None`` it returns instead a NumPy array with the arg-max
    class index predicted for every sample, in input order.
    """
    model.eval()
    has_labels = target is not None
    loss_sum = 0
    hits = 0
    seen = 0
    predictions = []
    with torch.no_grad():
        for X, y in batch_generator(idata, target, batch_size, shuffle=False):
            inputs = torch.tensor(X, dtype=torch.long, device=device)
            logits = model(inputs)
            if has_labels:
                labels = torch.tensor(y, dtype=torch.long, device=device)
                loss_sum += criterion(logits, labels).item()
                hits += (torch.max(logits, dim=1).indices == labels).sum().item()
                seen += labels.numel()
            else:
                best = torch.max(logits, dim=1).indices
                predictions.append(best.detach().to('cpu').numpy())

    if not has_labels:
        return np.concatenate(predictions)

    mean_loss = loss_sum / seen
    accuracy = 100 * hits / seen
    return accuracy, mean_loss

In [11]:
# Create the working directory (including missing parents); it is not an error if it already exists.
pathlib.Path(WORKING_ROOT).mkdir(parents=True, exist_ok=True)

In [12]:
# Use the GPU when one is available; otherwise fall back to the CPU and warn.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("WARNING: Training without GPU can be very slow!")

In [13]:
# Load the vocabulary and the [train, valid, test] (idata, target) pairs from the preprocessed files.
vocab, data = load_preprocessed_dataset(params.preprocessed)

Number of samples (train): 82284341
Number of samples (valid): 164765
Number of samples (test): 165837
Using precomputed vocabulary and data files
Vocabulary size: 100002


In [14]:
# 'El Periodico' validation dataset
valid_x_df = pd.read_csv(f'{COMPETITION_ROOT}/x_valid.csv')
# Every column except the first one is treated as a context-token column.
tokens = valid_x_df.columns[1:]
# Map each token to its vocabulary index (unknown tokens map to the unk index) and store as int32.
valid_x = valid_x_df[tokens].apply(vocab.get_index).to_numpy(dtype='int32')
valid_y_df = pd.read_csv(f'{COMPETITION_ROOT}/y_valid.csv')
# Target tokens for the validation windows, converted to indices the same way.
valid_y = valid_y_df['token'].apply(vocab.get_index).to_numpy(dtype='int32')

In [15]:
# 'El Periodico' test dataset
# NOTE: this reassigns valid_x_df to the frame read from x_test.csv. The submission cell
# reads valid_x_df['id'] and relies on this reassignment, so the cells must run in order.
valid_x_df = pd.read_csv(f'{COMPETITION_ROOT}/x_test.csv')
# Reuses the token column names taken from the validation frame.
test_x = valid_x_df[tokens].apply(vocab.get_index).to_numpy(dtype='int32')

In [16]:
# Instantiate the model. `CBOW` resolves to whichever class definition was executed last;
# the repr printed below corresponds to the attention-based variant. Only the vocabulary
# size and the embedding size are passed, so the other constructor arguments keep their defaults.
model = CBOW(len(vocab), params.embedding_dim).to(device)
print(model)
# List every parameter tensor with its element count and shape, followed by the grand total.
for name, param in model.named_parameters():
    print(f'{name:20} {param.numel()} {list(param.shape)}')
print(f'TOTAL                {sum(p.numel() for p in model.parameters())}')

CBOW(
  (emb): Embedding(100002, 300, padding_idx=0)
  (q_proj): Linear(in_features=300, out_features=300, bias=True)
  (k_proj): Linear(in_features=300, out_features=300, bias=True)
  (v_proj): Linear(in_features=300, out_features=300, bias=True)
  (o_proj): Linear(in_features=300, out_features=300, bias=True)
  (lin): Linear(in_features=300, out_features=100002, bias=True)
  (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (ff_network): Sequential(
    (0): Linear(in_features=300, out_features=1200, bias=True)
    (1): GELU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1200, out_features=300, bias=True)
    (4): Dropout(p=0.2, inplace=False)
  )
)
pos_emb              1800 [6, 300]
emb.weight           30000600 [100002, 300]
q_proj.weight        90000 [300, 300]
q_proj.bias          300 [300]
k_proj.weight        90000 [300, 300]
k_proj.bi

Note that the $\textit{lin}$ layer transposes $\textit{lin.weight}$ before performing the multiplication.

The [nn.CrossEntropyLoss()](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) criterion combines *nn.LogSoftmax()* and *nn.NLLLoss()* in one single class.

In [17]:
# reduction='sum' makes the criterion return the summed loss of a batch; train() and validate()
# accumulate these sums and divide by the number of targets to report a per-sample loss.
criterion = nn.CrossEntropyLoss(reduction='sum')

In [18]:
# Adam with its default hyperparameters over all model parameters.
optimizer = torch.optim.Adam(model.parameters())

# Per-epoch accuracy histories: training split, Wikipedia validation split, and the
# 'El Periodico' validation set loaded from the competition files.
train_accuracy = []
wiki_accuracy = []
valid_accuracy = []
for epoch in range(params.epochs):
    # data[0] is the training split: (idata, target)
    acc, loss = train(model, criterion, optimizer, data[0][0], data[0][1], params.batch_size, device, log=True)
    train_accuracy.append(acc)
    print(f'| epoch {epoch:03d} | train accuracy={acc:.1f}%, train loss={loss:.2f}')
    # data[1] is the validation split that ships with the preprocessed dataset
    acc, loss = validate(model, criterion, data[1][0], data[1][1], params.batch_size, device)
    wiki_accuracy.append(acc)
    print(f'| epoch {epoch:03d} | valid accuracy={acc:.1f}%, valid loss={loss:.2f} (wikipedia)')
    # Validation set read from the competition CSV files
    acc, loss = validate(model, criterion, valid_x, valid_y, params.batch_size, device)
    valid_accuracy.append(acc)
    print(f'| epoch {epoch:03d} | valid accuracy={acc:.1f}%, valid loss={loss:.2f} (El Periódico)')

# Save model
# Only the state_dict is written, once, after the last epoch.
torch.save(model.state_dict(), params.modelname)

Train: wpb=1000, num_updates=200, accuracy=21.1, loss=6.51
Train: wpb=1000, num_updates=500, accuracy=26.5, loss=5.79
Train: wpb=1000, num_updates=1000, accuracy=30.1, loss=5.34
Train: wpb=1000, num_updates=2000, accuracy=33.1, loss=4.92
Train: wpb=1000, num_updates=3000, accuracy=34.8, loss=4.70
Train: wpb=1000, num_updates=4000, accuracy=35.9, loss=4.54
Train: wpb=1000, num_updates=5000, accuracy=36.8, loss=4.43
Train: wpb=1000, num_updates=6000, accuracy=37.4, loss=4.34
Train: wpb=1000, num_updates=7000, accuracy=38.0, loss=4.26
Train: wpb=1000, num_updates=8000, accuracy=38.4, loss=4.20
Train: wpb=1000, num_updates=9000, accuracy=38.8, loss=4.15
Train: wpb=1000, num_updates=10000, accuracy=39.2, loss=4.10
Train: wpb=1000, num_updates=11000, accuracy=39.5, loss=4.06
Train: wpb=1000, num_updates=12000, accuracy=39.8, loss=4.02
Train: wpb=1000, num_updates=13000, accuracy=40.1, loss=3.99
Train: wpb=1000, num_updates=14000, accuracy=40.3, loss=3.96
Train: wpb=1000, num_updates=15000, a

In [19]:
# Test prediction
# With target=None, validate() returns the arg-max class index predicted for each row of test_x.
y_pred = validate(model, None, test_x, None, params.batch_size, device)
# Map each predicted index back to its token string.
y_token = [vocab.idx2token[index] for index in y_pred]

In [20]:
# Build the submission table with one predicted token per test id.
# NOTE: at this point valid_x_df holds the frame read from x_test.csv (the name was
# reassigned in the test-dataset cell), so its 'id' column lines up with y_token.
submission = pd.DataFrame({'id':valid_x_df['id'], 'token': y_token}, columns=['id', 'token'])
print(submission.head())
submission.to_csv('submission.csv', index=False)

   id  token
0   0      l
1   1      s
2   2  haver
3   3      s
4   4     un
