In [1]:
filepath = "torchtext/WikiText2-10000.txt"

with open(filepath, 'r', encoding='utf-8') as file:
    text_content = file.read()

print(text_content[0:1000])

" 
 = 2013 – 14 York City F.C. season = 
 
 The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . 
 Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only above the relegation zone on goal difference , before a 17 @-@ match unbeaten run saw the team finish in seventh @-@ place in the 24 @-@ team 2013 – 14 Football League Two . This meant York qualified for the play @-@ offs , and they were eliminated in the semi @-@ final by Fleetwood Town . York were knocked out of the 2013 – 14 FA Cup , Football League Cup and Football League Trophy in their opening round matches . 
 35 playe

In [2]:
import datasets

dset = datasets.load_dataset('text', data_files=str(filepath))
dset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 37333
    })
})

In [3]:
dset['train']['text'][0:4]

['" ',
 ' = 2013 – 14 York City F.C. season = ',
 ' ',
 ' The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . ']

In [4]:
def tokenize_row(row):
    row['tokens'] = row['text'].lower().split()
    return row

In [5]:
dset = dset.map(tokenize_row)
dset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens'],
        num_rows: 37333
    })
})

In [6]:
dset['train']['tokens'][0:2]

[['"'], ['=', '2013', '–', '14', 'york', 'city', 'f.c.', 'season', '=']]

In [7]:
vocab = list(set(
    [tok for row in dset['train']['tokens'] for tok in row]))

vocab[:4]

['relieve', 'hottest', 'hypnotic', 'rusty']

In [8]:
id2tok = dict(enumerate(vocab))
list(id2tok.items())[:4]

[(0, 'relieve'), (1, 'hottest'), (2, 'hypnotic'), (3, 'rusty')]

In [9]:
tok2id = {tok: i for (i,tok) in id2tok.items()}
list(tok2id.items())[:4]

[('relieve', 0), ('hottest', 1), ('hypnotic', 2), ('rusty', 3)]

In [10]:
window_width = 10

def windowizer(row, wsize=window_width):
    """ Compute sentence (str) to sliding-window of skip-gram pairs """
    doc = row['tokens']
    out = []
    for i, wd in enumerate(doc):
        target = tok2id[wd]
        window = [
            i+j for j in range(-wsize, wsize+1, 1)
            if (i+j >=0) & (i+j < len(doc)) & (j != 0)
        ]

        out += [(target, tok2id[doc[w]]) for w in window]
    row['window'] = out
    return row

In [11]:
dset = dset.map(windowizer)
dset

Map: 100%|███████████████████████████████████████████████████████████████| 37333/37333 [01:47<00:00, 347.81 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'window'],
        num_rows: 37333
    })
})

In [12]:
dset['train']['window'][1]

[[8506, 4935],
 [8506, 28203],
 [8506, 4530],
 [8506, 27734],
 [8506, 14724],
 [8506, 13791],
 [8506, 26745],
 [8506, 8506],
 [4935, 8506],
 [4935, 28203],
 [4935, 4530],
 [4935, 27734],
 [4935, 14724],
 [4935, 13791],
 [4935, 26745],
 [4935, 8506],
 [28203, 8506],
 [28203, 4935],
 [28203, 4530],
 [28203, 27734],
 [28203, 14724],
 [28203, 13791],
 [28203, 26745],
 [28203, 8506],
 [4530, 8506],
 [4530, 4935],
 [4530, 28203],
 [4530, 27734],
 [4530, 14724],
 [4530, 13791],
 [4530, 26745],
 [4530, 8506],
 [27734, 8506],
 [27734, 4935],
 [27734, 28203],
 [27734, 4530],
 [27734, 14724],
 [27734, 13791],
 [27734, 26745],
 [27734, 8506],
 [14724, 8506],
 [14724, 4935],
 [14724, 28203],
 [14724, 4530],
 [14724, 27734],
 [14724, 13791],
 [14724, 26745],
 [14724, 8506],
 [13791, 8506],
 [13791, 4935],
 [13791, 28203],
 [13791, 4530],
 [13791, 27734],
 [13791, 14724],
 [13791, 26745],
 [13791, 8506],
 [26745, 8506],
 [26745, 4935],
 [26745, 28203],
 [26745, 4530],
 [26745, 27734],
 [26745, 14724]

In [13]:
def skip_grams(tokens, window_width=window_width):
    pairs = []
    for i, wd in enumerate(tokens):
        target = toke2id[wd]
        window = [
            i+j for j in
            range(-window_width, window_width+1, 1)
            if(i+j >= 0)
            & (i+j < len(tokens))
            & (j != 0)
        ]

        pairs.extend([(target, tok2id[tokens[w]]) for w in window])
    return pairs

In [14]:
from torch.utils.data import Dataset

class Word2VecDataset(Dataset):
    def __init__(self, dataset, vocab_size, wsize=window_width):
        self.dataset = dataset
        self.vocab_size = vocab_size
        self.data = [i for s in dataset['window'] for i in s]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [15]:
from torch.utils.data import DataLoader

CPU_CORES = 4
BATCH_SIZE = 256

dataloader = {}
for k in dset.keys():
    dataloader = {
        k: DataLoader(
            Word2VecDataset(
                dset[k],
                vocab_size=len(vocab)),
            batch_size=BATCH_SIZE,
            shuffle=True,
            num_workers = CPU_CORES -1)
    }

In [16]:
def one_hot_encode(input_id, size):
    vec = torch.zeros(size).float()
    vec[input_id]=1.0
    return vec

In [17]:
from torch import nn
EMBED_DIM = 100

class Word2Vec(nn.Module):
    def __init__(self, vocab_size=len(vocab), embedding_size=EMBED_DIM):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.expand = nn.Linear(embedding_size, vocab_size, bias=False)

    def forward(self, input):
        hidden = self.embed(input)
        logits = self.expand(hidden)
        return logits

In [18]:
model = Word2Vec()
model

Word2Vec(
  (embed): Embedding(28912, 100)
  (expand): Linear(in_features=100, out_features=28912, bias=False)
)

In [19]:
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

device

device(type='cpu')

In [20]:
model.to(device)

Word2Vec(
  (embed): Embedding(28912, 100)
  (expand): Linear(in_features=100, out_features=28912, bias=False)
)

In [None]:
from tqdm import tqdm
EPOCHS = 10
LEARNING_RATE = 5e-4

loss_fn = nn.CrossEntropyLoss() 
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

running_loss = [] 
pbar = tqdm(range(EPOCHS * len(dataloader['train']))) 

for epoch in range(EPOCHS): 
    epoch_loss = 0 
    for sample_num, (center, context) in enumerate(dataloader['train']): 
        if sample_num % len(dataloader['train']) == 2: 
            print(center, context) 
            # center: tensor([ 229,0, 2379, ..., 402, 553, 521]) 
            # context: tensor([ 112, 1734, 802, ..., 28, 852, 363])
        center, context = center.to(device), context.to(device) 
        optimizer.zero_grad() 
        logits = model(input=context) 
        loss = loss_fn(logits, center) 
        if not sample_num % 10000: 
            # print(center, context) 
            pbar.set_description(f'loss[{sample_num}] = {loss.item()}')
        epoch_loss += loss.item() 
        loss.backward() 
        optimizer.step() 
        pbar.update(1)
    epoch_loss /= len(dataloader['train']) 
    running_loss.append(epoch_loss)

save_model(model, loss)