# Clone repository

1. Clone the repository from GitHub. The repository URL is set in {GIT_PATH}.
2. The repository is cloned into the {LOCAL_PATH} directory.
3. The {BRANCH} branch is checked out and used.

In [11]:
# HTTPS URL of the repository to clone
GIT_PATH = "https://github.com/lacteolus/word2vec.git"
# Directory to clone into, relative to the current working directory
LOCAL_PATH = "word2vec"
# Branch that is checked out after cloning
BRANCH = "main"

In [12]:
import os

In [13]:
import subprocess

# Clone only when the target directory is absent so this cell can be re-run:
# `git clone` refuses to clone into an existing non-empty directory.
if not os.path.isdir(LOCAL_PATH):
    # An argument list (no shell) avoids quoting problems in the URL or path,
    # and check=True raises CalledProcessError instead of silently returning
    # a non-zero exit status when the clone fails.
    subprocess.run(["git", "clone", GIT_PATH, LOCAL_PATH], check=True)

0

In [14]:
# List the current working directory to confirm the clone directory is present
!ls

dataset  notebooks  requirements.txt  src
main.py  README.md  results	      word2vec


In [15]:
# Move the notebook's working directory into the clone so that the relative
# "dataset"/"results" paths and the `src` package imports used below resolve.
# NOTE: the path is relative, so run this cell only once per session.
%cd {LOCAL_PATH}

/content/word2vec/word2vec


In [16]:
# Switch the clone to the branch named by BRANCH
!git checkout {BRANCH}

Already on 'main'
Your branch is up to date with 'origin/main'.


In [17]:
# Bring the checked-out branch up to date with its remote
! git pull

Already up to date.


# Train model

In [22]:
import os
import torch
import torch.nn as nn
import numpy as np
from collections import Counter
from src.dataloader import get_dataloader, tokenize
from src.custom_word2vec import CBOWModel, SkipGramModel
from src.vocab import Vocab
from src.trainer import Trainer
from src.metric_monitor import MetricMonitor

In [23]:
# User-tunable settings: CAN BE MODIFIED IF NEEDED

# Upper bound on the vocabulary size; passed to tokenize() as vocab_size
MAX_VOCAB_SIZE = 5000

# Number of epochs; passed to both MetricMonitor and Trainer
EPOCHS = 5

# Model type to train: "skipgram" or "cbow"
MODEL_TYPE = "skipgram" # or "cbow"

# Embedding (vector) size passed to the model constructor
EMBEDDING_SIZE = 100

# Base directory for saved artifacts; files are written under SAVE_PATH/MODEL_TYPE/
SAVE_PATH = "results"


In [24]:
# Location of the training corpus, relative to the current working directory
TEXT_PATH = os.path.join("dataset", "text8.txt")

# Use the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [25]:
# Read the whole corpus into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
with open(TEXT_PATH, "r", encoding="utf-8") as f:
    raw_txt = f.read()

# tokenize() returns the vocabulary object and the token sequence.
# default_token is presumably the stand-in for out-of-vocabulary words --
# TODO confirm against src.dataloader.tokenize.
vocab, tokens = tokenize(inp=raw_txt, vocab_size=MAX_VOCAB_SIZE, default_token="<unk>")

# Size handed to the model constructor: the actual vocabulary size, capped at
# MAX_VOCAB_SIZE.
# NOTE(review): if vocab.get_size() can exceed MAX_VOCAB_SIZE (e.g. by counting
# the default token), this cap would undersize the model -- verify.
VOCAB_SIZE = min(MAX_VOCAB_SIZE, vocab.get_size())

  0%|          | 0/12375405 [00:00<?, ?it/s]

In [26]:
# Dataloaders: both are built from the same tokens, model type and vocabulary;
# only loader_type differs between the two calls.
_loader_kwargs = dict(tokens=tokens, model_type=MODEL_TYPE, vocab=vocab)

train_dataloader = get_dataloader(loader_type="train", **_loader_kwargs)
val_dataloader = get_dataloader(loader_type="val", **_loader_kwargs)

In [None]:
# Instantiate the architecture selected by MODEL_TYPE
if MODEL_TYPE == "cbow":
    model = CBOWModel(vocab_size=VOCAB_SIZE, embedding_size=EMBEDDING_SIZE)
elif MODEL_TYPE == "skipgram":
    model = SkipGramModel(vocab_size=VOCAB_SIZE, embedding_size=EMBEDDING_SIZE)
else:
    # Name the rejected value so a typo in the settings cell is easy to spot
    raise NotImplementedError(
        f"Unsupported MODEL_TYPE {MODEL_TYPE!r}; expected 'cbow' or 'skipgram'"
    )

# Loss function: cross-entropy
criterion = nn.CrossEntropyLoss()

# Optimizer: AdamW over all model parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Metrics object handed to the trainer; its `metrics` attribute is written to
# disk in the save step
metric_monitor = MetricMonitor(epochs=EPOCHS)

# Wire the model, loss, optimizer, data and device into the trainer
trainer = Trainer(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    metric_monitor=metric_monitor,
    device=DEVICE,
    epochs=EPOCHS,
)

# Start training
trainer.train()

# Save results

In [None]:
import json

In [None]:
# Save vocabulary (the mapping returned by vocab.get_stoi()) as JSON
out_dir = os.path.join(SAVE_PATH, MODEL_TYPE)
# open(..., "w") does not create missing directories, so make sure the
# per-model output directory exists first
os.makedirs(out_dir, exist_ok=True)
vocab_path = os.path.join(out_dir, "vocab.json")
with open(vocab_path, "w", encoding="utf-8") as f:
    json.dump(vocab.get_stoi(), f)

In [None]:
# Save tokens as a single space-separated line
out_dir = os.path.join(SAVE_PATH, MODEL_TYPE)
# open(..., "w") does not create missing directories, so make sure the
# per-model output directory exists first
os.makedirs(out_dir, exist_ok=True)
tokens_path = os.path.join(out_dir, "tokens.txt")
# Explicit encoding keeps the file independent of the platform locale.
# str.join assumes every token is a string -- TODO confirm against tokenize().
with open(tokens_path, "w", encoding="utf-8") as f:
    f.write(" ".join(tokens))

In [None]:
# Save metrics (the `metrics` attribute of the metric monitor) as JSON
out_dir = os.path.join(SAVE_PATH, MODEL_TYPE)
# open(..., "w") does not create missing directories, so make sure the
# per-model output directory exists first
os.makedirs(out_dir, exist_ok=True)
metrics_path = os.path.join(out_dir, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metric_monitor.metrics, f)

In [None]:
# Save model (the whole module object, not just its weights)
out_dir = os.path.join(SAVE_PATH, MODEL_TYPE)
# torch.save does not create missing directories, so make sure the
# per-model output directory exists first
os.makedirs(out_dir, exist_ok=True)
model_path = os.path.join(out_dir, "model.pth")
# NOTE: this pickles the entire module, so loading it later requires the model
# class from src.custom_word2vec to be importable; the state_dict checkpoint
# (model_state.pth) is the more portable artifact.
torch.save(model, model_path)

In [None]:
# Save model's weights (state_dict only)
out_dir = os.path.join(SAVE_PATH, MODEL_TYPE)
# torch.save does not create missing directories, so make sure the
# per-model output directory exists first
os.makedirs(out_dir, exist_ok=True)
model_w_path = os.path.join(out_dir, "model_state.pth")
torch.save(model.state_dict(), model_w_path)