# Clone repository

1. Clone the repository from GitLab. Repository URL: {GIT_PATH}
2. {GIT_TOKEN} and {GIT_TOKEN_NAME} are used for GitLab authorization
3. The repository is cloned into the {LOCAL_PATH} directory
4. The {BRANCH} branch is checked out

In [2]:
# Imported here because this is the first cell executed in the notebook.
import getpass
import os

# Repository location (host/path, without the URL scheme) and clone settings.
GIT_PATH = "gitlab.atp-fivt.org/nlp2023/kosmachevdm-word2vec.git"
LOCAL_PATH = "word2vec"
BRANCH = "word2vectask1"

# Credentials must never be stored in the notebook: anything typed here ends
# up in version control and in every shared copy of the file.
# NOTE: a token value that was committed in an earlier revision of this cell
# has to be treated as compromised and revoked in GitLab.
# The token name is not a secret, so it keeps a default.
GIT_TOKEN_NAME = os.environ.get("GIT_TOKEN_NAME", "Colab")
# Take the token from the GIT_TOKEN environment variable; when it is unset or
# empty, ask for it interactively without echoing the input.
GIT_TOKEN = os.environ.get("GIT_TOKEN") or getpass.getpass("Token value: ")

In [3]:
import getpass
import os
import subprocess

In [4]:
# Ask for any credential that was left empty in the configuration cell.
if not GIT_TOKEN_NAME:
  GIT_TOKEN_NAME = input('Token name: ')
if not GIT_TOKEN:
  # `getpass` is the imported module; the prompt function is `getpass.getpass`.
  GIT_TOKEN = getpass.getpass('Token value: ')

if os.path.isdir(os.path.join(LOCAL_PATH, '.git')):
  # Re-running the cell: `git clone` refuses to write into an existing,
  # non-empty directory, so keep the clone that is already there.
  print(f"'{LOCAL_PATH}' already contains a git repository, skipping clone")
else:
  clone_url = f'https://{GIT_TOKEN_NAME}:{GIT_TOKEN}@{GIT_PATH}'
  # Pass an argument list (no shell) so that special characters in the token
  # or paths cannot be interpreted as shell syntax.
  clone_result = subprocess.run(['git', 'clone', clone_url, LOCAL_PATH])
  if clone_result.returncode != 0:
    # The command line is deliberately left out of the message: it embeds the token.
    raise RuntimeError(f'git clone failed with exit code {clone_result.returncode}')

Cloning into 'word2vec'...


0

In [5]:
# Sanity check: list the current directory; the freshly cloned folder should appear.
!ls

__notebook_source__.ipynb  word2vec


In [6]:
# Make the clone the working directory so that `src.*` imports and the relative
# dataset/results paths used below resolve inside it.
# LOCAL_PATH is relative, so it is resolved against the current directory:
# re-running this cell after it has succeeded looks for a nested folder.
%cd {LOCAL_PATH}

/kaggle/working/word2vec


In [7]:
# Switch the clone to the branch named in the configuration cell.
!git checkout {BRANCH}

branch 'word2vectask1' set up to track 'origin/word2vectask1'.
Switched to a new branch 'word2vectask1'


In [8]:
# Bring the checked-out branch up to date with its tracked remote branch.
! git pull

Already up to date.


# Train model

In [9]:
import os
import torch
import torch.nn as nn
import numpy as np
from collections import Counter
from src.helpers import tokenize
from src.dataloader import get_dataloader
from src.custom_word2vec import CBOWModel, SkipGramModel
from src.vocab import Vocab
from src.trainer import Trainer
from src.metric_monitor import MetricMonitor

In [29]:
# CAN BE MODIFIED IF NEEDED
# Tunable settings for the run; everything below reads these names.

# Architecture to train: "skipgram" or "cbow"
MODEL_TYPE = "skipgram"

# Upper bound on the number of vocabulary entries
MAX_VOCAB_SIZE = 5000

# Dimensionality of each embedding vector
EMBEDDING_SIZE = 100

# How many epochs to train for
EPOCHS = 5

# Root directory for the saved artifacts
SAVE_PATH = "results"


In [13]:
# Corpus location, relative to the current working directory
TEXT_PATH = os.path.join("dataset", "text8.txt")

# Run on the GPU when CUDA is available, otherwise on the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# decoded text does not depend on the platform's default locale.
with open(TEXT_PATH, "r", encoding="utf-8") as corpus_file:
    raw_txt = corpus_file.read()

# Tokenize the text and build the vocabulary, capped at MAX_VOCAB_SIZE entries.
# NOTE(review): "<unk>" is presumably substituted for out-of-vocabulary words;
# confirm in src/helpers.py.
vocab, tokens = tokenize(inp=raw_txt, vocab_size=MAX_VOCAB_SIZE, default_token="<unk>")

# The vocabulary may hold fewer entries than the cap, so size the model by the smaller value.
VOCAB_SIZE = min(MAX_VOCAB_SIZE, vocab.get_size())

  0%|          | 0/12375405 [00:00<?, ?it/s]

In [15]:
# Dataloaders: both are built from the same arguments and differ only in `loader_type`
loader_kwargs = dict(tokens=tokens, model_type=MODEL_TYPE, vocab=vocab)

train_dataloader = get_dataloader(loader_type="train", **loader_kwargs)
val_dataloader = get_dataloader(loader_type="val", **loader_kwargs)

In [16]:
# Seed the global torch and numpy generators so that weight initialisation
# (and any sampling in project code that draws from these generators) is
# repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Map the configured architecture name to its model class.
MODEL_CLASSES = {"cbow": CBOWModel, "skipgram": SkipGramModel}
if MODEL_TYPE not in MODEL_CLASSES:
    # Same exception type as before, now saying what went wrong.
    raise NotImplementedError(
        f"Unknown MODEL_TYPE {MODEL_TYPE!r}; expected one of {sorted(MODEL_CLASSES)}"
    )
model = MODEL_CLASSES[MODEL_TYPE](vocab_size=VOCAB_SIZE, embedding_size=EMBEDDING_SIZE)

# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Collects the metrics that are written to metrics.json after training.
metric_monitor = MetricMonitor(epochs=EPOCHS)

# NOTE(review): the model is not moved to DEVICE in this notebook; presumably
# Trainer does that with the `device` it receives. Confirm in src/trainer.py.
trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=DEVICE,
    metric_monitor=metric_monitor,
    epochs=EPOCHS,
)

trainer.train()

  0%|          | 0/12376 [00:00<?, ?it/s]

  0%|          | 0/3094 [00:00<?, ?it/s]

  0%|          | 0/12376 [00:00<?, ?it/s]

  0%|          | 0/3094 [00:00<?, ?it/s]

  0%|          | 0/12376 [00:00<?, ?it/s]

  0%|          | 0/3094 [00:00<?, ?it/s]

  0%|          | 0/12376 [00:00<?, ?it/s]

  0%|          | 0/3094 [00:00<?, ?it/s]

  0%|          | 0/12376 [00:00<?, ?it/s]

  0%|          | 0/3094 [00:00<?, ?it/s]

# Save results

In [17]:
import json

In [31]:
# Save vocabulary
vocab_path = os.path.join(SAVE_PATH, MODEL_TYPE, "vocab.json")
# open(..., "w") does not create missing parent directories, so make sure the
# <SAVE_PATH>/<MODEL_TYPE>/ folder exists first.
os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
with open(vocab_path, "w") as f:
  # NOTE(review): get_stoi() is presumably the token -> index mapping; confirm in src/vocab.py.
  json.dump(vocab.get_stoi(), f)

In [32]:
# Save tokens
tokens_path = os.path.join(SAVE_PATH, MODEL_TYPE, "tokens.txt")
# open(..., "w") does not create missing parent directories, so make sure the
# <SAVE_PATH>/<MODEL_TYPE>/ folder exists first.
os.makedirs(os.path.dirname(tokens_path), exist_ok=True)
# Explicit encoding so the file content does not depend on the platform locale.
with open(tokens_path, "w", encoding="utf-8") as f:
  # All tokens are written on a single line, separated by single spaces.
  f.write(" ".join(tokens))

In [33]:
# Save metrics
metrics_path = os.path.join(SAVE_PATH, MODEL_TYPE, "metrics.json")
# open(..., "w") does not create missing parent directories, so make sure the
# <SAVE_PATH>/<MODEL_TYPE>/ folder exists first.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as f:
  # NOTE(review): assumes metric_monitor.metrics holds only JSON-serialisable
  # values (plain numbers, lists, dicts); confirm in src/metric_monitor.py.
  json.dump(metric_monitor.metrics, f)

In [35]:
# Save model
model_path = os.path.join(SAVE_PATH, MODEL_TYPE, "model.pth")
# Make sure the <SAVE_PATH>/<MODEL_TYPE>/ folder exists before writing into it.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# This pickles the whole module object, so loading it later requires the same
# model class to be importable from the same module path.
torch.save(model, model_path)

In [36]:
# Save model's weights
model_w_path = os.path.join(SAVE_PATH, MODEL_TYPE, "model_state.pth")
# Make sure the <SAVE_PATH>/<MODEL_TYPE>/ folder exists before writing into it.
os.makedirs(os.path.dirname(model_w_path), exist_ok=True)
# Only the parameter tensors are stored; to restore them, build the model
# again and call load_state_dict() with the loaded object.
torch.save(model.state_dict(), model_w_path)