In [2]:
# Install a pinned gensim release for this runtime.
# NOTE(review): presumably pinned to 3.8.1 because the embedding helper in this notebook
# passes `size=` through to the model (renamed `vector_size` in gensim 4) — confirm.
!pip install gensim==3.8.1



##### Colab Stuff

In [3]:
# Mount Google Drive at /content/drive; the project directory used by this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
# Use the absolute path under the Drive mount point so this cell is idempotent:
# a relative path only resolves while cwd is still /content, so re-running the
# cell (or running it after any other chdir) would raise FileNotFoundError.
os.chdir(os.path.join('/content', 'drive', 'My Drive', 'ift6759', 'ift6759-t6-p2'))
os.getcwd()

'/content/drive/My Drive/ift6759/ift6759-t6-p2'

In [5]:
# Report the NVIDIA GPU attached to this runtime; the command errors out when no
# NVIDIA driver is reachable (as in the recorded output of this cell).
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



### Embedding Matrix Generation


In [0]:
from utils.gensim_embeddings import create_model
from gensim.models import FastText

def build_emb_matrix(task, emb_size, alignment='unaligned'):
    """Create an embedding model for one task through ``create_model``.

    The corpus name is ``<alignment>_<task without its last four characters>``
    (e.g. task ``'formated_fr_w2w'`` -> ``'unaligned_formated_fr'``). The data
    path is ``<cwd>/data/<corpus name>`` and the save path is
    ``<cwd>/embeddings/<task>/<emb_size>/<corpus name>``; the save directory is
    created if it does not exist yet.

    Args:
        task: Task identifier; its last four characters are dropped to form
            the corpus name.
        emb_size: Embedding dimension, forwarded to ``create_model`` as ``size``.
        alignment: Prefix of the corpus file name.

    Returns:
        Whatever ``create_model`` returns.
    """
    corpus_name = '_'.join((alignment, task[:-4]))
    root = os.getcwd()
    source_path = os.path.join(root, 'data', corpus_name)

    # One output directory per (task, embedding size) pair.
    out_dir = os.path.join(root, 'embeddings', task, str(emb_size))
    os.makedirs(out_dir, exist_ok=True)
    target_path = os.path.join(out_dir, corpus_name)

    print(task, corpus_name, source_path, target_path)
    return create_model(FastText, source_path, save_path=target_path, size=emb_size)

def build_enc_dec_emb_matrices(enc_task, dec_task, emb_size=128, alignment='unaligned'):
    """Build the encoder embedding, then the decoder embedding.

    Each one is produced by ``build_emb_matrix`` with the same ``emb_size`` and
    ``alignment``; a progress line is printed after each build.

    Returns:
        A ``(enc_emb, dec_emb)`` tuple of the two ``build_emb_matrix`` results.
    """
    results = []
    for task_name, done_message in (
        (enc_task, "enc_emb created"),
        (dec_task, "dec_emb created"),
    ):
        results.append(build_emb_matrix(task_name, emb_size, alignment))
        print(done_message)
    return tuple(results)

In [0]:
# Build the size-512 embedding for the 'formated_fr_w2w' task.
build_emb_matrix(task='formated_fr_w2w', emb_size=512)

In [0]:
# Build the size-512 encoder (en) and decoder (fr) embeddings for the unformated w2w tasks.
build_enc_dec_emb_matrices(
    enc_task='unformated_en_w2w',
    dec_task='unformated_fr_w2w',
    emb_size=512,
)

unformated_en_w2w unaligned_unformated_en /content/drive/My Drive/ift6759/ift6759-t6-p2/data/unaligned_unformated_en /content/drive/My Drive/ift6759/ift6759-t6-p2/embeddings/unformated_en_w2w/1024/unaligned_unformated_en


#### Generate v2id for formated_fr_w2w

In [0]:
# Run the training script with zero epochs for task formated_fr_w2w; per the section
# heading this run is only used to generate the v2id artifact.
# NOTE(review): presumably v2id is a vocabulary-to-id mapping written by the script — confirm
# in train_language_model.py. The recorded output stops at 29%, so this run was not completed.
!python train_language_model.py \
--epochs 0 \
--task formated_fr_w2w 

Tensorflow version 2.2.0-rc2
data_file:data/unaligned_formated_fr, tokenize_type:w, rm_punc:True
 29% 137020/474000 [01:47<04:40, 1199.23it/s]

### GRU Language Model Training

#### unformated_en_w2w

In [0]:
# Train the language model on task unformated_en_w2w (20 epochs, 500 steps per epoch,
# batch size 512, learning rate 0.01, 1024 units).
# NOTE(review): --embedding_dim is 128 while the embedding-generation cells in this
# notebook are called with emb_size=512 — confirm the two are meant to differ.
# NOTE(review): the recorded log reports CUDA_ERROR_NO_DEVICE, so no GPU was visible
# for this run — confirm the intended accelerator.
!python train_language_model.py \
--epochs 20 \
--batch_size 512 \
--lr 0.01 \
--steps_per_epoch 500 \
--embedding_dim 128 \
--units 1024 \
--embedding_warmer_epoch 5 \
--task unformated_en_w2w 

2020-04-15 04:23:36.182208: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Tensorflow version 2.2.0-rc2
data_file:data/unaligned_unformated_en, tokenize_type:w, rm_punc:True
100% 474000/474000 [08:30<00:00, 929.02it/s]
2020-04-15 04:33:11.659690: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-04-15 04:33:11.709829: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-04-15 04:33:11.710069: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (22c357fadf83): /proc/driver/nvidia/version does not exist
2020-04-15 04:33:11.736918: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2300000000 Hz
2020-04-15 04:33:11.737487: I tensorflow/compiler/xla/service/service.cc:168] XLA servic

#### unformated_fr_w2w

In [0]:
# Train the language model on task unformated_fr_w2w with the same hyperparameters
# as the unformated_en_w2w run (20 epochs, batch size 512, lr 0.01, 1024 units).
# NOTE(review): the recorded output shows "Epoch N/10" although --epochs is 20 here,
# so that output appears to come from an earlier invocation — re-run to refresh it.
!python train_language_model.py \
--epochs 20 \
--batch_size 512 \
--lr 0.01 \
--steps_per_epoch 500 \
--embedding_dim 128 \
--units 1024 \
--embedding_warmer_epoch 5 \
--task unformated_fr_w2w 

Tensorflow version 2.2.0-rc2
2020-04-14 19:28:49.830510: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Running on TPU  ['10.87.101.58:8470']
data_file:data/unaligned_unformated_fr, tokenize_type:w, rm_punc:True
100% 474000/474000 [04:02<00:00, 1956.04it/s]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### unformated_fr_c2c

In [0]:
# Train the language model on task unformated_fr_c2c (recorded output shows
# tokenize_type:c). Unlike the w2w runs, no --embedding_warmer_epoch is passed.
# NOTE(review): the recorded output ends at tcmalloc large-allocation warnings with no
# epoch lines — confirm the run actually completed.
!python train_language_model.py \
--epochs 20 \
--batch_size 512 \
--lr 0.01 \
--steps_per_epoch 500 \
--embedding_dim 128 \
--units 1024 \
--task unformated_fr_c2c 

Tensorflow version 2.2.0-rc2
2020-04-14 19:46:12.599218: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Running on TPU  ['10.87.101.58:8470']
data_file:data/unaligned_unformated_fr, tokenize_type:c, rm_punc:True
100% 474000/474000 [00:09<00:00, 52286.39it/s]
tcmalloc: large alloc 1844813824 bytes == 0x60866000 @  0x7f2c722e91e7 0x7f2c6f7265e1 0x7f2c6f78ae88 0x7f2c6f78b147 0x7f2c6f823118 0x50ac25 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50d390 0x509d48 0x50aa7d 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x58958c 0x5a067e 0x50d966 0x508245
tcmalloc: large alloc 1291370496 bytes == 0xcefc2000 @  0x7f2c722e91e7 0x7f2c6f7265e1 0x7f2c6f78ae88 0x7f2c6f78afa3 0x7f2c6f8160c6 0x7f2c6f816528 0x50c88b 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50c5b9 0x58efc9 0x4c9546 0x5886f4 0x58892e 0x551b81 0x5aa6ec 0x50abb3

#### formated_fr_c2c


In [0]:
# Train the language model on task formated_fr_c2c (recorded output shows
# tokenize_type:c and rm_punc:False). No --embedding_warmer_epoch is passed.
# NOTE(review): the recorded output ends at tcmalloc large-allocation warnings with no
# epoch lines — confirm the run actually completed.
!python train_language_model.py \
--epochs 20 \
--batch_size 512 \
--lr 0.01 \
--steps_per_epoch 500 \
--embedding_dim 128 \
--units 1024 \
--task formated_fr_c2c

Tensorflow version 2.2.0-rc2
2020-04-14 20:28:20.954602: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Running on TPU  ['10.87.101.58:8470']
data_file:data/unaligned_formated_fr, tokenize_type:c, rm_punc:False
100% 474000/474000 [00:10<00:00, 45956.61it/s]
tcmalloc: large alloc 1859977216 bytes == 0x64ca2000 @  0x7fe88e0061e7 0x7fe88b4435e1 0x7fe88b4a7e88 0x7fe88b4a8147 0x7fe88b540118 0x50ac25 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50d390 0x509d48 0x50aa7d 0x50c5b9 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x58958c 0x5a067e 0x50d966 0x508245
tcmalloc: large alloc 1301987328 bytes == 0xd4274000 @  0x7fe88e0061e7 0x7fe88b4435e1 0x7fe88b4a7e88 0x7fe88b4a7fa3 0x7fe88b5330c6 0x7fe88b533528 0x50c88b 0x508245 0x50a080 0x50aa7d 0x50d390 0x508245 0x50a080 0x50aa7d 0x50c5b9 0x58efc9 0x4c9546 0x5886f4 0x58892e 0x551b81 0x5aa6ec 0x50abb3 