In [1]:
from google.colab import drive
from google.colab import files

# Mount Google Drive at /content/drive so later cells can read files stored there.
# This will trigger an authentication prompt in VS Code
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Show the current working directory.
!pwd

# Make /content/ the working directory; later cells use paths relative to it.
%cd /content/

/content
/content


In [3]:
# List the Drive folder that should contain the dataset archive (data.zip).
!ls drive/MyDrive/data_for_ml/

data.zip


In [4]:
# Check whether /content/src/ (the clone target used by the setup cell) already exists.
!ls /content/src/

# Uncomment to delete /content/src/ entirely and start over from a clean state.
# !rm /content/src/ -r

ls: cannot access '/content/src/': No such file or directory


In [5]:
# Run to download data again from drive

!rm /content/src/data/downloaded.txt

rm: cannot remove '/content/src/data/downloaded.txt': No such file or directory


In [6]:
import os
from getpass import getpass

CODE_DIR = '/content/src/'
DATA_DIR = '/content/src/data/'
DRIVE_ARCHIVE = '/content/drive/MyDrive/data_for_ml/data.zip'


# github username
USER = 'kalk-ak'

REPO_NAME = "n-gram-and-log-linear-model"
# Get PAT
print("Enter your GitHub PAT:")
token = getpass()

if not os.path.exists(CODE_DIR + "downloaded.txt"):
    # Clone my private repo
    !git clone https://{USER}:{token}@github.com/{USER}/{REPO_NAME}.git {CODE_DIR}

    print("Downloaded from Repo Sucessfully  ")
    with open(CODE_DIR + "downloaded.txt", "w") as f:
        f.write("Downloaded")

# 2. SETUP DATA
if not os.path.exists(DATA_DIR + "downloaded.txt"):
    print("Creating data directory...")
    os.makedirs(DATA_DIR, exist_ok=True)

    print(f"Extracting data to {DATA_DIR}...")
    # -d tells unzip exactly where to put the files
    !unzip -q {DRIVE_ARCHIVE} -d {DATA_DIR}

    with open(DATA_DIR + "downloaded.txt", "w") as f:
        f.write("Downloaded")

print("Directory Structure Ready.")


Enter your GitHub PAT:
··········
Cloning into '/content/src'...
remote: Enumerating objects: 227, done.[K
remote: Counting objects: 100% (227/227), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 227 (delta 131), reused 202 (delta 106), pack-reused 0 (from 0)[K
Receiving objects: 100% (227/227), 3.20 MiB | 10.56 MiB/s, done.
Resolving deltas: 100% (131/131), done.
Filtering content: 100% (4/4), 12.36 MiB | 7.37 MiB/s, done.
Downloaded from Repo Sucessfully  
Creating data directory...
Extracting data to /content/src/data/...
Directory Structure Ready.


In [7]:
# Sanity-check the layout after setup: the working directory, the cloned repo,
# and the extracted dataset (which ends up one level deeper, in src/data/data/).
!ls
!ls src/
!ls src/data/data/

drive  sample_data  src
code  data  downloaded.txt
english_spanish  gen_spam  speech


### Switch to the cloned repo directory


In [8]:
cd src/


/content/src


### Train the Log Linear model on Colab GPU

In [34]:
# Cell for updating code repo
# Enter the checkout, pull the latest commits, then step back up to /content/src,
# the working directory that the following cells' relative paths (code/...) assume.
%cd /content/src/code/

!git pull

%cd ../

/content/src/code
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 4 (delta 3), reused 4 (delta 3), pack-reused 0 (from 0)[K
Unpacking objects: 100% (4/4), 514 bytes | 257.00 KiB/s, done.
From https://github.com/kalk-ak/n-gram-and-log-linear-model
   5e595b3..ff8ff26  master     -> origin/master
Updating 5e595b3..ff8ff26
Fast-forward
 code/probs.py | 10 [32m+++++++[m[31m---[m
 1 file changed, 7 insertions(+), 3 deletions(-)
/content/src


In [17]:
!pip install -r code/requirements.txt



In [18]:
# List the vocabulary files shipped with the repo (run from /content/src).
!ls code/vocab

en_1k_vocab.txt        sp_1k_vocab.txt	      vocab-genspam.txt
eng_spanish_10k.vocab  switchboard-small.txt
kaggle.vocab	       switchboard.txt


In [30]:
# List the contents of the repo's code/ directory (run from /content/src).
!ls code

build_vocab.py	  integerize.py    __pycache__	      textcat.py
calendar-app	  models_trained   requirements.txt   train_lm.py
combine_vocab.py  modern-showcase  SGD_convergent.py  train_log_linear.ipynb
data		  my_file.txt	   split_data.py      trigram_randsent.py
email.txt	  nlp-class.yml    test_accuracy.py   vocab
fileprob.py	  probs.py	   test_backoff.py


In [40]:
!mkdir /content/src/code/models_trained_colab

In [36]:
# Train the language model with the repo's train_lm.py script.
# All paths are relative, so this must be run from /content/src.
# Positional arguments (meaning inferred from their names and the log output;
# confirm against train_lm.py):
#   code/vocab/switchboard.txt            vocabulary file
#   log_linear_improved                   model type to train
#   data/data/speech/train/switchboard    training corpus
# Options: a lexicon file, 20 epochs, L2 regularization strength 0.5, the path
# the trained model is written to, and the CUDA device for training.
!code/train_lm.py code/vocab/switchboard.txt log_linear_improved data/data/speech/train/switchboard --lexicon data/lexicons/words-gs-only-200.txt --epochs 20 --l2_regularization 0.5 --output code/models_trained_colab/log_linear_switchboard.model --device cuda


INFO:probs:Read vocab of size 11420 from code/vocab/switchboard.txt
INFO:root:BOS is part of the vocab
dimension = ['9849', '200']
row = 9849
col = 200
INFO:root:Created a lexion matrix of shape torch.Size([4912, 200])
INFO:train_lm:Training...
INFO:root:Training on cuda...
INFO:root:Set learning rate to 0.05
INFO:root:Vectorizing training corpus...
INFO:root:Start optimizing on 2192484 training tokens (20 epochs)...
Epoch 1: 100% 536/536 [00:25<00:00, 21.04it/s]
INFO:root:Epoch 1: F = -5.973392781534466
Epoch 2: 100% 536/536 [00:24<00:00, 22.07it/s]
INFO:root:Epoch 2: F = -5.837085587964866
Epoch 3: 100% 536/536 [00:24<00:00, 22.05it/s]
INFO:root:Epoch 3: F = -5.824912626281404
Epoch 4: 100% 536/536 [00:24<00:00, 21.96it/s]
INFO:root:Epoch 4: F = -5.819956416129371
Epoch 5: 100% 536/536 [00:24<00:00, 21.97it/s]
INFO:root:Epoch 5: F = -5.817421489868207
Epoch 6: 100% 536/536 [00:24<00:00, 21.98it/s]
INFO:root:Epoch 6: F = -5.815959599495781
Epoch 7: 100% 536/536 [00:24<00:00, 22.08it/s

### Download the trained model to do further analysis on my local machine

In [46]:
# Download the model file written by the training cell (its --output path)
# using google.colab's `files` helper imported in the first cell.
files.download("/content/src/code/models_trained_colab/log_linear_switchboard.model")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>