In [1]:
# Attach Google Drive to the Colab filesystem; force_remount re-mounts even if
# a mount from an earlier run is still present.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


Copy code from drive to local folder

In [2]:
!rm -r chatbot
!cp -r /content/drive/MyDrive/chatbot /content/chatbot

rm: cannot remove 'chatbot': No such file or directory


### USING AWS

In [3]:
# Disabled alternative data source: everything below is commented out.
# When enabled it would list every object in the "reddit-chatbot-data" S3 bucket
# and download each one to data/<object key>.
# NOTE(review): the access key arguments are empty strings here; supply them from
# the environment rather than pasting secrets into the notebook.
# !pip install boto3

# import boto3
# s3 = boto3.resource("s3", aws_access_key_id = "",
#                       aws_secret_access_key = "")
# bucket = s3.Bucket("reddit-chatbot-data")

# !rm -r data
# !mkdir data
# !mkdir data/training_data data/testing_data data/validation_data
# bucket_contents = [val.key for val in bucket.objects.all()]
# for content in bucket_contents:
#   bucket.download_file(content, f"data/{content}")

### USING AZURE

In [9]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the real locale is.

    ``do_setlocale`` is accepted only so the signature matches
    ``locale.getpreferredencoding``; its value is ignored.
    """
    return "UTF-8"


# Monkey-patch the stdlib lookup so every later call to
# locale.getpreferredencoding() in this kernel gets "UTF-8".
locale.getpreferredencoding = getpreferredencoding

In [12]:
# !pip install azure-identity azure-storage-blob
!pip install load_dotenv

!rm -r data
!mkdir data data/training_data data/testing_data data/validation_data



In [13]:
import os
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient
from load_dotenv import load_dotenv

# Settings this cell reads from the .env file.
ENV_KEYS = ("TENANT_ID", "CLIENT_ID", "CLIENT_SECRET", "BLOB_URL")

# Drop only our own settings so values left over from an earlier run cannot
# shadow the .env file. Clearing the whole environment (os.environ.clear())
# would also remove PATH, LANG, etc. for the rest of the kernel session, which
# shell commands and locale detection rely on.
for key in ENV_KEYS:
    os.environ.pop(key, None)
load_dotenv(dotenv_path = "/content/chatbot/.env")

# Fail early, naming every missing setting at once.
missing_keys = [key for key in ENV_KEYS if key not in os.environ]
if missing_keys:
    raise KeyError(f"Missing settings after loading /content/chatbot/.env: {missing_keys}")

# Service-principal credentials for Azure.
credentials = ClientSecretCredential(
    tenant_id = os.environ["TENANT_ID"],
    client_id = os.environ["CLIENT_ID"],
    client_secret = os.environ["CLIENT_SECRET"],
)

# NOTE(review): container_name is not used below; the container actually opened
# is "data". Kept in case a later cell reads it - confirm which name is intended.
container_name = "redditchatbotdata"
blob_service_client = BlobServiceClient(
    account_url = os.environ["BLOB_URL"],
    credential = credentials
)

# Mirror every blob of the "data" container to data/<blob name> on local disk.
container_client = blob_service_client.get_container_client("data")
blob_list = list(container_client.list_blob_names())
for blob_name in blob_list:
  local_path = f"data/{blob_name}"
  # Blob names may contain "/" separators; make sure the target folder exists.
  os.makedirs(os.path.dirname(local_path), exist_ok = True)
  down_stream = container_client.download_blob(blob_name)
  with open(local_path, "wb") as f:
    # Stream straight into the file instead of buffering the whole blob in memory.
    down_stream.readinto(f)

In [14]:
import torch
import torch.nn as nn
from torch.optim import Adam

from chatbot.nmt.model import NMTEncoder, NMTDecoderLA, BahdanauAttention, NMTModelLA, accuracy, categorical_crossentropy
from chatbot.nmt.tokenizer import RegexTokenizer
from chatbot.nmt.dataloader import RedditDataset, RedditDataLoader
from chatbot.nmt.trainer import Trainer


In [15]:
# Create the tokenizer with a vocabulary size of 276, then load the saved
# tokenizer file shipped with the project.
tokenizer_model_path = "chatbot/models/tokenizer/tok.model"
tokenizer = RegexTokenizer(vocab_size=276)
tokenizer.load(tokenizer_model_path)

In [16]:
# Training batches: 32 sequences of length 100 read from one Reddit comments file.
training_files = ["data/training_data/RC_2017-03.txt"]
dataloader = RedditDataLoader(
    tokenizer=tokenizer,
    filepaths=training_files,
    batch_size=32,
    sequence_length=100,
    purpose="training",
)

In [17]:
# Sanity check: pull one batch from the loader and show the shape of each of
# its three tensors.
first_batch = next(iter(dataloader))
inp_ids, out_ids, out_out_ids = first_batch
print(inp_ids.shape, out_ids.shape, out_out_ids.shape)

torch.Size([32, 100]) torch.Size([32, 100]) torch.Size([32, 100])


In [18]:
# Build the NMTModelLA network; the padding index comes from the tokenizer's
# "<|padding|>" special token.
pad_token_id = tokenizer.special_tokens["<|padding|>"]
model = NMTModelLA(vocab_size=276, bidirectional=True, padding_idx=pad_token_id)

In [19]:
# Disabled alternative model: everything below is commented out.
# NOTE(review): NMTModelBA is not among the names imported from
# chatbot.nmt.model in the import cell; import it before re-enabling this.
# model = NMTModelBA(attn_concat_dim = 4000,
#                   attn_latent_dim = 512,
#                   vocab_size = 276,
#                   bidirectional = True,
#                   padding_idx = tokenizer.special_tokens["<|padding|>"])

In [20]:
# Training configuration handed to the Trainer.
metric_fns = [accuracy]
criterion = categorical_crossentropy
# The Adam class itself, not an instance.
# Presumably the Trainer instantiates it with the learning rate - verify in Trainer.
optimizer = Adam
learning_rate = 1e-3

In [23]:
# Wire data, model, loss, metrics and optimizer into the Trainer.
# Runs on CPU for 2 epochs.
trainer = Trainer(
    dataloader,
    model,
    criterion,
    metric_fns,
    optimizer,
    lr=learning_rate,
    device=torch.device("cpu"),
    num_epochs=2,
)

In [None]:
# Start training with the Trainer configured above. The recorded output below
# shows roughly 20-26 s per iteration on CPU.
trainer.train()

  0%|          | 0/2 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:26, 26.38s/it][A
2it [00:52, 26.15s/it][A
3it [01:13, 23.89s/it][A
4it [01:35, 23.21s/it][A
5it [01:55, 21.91s/it][A
6it [02:16, 21.55s/it][A
7it [02:35, 20.82s/it][A