# Procedural

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -U torchtext==0.10.0
!pip install -U torch==1.9.0
!pip install psutil



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
import numpy as np
import spacy
import spacy.cli
from torchtext.data.metrics import bleu_score
import sys
import pandas as pd
import re
import ast
from sklearn.model_selection import train_test_split
import random
import psutil

spacy.cli.download("en")

[38;5;2m‚úî Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m‚úî Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Aug 25 10:32:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
RANDOM_SEED = 42

# Process the data

Set hyperparameters needed for the preprocessing

In [6]:
BATCH_SIZE = 16
VOCAB_SIZE = 20000
EMBEDDING_DIM = 50     # in {50, 100, 200}
LENGTH_TO_TRUNCATE_POSTS = 200
LENGTH_TO_TRUNCATE_COMMS = 25
DISCARD_NUMBERS = True

LOAD_TOY = False

Load the data and create a dataframe for the posts and one for the comments and combine useful info from posts dataframe with their corresponding top comment

In [None]:
# # Load the cleaned comments as a pandas dataframe
# df_comms = pd.read_csv('drive/MyDrive/MastersProject/data/clean_comments_combined.csv')
# df_comms['is_asshole'] = [1 if verdict in ["YTA","ESH"] else 0 for verdict in df_comms["verdict"]]

# # Drop from the dataframe comments whose verdict has not been deciphered
# indices_of_unknown_verdicts = [idx for (idx, row) in df_comms.iterrows() if row.verdict == "UNK"]
# df_comms.drop(index=indices_of_unknown_verdicts, inplace=True)
# print("{} comments have been dropped because their verdict was unknown!".format(len(indices_of_unknown_verdicts)))
# print("{} comments remaining.".format(len(df_comms)))

# # Set comment ids as the indices for the dataframe
# df_comms.set_index("id", inplace=True)

# # Remove the etiquettes from the comments.
# query = "([^a-zA-Z0-9]*YTA[^a-zA-Z0-9]* )|([^a-zA-Z0-9]*NTA[^a-zA-Z0-9]* )|([^a-zA-Z0-9]*ESH[^a-zA-Z0-9]* )|([^a-zA-Z0-9]*NAH[^a-zA-Z0-9]* )" # Note: I added these blanks to remove them from the start of target sentences. Did it work? Yes!
# new_col = []
# for idx, row in df_comms.iterrows():
#     new_col.append(re.sub(query, " ", row.body))       
# df_comms['body'] = new_col

# # Load the posts as a pandas dataframe
# df_posts = pd.read_csv('drive/MyDrive/MastersProject/data/posts_with_children_test.csv')
# df_posts['text'] = df_posts["title"] + " " + df_posts["body"].fillna("")



# # Create a new dataframe with useful info from the posts
# df = pd.DataFrame(data=df_posts['text'])
# df.rename(columns = {'text': "source_text"}, inplace=True)
# df['source_body'] = df_posts['body']
# df['source_title'] = df_posts['title']
# df.set_index(df_posts.id, inplace=True)

# # Find the top comment of each post and add it to the above dataframe
# top_children_ids = []
# post_id_no_children = []
# for post_idx, post_row in df_posts.iterrows():
#     children_dic = ast.literal_eval(post_row.children)
#     if children_dic:
#         if max(children_dic, key=children_dic.get) in df_comms.index:
#             top_children_ids.append(max(children_dic, key=children_dic.get))
#         else:
#             top_children_ids.append(0)
#     else:
#         top_children_ids.append(0)
# top_children_bodies = [df_comms.loc[id, "body"] if id != 0 else 0 for id in top_children_ids]
# df['target_body'] = top_children_bodies

# # Drop from the dataframe rows corresponding to posts without comments
# ids_of_posts_without_children = [id for (id, row) in df.iterrows() if row.target_body == 0]
# df.drop(index=ids_of_posts_without_children, inplace=True)
# print("The number of posts that were dropped due to having no comments is:", len(ids_of_posts_without_children))

# # Print the resulting dataframe
# pd.set_option('display.max_rows', 10)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 20)
# print(df)

Load the nicely processed dataset

In [7]:
# Load either the toy dataset or the full dataset
if not LOAD_TOY:
    df = pd.read_csv('drive/MyDrive/MastersProject/seq2seq/Data/sources-targets_no_etiquettes.csv', dtype={"comment_id":str, "post_id":str, "post_text":str, "comment_body":str})
else:
    df = pd.read_csv('drive/MyDrive/MastersProject/seq2seq/Data/sources-targets_no_etiquettes_toy.csv', dtype={"comment_id":str, "post_id":str, "post_text":str, "comment_body":str})
df.set_index("comment_id", inplace=True)
df_dict = df.to_dict()

# Remove multiple gaps in the texts
query = " +"
new_comment_bodies = []
new_post_texts = []
for comment_id, post_text in df_dict["post_text"].items():
    new_post_texts.append(re.sub(query, " ", post_text))
    new_comment_bodies.append(re.sub(query, " ", df_dict["comment_body"][comment_id]))
df["comment_body"] = new_comment_bodies
df["post_text"] = new_post_texts

# Print the resulting dataframe
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 20)
print(df)

           post_id            post_text         comment_body
comment_id                                                  
cg08970     205zhr  AMITA for tellin...  I've been on the...
cgdobj3     21igkp  [AITA] Am I over...  What if there we...
cgcxbgb     21gs5t  AITA for making ...  You're not an as...
cgcfv1n     21dsje  [AITA] Got a fri...  I was going to s...
cgcdfh0     21eqs8  [AITA] Petty fam...  Unless you are g...
...            ...                  ...                  ...
gge3mm8     kgdqle  WIBTA if I Repor...          YWNBTA and 
gge7gve     kg5q3v  AITA if I (20f) ...   He doesn‚Äôt owe ...
gge7eyc     kgenxj  AITA for not pay...   Ulterior motive...
gge7e86     kg7v7u  AITA for not goi...   but dude, you'r...
gge7dta     kgbf23  AITA for making ...  Wow that‚Äôs a lot...

[539785 rows x 3 columns]


Split dataframe into training, validation and test dataframes and save them as csvs

In [8]:
df_train, df_valid_test = train_test_split(df, test_size=0.1, shuffle=True, random_state=RANDOM_SEED)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, shuffle=True, random_state=RANDOM_SEED)

df_train.to_csv("drive/MyDrive/MastersProject/seq2seq/Data/sources-targets_train.csv", index=True)
df_valid.to_csv("drive/MyDrive/MastersProject/seq2seq/Data/sources-targets_valid.csv", index=True)
df_test.to_csv("drive/MyDrive/MastersProject/seq2seq/Data/sources-targets_test.csv", index=True)

Build the training, validation and test sets, each of which is an array dicts, each dict containing a source and a target sequence. These sequences have been tokenised using the custom tokenise function (which utilises spacy).

In [9]:
# Load the English version of spacy and use it in the tokenise function. What's up with this spacy_eng: https://spacy.io/usage/linguistic-features#tokenization
spacy_eng = spacy.load('en')

# Define a function that takes in astring and tells you if there is a digit in the string
def has_numbers(string):
    return any(char.isdigit() for char in string)

# Define the tokenise function that uses spacy tokeniser, removes "\n" characters from tokens and either removes tokens containing numbers or transforms them to "ten"
if not DISCARD_NUMBERS:
    def tokenise_eng(text):
        tokenised_text = [tok.text for tok in spacy_eng.tokenizer(text) if "\n" not in tok.text]
        tokenised_text2 = [tok if not has_numbers(tok) else "ten" for tok in tokenised_text]
        return tokenised_text2
else:
    def tokenise_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text) if "\n" not in tok.text and not has_numbers(tok.text)]

# Instantiate a field object that builds a vocab from a corpus and can then preprocess, tokenise and numericalise sentences by indexing the vocab.
field = data.Field(sequential=True, use_vocab=True, tokenize=tokenise_eng, lower=True, init_token='<sos>', eos_token='<eos>')
fields = {'post_text': ('src', field), 'comment_body': ('trg', field)}

In [10]:
train_set, valid_set, test_set = data.TabularDataset.splits(path="drive/MyDrive/MastersProject/seq2seq/Data",
                                                            train="sources-targets_train.csv",
                                                            validation="sources-targets_valid.csv",
                                                            test="sources-targets_test.csv",
                                                            format="csv",
                                                            fields=fields)

Separately truncate posts and comments to get rid of too long seqs

In [11]:
for i in range(len(train_set)):
    train_set[i].__dict__["src"] = train_set[i].__dict__["src"][:LENGTH_TO_TRUNCATE_POSTS]
    train_set[i].__dict__["trg"] = train_set[i].__dict__["trg"][:LENGTH_TO_TRUNCATE_COMMS]

for i in range(len(valid_set)):
    valid_set[i].__dict__["src"] = train_set[i].__dict__["src"][:LENGTH_TO_TRUNCATE_POSTS]
    valid_set[i].__dict__["trg"] = train_set[i].__dict__["trg"][:LENGTH_TO_TRUNCATE_COMMS]

for i in range(len(test_set)):
    test_set[i].__dict__["src"] = train_set[i].__dict__["src"][:LENGTH_TO_TRUNCATE_POSTS]
    test_set[i].__dict__["trg"] = train_set[i].__dict__["trg"][:LENGTH_TO_TRUNCATE_COMMS]

Inspect the datasets

In [12]:
print(type(train_set))
print(len(train_set))
print(train_set[2].__dict__)

<class 'torchtext.legacy.data.dataset.TabularDataset'>
485806
{'src': ['aita', 'for', 'considering', 'breaking', 'up', 'with', 'my', 'girlfriend', 'over', 'her', 'tic', '?', 'my', 'girlfriend', '(', ')', 'and', 'i', '(', ')', 'have', 'been', 'together', 'for', 'about', 'three', 'years', 'now', '.', 'we', 'met', 'while', 'we', 'were', 'both', 'in', 'our', 'freshmen', 'years', 'of', 'college', ';', 'i', "'m", 'studying', 'engineering', 'while', 'she', "'s", 'been', 'studying', 'to', 'become', 'a', 'doctor', '.', 'in', 'the', 'past', 'few', 'months', ',', 'my', 'girlfriend', 'has', 'been', 'studying', 'intensely', 'in', 'preparation', 'of', 'the', 'mcat', '.', 'she', 'studied', 'for', 'a', 'little', 'over', 'a', 'month', 'before', 'the', 'actual', 'day', 'she', "'d", 'take', 'the', 'test', ',', 'but', 'she', 'took', 'that', 'month', 'very', 'seriously', '.', 'she', 'dedicated', 'virtually', 'all', 'of', 'her', 'free', 'time', 'to', 'studying', 'and', 'basically', 'cut', 'herself', 'off', 

Build the vocabulary from the training set and assign a vector to each word inside it.

In [13]:
field.build_vocab(train_set, max_size=VOCAB_SIZE, min_freq=2, vectors='glove.twitter.27B.{}d'.format(EMBEDDING_DIM))                #'glove.twitter.27B.{}d'.format(EMBEDDING_DIM)) #vectors='glove.6B.100d')

.vector_cache/glove.twitter.27B.zip: 1.52GB [04:44, 5.34MB/s]                            
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 1193513/1193514 [00:26<00:00, 44709.89it/s]


Inspect the vocabulary

In [14]:
vocab_words_list = field.vocab.itos
vocab_words_to_indices_dict = field.vocab.stoi

print("field.vocab.itos =")
print(vocab_words_list)
print()
print("field.vocab.stoi =")
print(vocab_words_to_indices_dict)
print()
print("field.vocab.vectors =")
print(field.vocab.vectors)
print("      shape =", field.vocab.vectors.shape)

field.vocab.itos =

field.vocab.stoi =

field.vocab.vectors =
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.7532,  0.0109,  0.3165,  ...,  0.1377, -0.5730, -0.7398],
        [ 0.1027,  0.8154, -0.8668,  ..., -0.0197,  0.1052, -0.0235],
        [-1.5462, -0.1981,  0.2061,  ..., -0.1243,  0.9785, -0.6493]])
      shape = torch.Size([20004, 50])


What vectors are assigned to words in the vocabulary not included in Glove? How many of them exist?

In [15]:
all_zeros_vec_indices = []
for idx, embedding in enumerate(field.vocab.vectors):
    all_zeros = not embedding.any()
    if all_zeros:
        all_zeros_vec_indices.append(idx)
vocab_words_no_glove_embeddings = [field.vocab.itos[idx] for idx in all_zeros_vec_indices]
print("There are {}, (out of the {}) words in the vocabulary that do not have a glove embedding and are thus represented by the zero vector:".format(len(vocab_words_no_glove_embeddings), len(field.vocab.itos)))
print(vocab_words_no_glove_embeddings)
print()

There are 788, (out of the 20004) words in the vocabulary that do not have a glove embedding and are thus represented by the zero vector:
['<unk>', '<pad>', '<sos>', '<eos>', ' ', 'wibta', '...', '..', '....', 'covid', 'tl;dr', 'aitah', 'and/or', '\xa0', '\xa0 ', '.....', 'gt;i', ':)', 'shitpost', 'may.', 'incel', '\\-', 'wibtah', 'w/', 'lgbtq+', 'fsil', 'stepdaughters', 'airpods', 'd&amp;d', 'aita-', 'doggo', 'fortnite', '\u200b', 'amitheasshole', 'ywbta', ':(', '/r', "i'm", 'stepsons', 'i‚Äòm', 'me-', 'gt;she', 'üö©', "don't", 'Ô∏è', 'uninviting', 'background-', 'niblings', 'üòÇ', 'lgbt+', 'gt;he', 'gt;my', 'name\\', ';)', 'fianc√®', 'mudpies', 'amp;nbsp', 'she‚Äòs', '-i', 'assholish', 'onlyfans', 'reddits', '\xa0\xa0', "i've", 'justnomil', 'i‚Äôm', 'temperaments', 'airbnbs', '......', 'really-', 'nonbinary', 'y‚Äôall', 'f*ck', 'doordash', 'don‚Äòt', 'catcalled', ':/', 'exwife', 'loomingtales', 'clickbait', 'üôÑ', 'youngish', 'ü§∑', 'girthy-', 'downvote', 'downvoted', 'raisedbyna

How are words that are not included in the vocabulary represented in the numericalised sequences? How many of them exist?

In [17]:
# CAREFUL! Takes ages.

train_set_words_total = set()
train_set_words_not_in_vocab = set()
unknown_word_occurrences = 0
total_word_occurrences = 0
for seq in train_set:
    for word in seq.__dict__['src']:
        train_set_words_total.add(word)
        total_word_occurrences += 1
        if word not in field.vocab.itos:
            train_set_words_not_in_vocab.add(word)
            unknown_word_occurrences += 1
    for word in seq.__dict__['trg']:
        train_set_words_total.add(word)
        total_word_occurrences += 1
        if word not in field.vocab.itos:
            train_set_words_not_in_vocab.add(word)
            unknown_word_occurrences += 1
print()
print()
print("There are {}, (out of the {}), words in the train set occuring a total of {}, (out of {}) times, that are not in the vocabulary and are thus represented by the <unk> token:".format(len(train_set_words_not_in_vocab), len(train_set_words_total), unknown_word_occurrences, total_word_occurrences))
print(train_set_words_not_in_vocab)



There are 56298, (out of the 76298), words in the train set occuring a total of 449199, (out of 104866549) times, that are not in the vocabulary and are thus represented by the <unk> token:


Instantiate training, validation and test loaders.

In [18]:
# Create the dataloaders with batches of samples of similar text lengths to minimise padding required.
train_loader, valid_loader, test_loader = data.BucketIterator.splits((train_set, valid_set, test_set), batch_size=BATCH_SIZE, device="cpu", sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)))

In [19]:
# OPTIONAL: to clean up RAM
del test_loader, test_set

Inspect a batch

In [20]:
batch = next(iter(train_loader))
print("batch.src.shape =", batch.src.shape)
print("batch.trg.shape =", batch.trg.shape)
print(100*"-")
print("The first ten words in each of the first four sequences in the batch of source sequences:")
print(batch.src[0:10, 0:4])
batch_src_part = []
for row_idx, row in enumerate(batch.src[0:10, 0:4]):
    batch_src_part.append([])
    for col_idx, vocab_idx in enumerate(row):
        batch_src_part[row_idx].append(field.vocab.itos[vocab_idx])
for row in batch_src_part:
    print(row)
print()
print("The first ten words in each of the first four sequences in the batch of target sequences:")
print(batch.trg[0:10, 0:4])
batch_trg_part = []
for row_idx, row in enumerate(batch.trg[0:10, 0:4]):
    batch_trg_part.append([])
    for col_idx, vocab_idx in enumerate(row):
        batch_trg_part[row_idx].append(field.vocab.itos[vocab_idx])
for row in batch_trg_part:
    print(row)

batch.src.shape = torch.Size([202, 16])
batch.trg.shape = torch.Size([27, 16])
----------------------------------------------------------------------------------------------------
The first ten words in each of the first four sequences in the batch of source sequences:
tensor([[    2,     2,     2,     2],
        [   39,  2915,    39,    39],
        [   12,    12,    12,    12],
        [  548,    30,   450,  4354],
        [   11,  3400,     8,    11],
        [  239,    10,   244,  2140],
        [  137,   319,   184,   104],
        [    9,    32,  2421,    89],
        [12052,    18,     8,     5],
        [ 1023,    11,    11,   179]])
['<sos>', '<sos>', '<sos>', '<sos>']
['aita', 'aitah', 'aita', 'aita']
['for', 'for', 'for', 'for']
['moving', 'not', 'refusing', 'communicating']
['my', 'answering', 'to', 'my']
['car', 'the', 'give', 'concerns']
['into', 'phone', 'any', 'over']
['a', '?', 'inheritance', 'what']
['coveted', 'in', 'to', 'i']
['parking', 'my', 'my', 'feel']

The fi

# Create the model

Set the hyperparameters needed for creating the models

In [32]:
LOAD_MODEL = True

FINAL_VOCAB_SIZE = len(field.vocab)
HIDDEN_SIZE = 200
NUM_LAYERS = 1
ENC_DROP = 0.5
DEC_DROP = 0.5
IS_ENC_BIDIRECTIONAL = True
TRAIN_EMBEDDINGS = True

TEACHER_FORCE_RATIO = 0.8

Define the architectures of the models

In [33]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, p_drop, is_bidirectional, train_embeddings):
        super(Encoder, self).__init__()

        # Instantiate straightforward attributes
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p_drop)
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.train_embeddings = train_embeddings
        self.is_bidirectional = is_bidirectional
        
        # Freeze the embeddings if desired
        if not self.train_embeddings:
            self.embedding.weight.requires_grad = False

        # Instantiate the lstm of the encoder
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p_drop, bidirectional=self.is_bidirectional)

        # Kind of an fc layer that will pick whether to propagate to attention the info of the forward or the backward direction
        self.fc_hidden = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # same for the cell
        self.fc_cell = nn.Linear(self.hidden_size * 2, self.hidden_size)


    def forward(self, x):                               # x.shape = (seq_length x batch_size)

        embedding = self.dropout(self.embedding(x))     # embedding.shape = (seq_length x batch_size x embedding_size)

        outputs, (hidden, cell) = self.lstm(embedding)  # hidden.shape = ( 2 x batch_size x hidden_size)

        '''
        print("source.shape =", x.shape)
        print("embedded_source.shape =", embedding.shape)
        print("outputs.shape =", outputs.shape)
        print("hidden.shape =", hidden.shape)
        print("cell.shape =", cell.shape)
        '''

        # Concatenate the hidden from the forward with that from the backward layer to express it as (batch_size x hidden_size * 2)
        # and pass it through a linear layer which selects the features from each layer to propagate to the decoder who is not bidirectional.
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))

        '''
        print("After extra layers for hidden/cell:")
        print("hidden.shape =", hidden.shape)
        print("cell.shape =", cell.shape)
        '''

        return outputs, hidden, cell




class Decoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers, p_drop, train_embeddings):
        super(Decoder, self).__init__()

        # Instantiate the straightforward attributes
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p_drop)
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.train_embeddings = train_embeddings

        # Freeze the embeddings if desired
        if not self.train_embeddings:
            self.embedding.weight.requires_grad = False

        # Instantiate the lstm of the decoder
        self.lstm = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers, dropout=p_drop)

        # A learnable layer to compute the similarity between each output state from encoder and the hidden state from previous word of decoder
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

        # This fc layer maps from a hidden vector to a vector giving probs of each word being produced
        self.fc = nn.Linear(hidden_size, output_size)
    

    def forward(self, x, encoder_outputs, hidden, cell):
        sequence_length = encoder_outputs.shape[0]
        x = x.unsqueeze(0)                              # x.shape = (batch_size) but we want (1, N) because this works word by word, we don't send in the entire sentence.

        embedding = self.dropout(self.embedding(x))     # embedding.shape = (1 x batch_size x embedding_size)

        # This is required to be able to add the hidden state from the decoder for each word to the encoder output states
        h_reshaped = hidden.repeat(sequence_length, 1, 1)

        # Here we compute, in a learnable fashion, the similarity between each of the encoder outputs and the decoder hidden state for the previous word. 
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_outputs), dim=2)))    # energy.shape = (seq_length, batch_size, 1)
        attention = self.softmax(energy)                                                    # attention.shape = (seq_length, batch_size, 1)

        '''
        print("Current_target_word.shape =", x.shape)
        print("embedded_target_word.shape =", embedding.shape)
        print("encoder_outputs.shape =", encoder_outputs.shape)
        print("hidden.shape =", hidden.shape)
        print("cell.shape =", cell.shape)
        print()
        print("h_reshaped.shape =", h_reshaped.shape)
        print("energy.shape =", energy.shape)
        print("attention.shape =", attention.shape)
        '''

        # Reshape to do matrix multiplication of attention
        attention = attention.permute(1, 2, 0)                  # attention.shape = (batch_size x 1, seq_len)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)      # encoder_outputs.shape = (batch_size, seq_length, hidden_size)

        # Context vector is a linear combination of encoder outputs where each output contributes according to its similarity with the decoder's hidden state for the previous target word
        context_vector = torch.bmm(attention, encoder_outputs).permute(1, 0, 2)     # (batch_size x 1 x hidden_size * 2) --> (1 x batch_size x hidden_size * 2)

        # Feed the lstm with both the current word's embedding and the 
        lstm_input = torch.cat((context_vector, embedding), dim=2)

        # output hidden and cell will be used for next word
        outputs, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))      # outputs.shape = (1, batch_size, hidden_size)

        predictions = self.fc(outputs)                                      # predictions.shape = (1, batch_size, length_of_vocab)

        predictions = predictions.squeeze(0)

        '''
        print("context_vector.shape =", context_vector.shape)
        print("lstm_input.shape =", lstm_input.shape)
        print("outputs.shape =", outputs.shape)
        print("hidden.shape =", hidden.shape)
        print("cell.shape =", cell.shape)
        print("predictions_shape =", predictions.shape)
        '''

        return predictions, hidden, cell



class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, eng, teacher_force_ratio):
        super(Seq2Seq, self).__init__()

        # Instantiate the straightforward attributes
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = len(eng.vocab)
        self.teacher_force_ratio = teacher_force_ratio


    def forward(self, source, target):

        batch_size = source.shape[1]        # source.shape = (source_seq_length x batch_size)
        target_len = target.shape[0]        # target.shape = (target_seq_length x batch_size)

        outputs = torch.zeros(target_len, batch_size, self.vocab_size).to(device)

        encoder_outputs, hidden, cell = self.encoder(source)

        # Grab start token
        x = target[0]

        '''
        print("We send 1 by 1 the target words through the decoder")
        '''

        # word by word, keep sending to the decoder: x, hidden, cell
        for t in range(1, target_len):
            
            # Note how encoder outputs are always sent in, at each time step
            output, hidden, cell = self.decoder(x, encoder_outputs, hidden, cell)        # output.shape = (batch_size, vocab_size)

            outputs[t] = output

            best_guess = output.argmax(1)

            # sometimes teacher-force, others send the word from the prediction
            x = target[t] if random.random() < self.teacher_force_ratio else best_guess

        return outputs


    def forward_enc(self, source):

        hidden, cell = self.encoder(source)

        return hidden, cell


Use GPU if available

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


Instantiate encoder and decoder and put them together in the seq2seq model

In [45]:
if not LOAD_MODEL:
    encoder_net = Encoder(FINAL_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS, ENC_DROP, IS_ENC_BIDIRECTIONAL, TRAIN_EMBEDDINGS).to(device)
    # Copy the glove embeddings used in eng.vocab over to the encoder embeddings
    encoder_net.embedding.weight.data.copy_(field.vocab.vectors)

    decoder_net = Decoder(FINAL_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, FINAL_VOCAB_SIZE, NUM_LAYERS, DEC_DROP, TRAIN_EMBEDDINGS).to(device)
    # Copy the glove embeddings used in eng.vocab over to the decoder embeddings
    decoder_net.embedding.weight.data.copy_(field.vocab.vectors)

    model = Seq2Seq(encoder_net, decoder_net, field, TEACHER_FORCE_RATIO).to(device)
else:
    model = torch.load('/content/drive/MyDrive/MastersProject/seq2seq/SavedSeq2seqModels/24-08_full_set.pt', map_location=torch.device('cpu'))
    model = model.to(device)
    print("Loaded model!")

print()
print(model)
print()
model_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("The total number of trainable parameters in the classifier is: {}".format(model_trainable_params))
print()

Loaded model!

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(20004, 50)
    (lstm): LSTM(50, 200, dropout=0.5, bidirectional=True)
    (fc_hidden): Linear(in_features=400, out_features=200, bias=True)
    (fc_cell): Linear(in_features=400, out_features=200, bias=True)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(20004, 50)
    (lstm): LSTM(450, 200, dropout=0.5)
    (energy): Linear(in_features=600, out_features=1, bias=True)
    (softmax): Softmax(dim=0)
    (relu): ReLU()
    (fc): Linear(in_features=200, out_features=20004, bias=True)
  )
)

The total number of trainable parameters in the classifier is: 7107005



Make sure that encoder and decoder have adopted the Glove embeddings of the vocab

In [46]:
print("The embeddings of the encoder:")
print(model.encoder.embedding.weight)
print()
print("The embeddings of the decoder:")
print(model.decoder.embedding.weight)

The embeddings of the encoder:
Parameter containing:
tensor([[ 0.1296,  0.2365,  0.2682,  ..., -0.1173, -0.3593, -0.0701],
        [ 0.1350, -0.2346, -0.0235,  ...,  0.0084, -0.5344,  0.1271],
        [-0.0092, -0.0057,  0.0319,  ...,  0.0179, -0.0147, -0.0079],
        ...,
        [-0.2731,  0.2914,  0.5772,  ...,  0.3922, -0.6087, -0.9670],
        [ 0.0327,  0.6515, -0.9867,  ...,  0.2915,  0.1913, -0.0215],
        [-1.7123, -0.2272,  0.1827,  ..., -0.6166,  1.2670, -0.6961]],
       device='cuda:0', requires_grad=True)

The embeddings of the decoder:
Parameter containing:
tensor([[ 0.2385,  0.0499, -0.0295,  ...,  0.2763,  0.0258,  0.1895],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1060, -0.1295, -0.3307,  ..., -0.0518,  0.1851,  0.0165],
        ...,
        [-0.7532,  0.0109,  0.3165,  ...,  0.1377, -0.5730, -0.7398],
        [ 0.1027,  0.8154, -0.8668,  ..., -0.0197,  0.1052, -0.0235],
        [-1.4827, -0.1345,  0.2061,  ..., -0.0926,  

# Train

Set the hyperparameters required for training

In [47]:
LR = 0.0002
NUM_EPOCHS = 2

Define a couple of helper function for the training

In [48]:
def generate_example_comment(model, batch, field, num_coms_to_gen=2):
    '''
    Function that takes in a trained model and a batch of posts with comments and generates a few new comments from posts
    '''
    src = batch.src.to(device)
    trg = batch.trg.to(device)

    model = model.eval()
    with torch.no_grad():
        outputs = model(src, trg)

    for i in range(0, num_coms_to_gen):
        output_seq_ex = outputs[:, i, :]
        best_guess_for_ex_seq = [int(row.argmax()) for row in output_seq_ex]
        print([field.vocab.itos[idx] for idx in best_guess_for_ex_seq])
    
    pass


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

Define a function that trains a model for one epoch

In [49]:
def train_epoch(model, dataloader, optimiser, criterion, device, print_on=False):
    '''
    Function that performs one epoch (one pass through each sample in the given loader) of training of the given model.
    '''

    model = model.train()

    # Create some things for storing and calculating training metrics on a per epoch level
    losses = []
    # Create some lists for storing and calculating training metrics on a per some batches level
    many_batches_losses = []

    for i, batch in enumerate(dataloader):

        source_batch = batch.src.to(device)
        target_batch = batch.trg.to(device)

        '''
        gpu_info = !nvidia-smi
        gpu_info = '\n'.join(gpu_info)
        print(gpu_info)
        print(100 * "-")
        '''

        output_batch = model(source_batch, target_batch)
        del source_batch

        '''
        gpu_info = !nvidia-smi
        gpu_info = '\n'.join(gpu_info)
        print(gpu_info)
        print(100 * "-")
        '''

        # output.shape = (trg_len, batch_size, output_dim)
        # target.shape = (trg_len, batch_size)
        # we change the shape to suit the loss fn. We do not send the start token.

        output_batch = output_batch[1:].reshape(-1, output_batch.shape[2])
  
        target_batch = target_batch[1:].reshape(-1)

        optimiser.zero_grad()
        loss = criterion(output_batch, target_batch)
        losses.append(loss.item())
        many_batches_losses.append(loss.item())
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimiser.step()

        torch.cuda.empty_cache()
        del target_batch, output_batch

        if print_on:
            # Every so often print the current training accuracy
            if (i + 1) % 200 == 0:
                print("...Batch #{} : Training Loss={}".format(i + 1, sum(many_batches_losses) / len(many_batches_losses)))
                many_batches_losses = []

    # Return the training accuracy and the mean training loss for the given epoch
    return np.mean(losses)

In [50]:
def eval_epoch(model, dataloader, criterion, device, print_on=False):
    '''
    Function that performs one epoch (one pass through each sample in the given loader) of evaluation of the given model.
    '''

    model = model.eval()

    # Create some things for storing and calculating training metrics on a per epoch level
    losses = []

    for i, batch in enumerate(dataloader):

        source_batch = batch.src.to(device)
        target_batch = batch.trg.to(device)

        '''
        gpu_info = !nvidia-smi
        gpu_info = '\n'.join(gpu_info)
        print(gpu_info)
        print(100 * "-")
        '''
        
        with torch.no_grad():
            output_batch = model(source_batch, target_batch)
        del source_batch

        '''
        gpu_info = !nvidia-smi
        gpu_info = '\n'.join(gpu_info)
        print(gpu_info)
        print(100 * "-")
        '''

        # output.shape = (trg_len, batch_size, output_dim)
        # target.shape = (trg_len, batch_size)
        # we change the shape to suit the loss fn. We do not send the start token.
        output_batch = output_batch[1:].reshape(-1, output_batch.shape[2])
        target_batch = target_batch[1:].reshape(-1)

        loss = criterion(output_batch, target_batch)
        losses.append(loss.item())

        torch.cuda.empty_cache()
        del target_batch, output_batch

    # Return the training accuracy and the mean training loss for the given epoch
    return np.mean(losses)

Perform the training for several epochs

In [51]:
pad_idx = field.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimiser = optim.Adam(model.parameters(), lr=LR)

# Create lists to store the six metrics to be plotted later
train_losses = []

for epoch in range(1, NUM_EPOCHS + 1):
    print(f'Epoch [{epoch} / {NUM_EPOCHS}]')

    # Try to generate a comment from a post in the validation set as an evaluation
    batch = next(iter(valid_loader))
    generate_example_comment(model, batch, field, num_coms_to_gen=2)

    # Perform 1 epoch of training, store and report the relevant training metrics
    train_loss = train_epoch(model, train_loader, optimiser, criterion, device, print_on=True)
    print("Train: loss {}".format(train_loss))

    # Perform one epoch of validation and report the validation loss
    valid_loss = eval_epoch(model, valid_loader, criterion, device, print_on=True)
    print("Valid: loss {}".format(valid_loss))

# Try to generate a comment from a post in the validation set as an evaluation
batch = next(iter(valid_loader))
generate_example_comment(model, batch, field, num_coms_to_gen=2)

Epoch [1 / 2]
['<unk>', ' ', 'you', "'s", 'not', 'of', 'the', 'business', 'of', 'the', 'to', 'you', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>']
['<unk>', ' ', 'you', 'you', 'is', 'a', 'of', 'the', 'people', '<unk>', '"', 'other', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>']
...Batch #200 : Training Loss=4.463541667461396
...Batch #400 : Training Loss=4.469241734743118
...Batch #600 : Training Loss=4.460330945253372
...Batch #800 : Training Loss=4.459378703832626
...Batch #1000 : Training Loss=4.476839512586594
...Batch #1200 : Training Loss=4.479264371395111
...Batch #1400 : Training Loss=4.4915511000156405
...Batch #1600 : Training Loss=4.482216790914536
...Batch #1800 : Training Loss=4.460624967813492
...Batch #2000 : Training Loss=4.452579138278961
...Batch #2200 : Training Loss=4.478443877696991
...Batch

Save the trained seq2seq model and the field object with vocab

In [None]:
with torch.no_grad():
    torch.save(model, '/content/drive/MyDrive/MastersProject/seq2seq/SavedSeq2seqModels/25-08_full_set.pt')

torch.save(field, '/content/drive/MyDrive/MastersProject/seq2seq/SavedFieldsEng/25-08_full_set.pt')

# Classify Encoder Ouputs

In [None]:
class EncoderOutputClassifier(nn.Module):
    '''
    The fully connected network classifier that takes as input the hidden and cell state outputs that the encoder spits
    when fed with posts, and classifies them into assholes or sweethearts.
    '''

    def __init__(self, feats_in, n_neurons_1, n_neurons_2, n_neurons_3, dropout_proportion):
        super(EncoderOutputClassifier, self).__init__()

        # Instantiate straight forward attributes
        self.feats_in = feats_in
        self.n_neurons_1 = n_neurons_1
        self.n_neurons_2 = n_neurons_2
        self.n_neurons_3 = n_neurons_3
        self.p_dropout = dropout_proportion

        # Determine the number of layers based on the given neurons for each layer prior to the last which by default has 1 neuron.
        if self.n_neurons_1 == 0:
            self.num_layers = 1
        elif self.n_neurons_2 == 0:
            self.num_layers = 2
        elif self.n_neurons_3 == 0:
            self.num_layers = 3
        else:
            self.num_layers = 5

            # Structure the architecture of the network depending on the number of layers and their number of neurons
            if self.num_layers == 1:

                self.classifier = nn.Sequential(
                    nn.Linear(self.feats_in, 1),
                    nn.Sigmoid(),
                    # nn.Dropout(p=self.dropout_proportion)
                )

            elif self.num_layers == 2:

                self.classifier = nn.Sequential(
                    nn.Linear(self.feats_in, self.n_neurons_1),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_1),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_1, 1),
                    nn.Sigmoid(),
                    # nn.Dropout(p=self.dropout_proportion)
                )

            elif self.num_layers == 3:

                self.classifier = nn.Sequential(
                    nn.Linear(self.feats_in, self.n_neurons_1),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_1),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_1, self.n_neurons_2),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_2),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_2, 1),
                    nn.Sigmoid(),
                    # nn.Dropout(p=self.dropout_proportion)
                )

            elif self.num_layers == 4:

                self.classifier = nn.Sequential(
                    nn.Linear(self.feats_in, self.n_neurons_1),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_1),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_1, self.n_neurons_2),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_2),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_2, self.n_neurons_3),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_3),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_3, 1),
                    nn.Sigmoid(),
                    # nn.Dropout(p=self.dropout_proportion)
                )

            elif self.num_layers == 5:

                self.classifier = nn.Sequential(
                    nn.Linear(self.feats_in, self.n_neurons_1),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_1),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_1, self.n_neurons_2),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_2),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_2, self.n_neurons_3),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_3),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_3, self.n_neurons_4),
                    nn.ReLU(),
                    # nn.BatchNorm1d(self.n_neurons_4),
                    # nn.Dropout(p=self.dropout_proportion),

                    nn.Linear(self.n_neurons_4, 1),
                    nn.Sigmoid(),
                    # nn.Dropout(p=self.dropout_proportion)
                )

    def forward(self, hidden, cell):
        print(hidden.shape)
        hidden = hidden.permute(1, 0, 2)
        cell = cell.permute(1, 0, 2)
        print(hidden.shape)
        batch_size = hidden.shape[0]

        hidden = hidden.view(batch_size, -1)
        cell = cell.view(batch_size, -1)
        print(hidden.shape)
        X = torch.cat((hidden, cell), 1)
        print(X.shape)

        out = self.classifier(X)
        print(out.shape)
        out = out.view(-1)
        print(out.shape)

        return out

In [None]:
df = pd.read_csv('drive/MyDrive/MastersProject/data/aita_clean.csv')
df['text'] = df["title"] + " " + df["body"].fillna("")
df.drop(columns=["id", "timestamp", "title", "body", "edited", "verdict", "score", "num_comments"], inplace=True)

In [None]:
df_train, df_valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=RANDOM_SEED)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, shuffle=True, random_state=RANDOM_SEED)

df_train.to_csv("drive/MyDrive/MastersProject/data/aita_clean_to_classify_from_gen_train.csv", index=True)
df_valid.to_csv("drive/MyDrive/MastersProject/data/aita_clean_to_classify_from_gen_valid.csv", index=True)
df_test.to_csv("drive/MyDrive/MastersProject/data/aita_clean_to_classify_from_gen_test.csv", index=True)

In [None]:
labels_field = data.RawField(preprocessing=None, postprocessing=None, is_target=True)
# Instantiate a field object that builds a vocab from a corpus and can then preprocess, tokenise and numericalise sentences by indexing the vocab.
# TODO: Understand better exactly the steps involved in creating and utilising a field object
fields = {'text': ('post_text', eng), 'is_asshole': ('is_asshole', labels_field)}

In [None]:
train_set, test_set, valid_set = data.TabularDataset.splits(path="drive/MyDrive/MastersProject/data",
                                                            train="aita_clean_to_classify_from_gen_train.csv",
                                                            validation="aita_clean_to_classify_from_gen_valid.csv",
                                                            test="aita_clean_to_classify_from_gen_test.csv",
                                                            format="csv",
                                                            fields=fields)
train_loader, valid_loader, test_loader = data.BucketIterator.splits((train_set, valid_set, test_set), batch_size=BATCH_SIZE, device="cpu")

In [None]:
batch = next(iter(train_loader))
print("batch.post_text.shape =", batch.post_text.shape)
print(100*"-")
print("The first ten words in each of the first four sequences in the batch of source sequences:")
print(batch.post_text[0:10, 0:4])
batch_post_text_part = []
for row_idx, row in enumerate(batch.post_text[0:10, 0:4]):
    batch_post_text_part.append([])
    for col_idx, vocab_idx in enumerate(row):
        batch_post_text_part[row_idx].append(eng.vocab.itos[vocab_idx])
for row in batch_post_text_part:
    print(row)
print()
print("A batch of targets:")
print(batch.is_asshole)

batch.post_text.shape = torch.Size([758, 32])
----------------------------------------------------------------------------------------------------
The first ten words in each of the first four sequences in the batch of source sequences:
tensor([[   2,    2,    2,    2],
        [  63,  201,   63,   63],
        [  14,   38,   14,   14],
        [ 141,    5,   28,  446],
        [1717,  184,  185, 4310],
        [ 143,  240, 5689,   11],
        [   5,  276,   11,  110],
        [  64,  102,  243, 2613],
        [   9,   23,   89,   35],
        [3025,   49,   36,    5]])
['<sos>', '<sos>', '<sos>', '<sos>']
['aita', 'wibta', 'aita', 'aita']
['for', 'if', 'for', 'for']
['‚Äú', 'i', 'not', 'completely']
['demanding', 'let', 'saying', 'abandoning']
['‚Äù', 'another', 'thanking', 'my']
['i', 'girl', 'my', 'work']
['get', 'who', 'best', 'responsibilities']
['the', 'is', 'friend', '?']
['xbox', 'n‚Äôt', "'s", 'i']

A batch of targets:
['0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '

In [None]:
def train_class_of_encs_epoch(model, seq2seq, dataloader, optimiser, device, print_on=False):
    '''
    Function that performs one epoch (one pass through each sample in the given loader) of training of the given model.
    '''

    seq2seq = seq2seq.eval()
    model = model.train()

    # Create some things for storing and calculating training metrics on a per epoch level
    losses = []
    correct_predictions = 0
    total_predictions = 0

    # Create some lists for storing and calculating training metrics on a per some batches level
    many_batches_losses = []
    many_batches_accs = []
    for i, batch in enumerate(dataloader):

        # Isolate the numpy arrays from the current batch that are needed for training
        source_batch = batch.post_text.to(device)
        y = batch.is_asshole.to(device)

        # Pass the texts through the trained seq2seq model
        with torch.no_grad():
            hidden, cell = seq2seq.forward_enc(source_batch)
        del source_batch

        print("y")
        print(y.shape)
        print(y)
        print("hidden")
        print(hidden)
        print(hidden.shape)

        '''
        # Pass the batch through the classifier (output layers)
        y_out = model(hidden, cell)

        # Binarise output probs to predictions in {0, 1}
        y_preds = y_out.detach()
        y_preds = torch.where(y_preds > 0.5, 1, 0)
        correct_predictions += int(torch.sum(y_preds == y))
        total_predictions += len(y_preds)
        many_batches_accs.append(torch.sum(y_preds == y) / len(y_preds))

        # Get the mean loss for the batch
        loss_fn = nn.BCELoss(reduction="mean").to(device)
        loss = loss_fn(y_out, y.float())
        losses.append(loss.item())
        many_batches_losses.append(loss.item())
        loss.backward()
      
        # Not sure why I am clipping the grad here. Apparently it helps prevent exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Take an optimisation step
        optimiser.step()
        optimiser.zero_grad()

        if print_on:
        # Every so often print the current training accuracy
            if (i + 1) % 100 == 0:
                print("...Batch #{} : Training Loss={}, Training Accuracy={}".format(i + 1, sum(many_batches_losses) / len(many_batches_losses), sum(many_batches_accs) / len(many_batches_accs)))
                many_batches_losses = []
                many_batches_accs = []

    # Return the training accuracy and the mean training loss for the given epoch
    return correct_predictions / total_predictions, np.mean(losses)'''

In [None]:
optimiser = optim.Adam(model.parameters(), lr=LR)
train_class_of_encs_epoch(model, model, train_loader, optimiser, device, print_on=False)

y
torch.Size([3, 32])
tensor([[   2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2],
        [2936,  384, 2936,  384, 2936,  384, 2936, 2936,  384, 2936, 2936, 2936,
         2936, 2936, 2936, 2936, 2936, 2936, 2936, 2936, 2936, 2936, 2936, 2936,
         2936,  384,  384, 2936, 2936, 2936,  384, 2936],
        [   3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3]], device='cuda:0')
hidden
tensor([[[ 7.5251e-01,  9.9992e-01,  9.9927e-01,  ...,  9.4138e-03,
          -9.9956e-01,  4.3989e-07],
         [ 7.5253e-01,  9.9992e-01,  9.9927e-01,  ...,  9.4061e-03,
          -9.9956e-01,  4.3863e-07],
         [ 7.5257e-01,  9.9992e-01,  9.9927e-01,  ...,  9.4065e-03,
          -9.99

KeyboardInterrupt: ignored