In [2]:
import pickle
import json
import re
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch import nn

In [3]:
# Load the scraped training examples from disk.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at files you produced yourself or otherwise trust.
with open('../../data/reddit/bbc_news_scrape_raw.pkl', 'rb') as training_file:
    # The context manager closes the file handle even if unpickling fails.
    training_data = pickle.load(training_file)
# Show the fields available on a single example.
training_data[0].keys()

dict_keys(['post_id', 'comment_id', 'url', 'ancestors', 'text', 'full_context'])

In [4]:
'''
Frame the task as a retrieval problem.

A comment thread is a chain c1 --> c2 --> ... --> cn.
The link in cn leads to an article with paragraphs p1 --> p2 --> ... --> pn.

Simple approach:

- take the beginning of the article and encode it with an IR model
- run an LSTM over the comments (embeddings initialized with BERT?)
'''
# Pretrained encoders. The tokenizer and the masked-LM model are loaded from
# the same DistilBERT checkpoint, so the checkpoint name is written only once.
bert_checkpoint = "distilbert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_checkpoint)

# Sentence encoder used below to embed both the comment context and the article text.
sbert_model = SentenceTransformer('msmarco-distilbert-cos-v5')

In [52]:
# Embed every training example with SBERT.
#   X[i]: embedding of the example's 'full_context'
#   Y[i]: embedding of the example's 'text' items joined with single spaces
# NOTE(review): presumably 'full_context' is a list of comments (one embedding
# row per comment) and 'text' is a list of article paragraphs -- confirm
# against the scraper that produced the pickle.
X, Y = [], []
for example in training_data:
    context_embedding = sbert_model.encode(example['full_context'], convert_to_tensor=True)
    X.append(context_embedding)

    article_text = " ".join(example['text'])
    target_embedding = sbert_model.encode(article_text, convert_to_tensor=True)
    Y.append(target_embedding)

In [124]:
class URLSTM(nn.Module):
    """Thin wrapper around a multi-layer ``nn.LSTM``.

    Args:
        input_size: number of features in each input step.
        hidden_size: number of features in the LSTM hidden/cell state
            (and therefore in each output step).
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super(URLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Default nn.LSTM layout (batch_first=False): input is (seq, batch, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)

    def forward(self, x, prev_state):
        """Run the LSTM over ``x`` starting from ``prev_state``.

        Args:
            x: input tensor passed straight to ``nn.LSTM``.
            prev_state: ``(h_0, c_0)`` tuple, e.g. from :meth:`init_state`.

        Returns:
            ``(output, (h_n, c_n))`` exactly as returned by ``nn.LSTM``.
        """
        output, state = self.lstm(x, prev_state)
        return output, state

    def init_state(self, sequence_length, batch_size=1):
        """Return zero-filled ``(h_0, c_0)`` tensors for a fresh sequence.

        Args:
            sequence_length: accepted for call-site compatibility; the state
                shape does not depend on it.
            batch_size: batch dimension of the state (defaults to 1, matching
                the previous hard-coded value).

        Returns:
            Tuple of two tensors of shape
            ``(num_layers, batch_size, hidden_size)`` on the same device and
            with the same dtype as the model's parameters.
        """
        # The state width is hidden_size, not input_size; the two only
        # coincide when the model is built with input_size == hidden_size.
        shape = (self.num_layers, batch_size, self.hidden_size)
        reference = next(self.parameters())
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

# LSTM hyperparameters.
# hidden_size has to equal the width of the target embeddings because the
# training loop compares the last LSTM output directly against y.
# NOTE(review): 768 is presumably the SBERT embedding width -- confirm.
input_size = 768
hidden_size = 768
num_layers = 5

# Use the first GPU when one is available; otherwise stay on the CPU instead
# of failing on a hard-coded 'cuda:0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = URLSTM(input_size, hidden_size, num_layers)
# Last expression of the cell, so the notebook still displays the module summary.
model.to(device)


URLSTM(
  (lstm): LSTM(768, 768, num_layers=5)
)

In [125]:
# Optimisation setup: cosine-embedding loss minimised with Adam.
# NOTE(review): 0.1 is two orders of magnitude above Adam's default of 1e-3 --
# confirm this step size is intentional.
learning_rate = 0.1
loss_fn = nn.CosineEmbeddingLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train one example at a time: feed the context embeddings through the LSTM
# and pull its final output towards the target embedding.

# Run on whatever device the model already lives on instead of a hard-coded 'cuda:0'.
device = next(model.parameters()).device

# CosineEmbeddingLoss target of 1 means "these two vectors should be similar".
# It never changes, so build it once outside the loops.
condition = torch.tensor(1, device=device)

for epoch in range(100):
    total_loss = 0
    for x, y in zip(X, Y):
        # Fresh zero state per example; nothing is carried between examples,
        # so there is no need to detach the returned state afterwards.
        state_h, state_c = model.init_state(len(x))
        state_h = state_h.to(device)
        state_c = state_c.to(device)

        # unsqueeze adds a batch axis of size 1 at position 1.
        # NOTE(review): assumes x is (n_items, embedding_dim) -- confirm.
        inputs = torch.unsqueeze(x, 1).to(device)
        pred, _ = model(inputs, (state_h, state_c))

        # pred[-1][0]: output at the last step for the single batch element.
        loss = loss_fn(pred[-1][0], y.to(device), condition)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean per-example loss for this epoch.
    print(total_loss / len(X))

In [None]:
# Transformer alternative to the LSTM model.
# The earlier version flattened the encoder output, which merged the batch and
# feature axes (so the first linear layer only accepted batch size 1) and never
# reduced the sequence to a single vector. The sequence is now mean-pooled.

class URLRet(nn.Module):
    """Transformer encoder followed by a two-layer head producing a 768-d vector.

    Args:
        d_model: feature width of the input sequence and of the encoder.
        nhead: number of attention heads in each encoder layer.
        dim_feedforward: width of the feed-forward sublayer in each encoder layer.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, d_model, nhead, dim_feedforward, num_layers):
        super(URLRet, self).__init__()

        # Default layout (batch_first=False): inputs are (seq, batch, d_model).
        self.encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=dim_feedforward)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.f1 = nn.Linear(d_model, 1024)
        self.f2 = nn.Linear(1024, 768)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Encode a sequence and return one vector per sequence.

        Args:
            x: tensor of shape ``(seq, batch, d_model)``.

        Returns:
            Tensor of shape ``(batch, 768)`` with every value in [0, 1].
        """
        x = self.encoder(x)
        # Mean-pool over the sequence axis (dim 0) so each sequence collapses
        # to a single d_model-wide vector regardless of its length.
        x = x.mean(dim=0)

        x = self.f1(x)
        x = self.relu(x)
        x = self.f2(x)
        # NOTE(review): the sigmoid keeps every output component non-negative;
        # if this model is trained against embeddings that contain negative
        # components, consider dropping it -- confirm the intended target.
        x = self.sigmoid(x)
        return x

# Transformer hyperparameters.
# NOTE: this cell rebinds `num_layers` and `model`, replacing the LSTM values
# defined in earlier cells.
d_model = 768
nhead = 4
dim_feedforward = 2048
num_layers = 2

# Use the first GPU when one is available; otherwise stay on the CPU instead
# of failing on a hard-coded 'cuda:0'.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Pass num_layers rather than repeating the literal so the setting above is
# the single source of truth.
model = URLRet(d_model, nhead, dim_feedforward, num_layers)
# Last expression of the cell, so the notebook displays the module summary.
model.to(device)