In [2]:
import torch
import json
import transformers
from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup

In [3]:
with open(r"D:\Work\Baseline_V1\data_task_1_train\data_task_1_train.json") as f:
    data = json.load(f)

In [4]:
import random
random.choice(data)

{'question_id': '3e9esy',
 'question': "With home backup batteries gaining some momentum through products like the Tesla Powerwall, why aren't flywheel batteries being discussed as an alternative?",
 'answers': ["Flywheels big enough to store power for a house are big heavy things that spin very, very fast and contain a huge amount of energy. Like enough energy to launch big pieces of metal clear into the next county if they get unbalanced.\n\nThat's why you don't see them suggested as an alternative for home use."],
 'ctxs': ['PV systems, due to their high reliability, low self discharge and investment and maintenance costs, despite shorter lifetime and lower energy density. Lithium-ion batteries have the potential to replace lead-acid batteries in the near future, as they are being intensively developed and lower prices are expected due to economies of scale provided by large production facilities such as the Gigafactory 1. In addition, the Li-ion batteries of plug-in electric cars m

In [7]:
def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=128, device="cuda:0"):
    a_toks = tokenizer.batch_encode_plus(passages, max_length=max_length, pad_to_max_length=True)
    a_ids, a_mask = (
        torch.LongTensor(a_toks["input_ids"]).to(device),
        torch.LongTensor(a_toks["attention_mask"]).to(device),
    )
    with torch.no_grad():
        a_reps = qa_embedder.embed_answers(a_ids, a_mask).cpu().type(torch.float)
    return a_reps.numpy()

def embed_questions_for_retrieval(q_ls, tokenizer, qa_embedder, device="cuda:0"):
    q_toks = tokenizer.batch_encode_plus(q_ls, max_length=128, pad_to_max_length=True)
    q_ids, q_mask = (
        torch.LongTensor(q_toks["input_ids"]).to(device),
        torch.LongTensor(q_toks["attention_mask"]).to(device),
    )
    with torch.no_grad():
        q_reps = qa_embedder.embed_questions(q_ids, q_mask).cpu().type(torch.float)
    return q_reps.numpy()

In [7]:
questions = [sample['question'] for sample in data]

In [6]:
qar_model = AutoModel.from_pretrained('yjernite/retribert-base-uncased').to('cuda:0')
_ = qar_model.eval()

Some weights of RetriBertModel were not initialized from the model checkpoint at yjernite/retribert-base-uncased and are newly initialized: ['bert_query.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class RetrievalQAEmbedder(torch.nn.Module):
    def __init__(self, sent_encoder, dim):
        super(RetrievalQAEmbedder, self).__init__()
        self.sent_encoder = sent_encoder
        self.output_dim = 128
        self.project_q = torch.nn.Linear(dim, self.output_dim, bias=False)
        self.project_a = torch.nn.Linear(dim, self.output_dim, bias=False)
        self.ce_loss = torch.nn.CrossEntropyLoss(reduction="mean")

    def embed_sentences_checkpointed(self, input_ids, attention_mask, checkpoint_batch_size=-1):
        # reproduces BERT forward pass with checkpointing
        if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
            return self.sent_encoder(input_ids, attention_mask=attention_mask)[1]
        else:
            # prepare implicit variables
            device = input_ids.device
            input_shape = input_ids.size()
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
            head_mask = [None] * self.sent_encoder.config.num_hidden_layers
            extended_attention_mask: torch.Tensor = self.sent_encoder.get_extended_attention_mask(
                attention_mask, input_shape, device
            )

            # define function for checkpointing
            def partial_encode(*inputs):
                encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
                sequence_output = encoder_outputs[0]
                pooled_output = self.sent_encoder.pooler(sequence_output)
                return pooled_output

            # run embedding layer on everything at once
            embedding_output = self.sent_encoder.embeddings(
                input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, inputs_embeds=None
            )
            # run encoding and pooling on one mini-batch at a time
            pooled_output_list = []
            for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)):
                b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
                b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
                pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask)
                pooled_output_list.append(pooled_output)
            return torch.cat(pooled_output_list, dim=0)

    def embed_questions(self, q_ids, q_mask, checkpoint_batch_size=-1):
        q_reps = self.embed_sentences_checkpointed(q_ids, q_mask, checkpoint_batch_size)
        return self.project_q(q_reps)

    def embed_answers(self, a_ids, a_mask, checkpoint_batch_size=-1):
        a_reps = self.embed_sentences_checkpointed(a_ids, a_mask, checkpoint_batch_size)
        return self.project_a(a_reps)

    def forward(self, q_ids, q_mask, a_ids, a_mask, checkpoint_batch_size=-1):
        device = q_ids.device
        q_reps = self.embed_questions(q_ids, q_mask, checkpoint_batch_size)
        a_reps = self.embed_answers(a_ids, a_mask, checkpoint_batch_size)
        compare_scores = torch.mm(q_reps, a_reps.t())
        loss_qa = self.ce_loss(compare_scores, torch.arange(compare_scores.shape[1]).to(device))
        loss_aq = self.ce_loss(compare_scores.t(), torch.arange(compare_scores.shape[0]).to(device))
        loss = (loss_qa + loss_aq) / 2
        return loss


In [3]:
def make_qa_retriever_model(model_name="google/bert_uncased_L-8_H-512_A-8", from_file=None, device="cuda:0"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name).to(device)
    # run bert_model on a dummy batch to get output dimension
    d_ids = torch.LongTensor(
        [[bert_model.config.bos_token_id if bert_model.config.bos_token_id is not None else 1]]
    ).to(device)
    d_mask = torch.LongTensor([[1]]).to(device)
    sent_dim = bert_model(d_ids, attention_mask=d_mask)[1].shape[-1]
    qa_embedder = RetrievalQAEmbedder(bert_model, sent_dim).to(device)
    if from_file is not None:
        param_dict = torch.load(from_file)  # has model weights, optimizer, and scheduler states
        qa_embedder.load_state_dict(param_dict["model"])
    return tokenizer, qa_embedder

In [4]:
def query_qa_dense_index(
    question, qa_embedder, tokenizer, wiki_passages, wiki_index, n_results=10, min_length=20, device="cuda:0"
):
    q_rep = embed_questions_for_retrieval([question], tokenizer, qa_embedder, device=device)
    D, I = wiki_index.search(q_rep, 2 * n_results)
    res_passages = [wiki_passages[int(i)] for i in I[0]]
    support_doc = "<P> " + " <P> ".join([p["passage_text"] for p in res_passages])
    res_list = [dict([(k, p[k]) for k in wiki_passages.column_names]) for p in res_passages]
    res_list = [res for res in res_list if len(res["passage_text"].split()) > min_length][:n_results]
    for r, sc in zip(res_list, D[0]):
        r["score"] = float(sc)
    return support_doc, res_list

In [11]:
# q_rep = embed_questions_for_retrieval(questions, tokenizer, qa_embedder)

NameError: name 'embed_questions_for_retrieval' is not defined

In [8]:
import json
import nlp
import pandas as pd
from IPython.display import display
import torch
from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer,  get_linear_schedule_with_warmup

# with open(r'D:\Work\Baseline_V1\data_task_1_train\ELI5_ori_processed\ELI5_train_10_doc.json') as f:
#     train = json.load(f)
    
with open(r'D:\Work\Baseline_V1\data_task_1_train\ELI5_ori_processed\ELI5_val_10_doc.json') as f:
    val = json.load(f)

#--------------------------------------------------------------------------------------------------

with open(r'D:\Work\Baseline_V1\pred_ref_t5.json', 'r')  as f:
    pred_ref_t5 = json.load(f)

with open(r'D:\Work\Baseline_V1\prediction_t5_2.json', 'r')  as f:
    predict_t5 = json.load(f)

predicted_load = pred_ref_t5[0]
reference_load = pred_ref_t5[1]
reference_load_5samp = reference_load[0:5]


for i, sample in enumerate(reference_load_5samp):
    reference_load_5samp[i] = sample[0]

print("------------")
print("Prediction's len: {}, Reference's len: {}".format(len(predict_t5), len(reference_load_5samp)))
print("------------")

nlp_rouge = nlp.load_metric('rouge')
scores = nlp_rouge.compute(
    predict_t5, reference_load_5samp,
    rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
    use_agregator=True, use_stemmer=False
)
df = pd.DataFrame({
    'rouge1': [scores['rouge1'].mid.precision, scores['rouge1'].mid.recall, scores['rouge1'].mid.fmeasure],
    'rouge2': [scores['rouge2'].mid.precision, scores['rouge2'].mid.recall, scores['rouge2'].mid.fmeasure],
    'rougeL': [scores['rougeL'].mid.precision, scores['rougeL'].mid.recall, scores['rougeL'].mid.fmeasure],
}, index=[ 'P', 'R', 'F'])
df.style.format({'rouge1': "{:.4f}", 'rouge2': "{:.4f}", 'rougeL': "{:.4f}"})
display(df)

# idx = 2
# print("Question:   {}".format(val[2]['question']))
# print("Prediction: {}".format(predicted_load[idx]))
# print("Reference:  {}".format(reference_load[idx]))


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Work\\Baseline_V1\\data_task_1_train\\ELI5_ori_processed\\ELI5_val_10_doc.json'

In [10]:
idx = 5
print("{} \n {}".format(predicted_load[idx], reference_load_5samp[idx]))

IndexError: list index out of range

In [None]:
import os
import dotenv
import psycopg2
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer,AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer


dotenv.load_dotenv()

DBNAME=os.getenv("DBNAME", "wiki_db")
HOST=os.getenv("HOST", "124.158.12.207")
PORT=os.getenv("PORT", "15433")
USER=os.getenv("USER", "gradlab")
PWD=os.getenv("PASSWORD", "baldarg")
# TB_CLIENT=os.getenv("TB_CLIENT","client_tb")
TB_WIKI=os.getenv("TB_WIKI", "wiki_tb")
# MSD_WIKI = bool(os.getenv("MSD_WIKI", False))

#TODO: Use this function

def query_embd(embd, limit_doc=3, ):
    embd = str(list(embd.cpu().detach().numpy().reshape(-1)))
    try:
        connection = psycopg2.connect(dbname=DBNAME,host=HOST,port=PORT,user=USER,password=PWD)
        cursor = connection.cursor()
        aemb_sql = f"""
                        SET LOCAL ivfflat.probes = 3;
                        SELECT content 
                        FROM {TB_WIKI}
                        ORDER BY embedd <#> %s LIMIT %s;
                    """
        cursor.execute(aemb_sql,(embd, limit_doc))
        connection.commit()
        rows = cursor.fetchall()

        if connection: 
            cursor.close()
            connection.close()
        
        return rows
        
    except (Exception, psycopg2.Error) as error: 
        print("Failed query record from database {}".format(error))

def load_model_qs(
    pretrain_name="vblagoje/dpr-question_encoder-single-lfqa-wiki", 
    device = torch.device("cuda:0")
    ):
    qs_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(pretrain_name)
    qs_model = DPRQuestionEncoder.from_pretrained(pretrain_name)
    qs_model.to(device)
    
    return qs_model, qs_tokenizer

def get_embds_qs(model, tokenizer, text, device):
    # Tokenize sentences
    model.eval()
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    return model_output['pooler_output']

def make_qa_s2s_model(model_name="facebook/bart-base", from_file=None, device="cuda:0"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    if from_file is not None:
        param_dict = torch.load(from_file)  # has model weights, optimizer, and scheduler states
        model.load_state_dict(param_dict["model"])
    return tokenizer, model

def qa_s2s_generate(question_doc, qa_s2s_model, qa_s2s_tokenizer, num_answers=1, num_beams=None,
                    min_len=64, max_len=256, do_sample=False,temp=1.0, top_p=None, top_k=None,
                    max_input_length=512, device="cuda:0"):
    
    model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, 
                                       max_input_length, device=device)
    
    n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
    model = qa_s2s_model.module if hasattr(qa_s2s_model, 'module') else qa_s2s_model 
    generated_ids = model.generate( input_ids=model_inputs["input_ids"],
                                           attention_mask=model_inputs["attention_mask"],
                                           min_length=min_len,max_length=max_len,
                                           do_sample=do_sample, early_stopping=True,
                                           num_beams=1 if do_sample else n_beams,
                                           temperature=temp,top_k=top_k,top_p=top_p,
                                           eos_token_id=qa_s2s_tokenizer.eos_token_id,
                                           no_repeat_ngram_size=3,
                                           num_return_sequences=num_answers,
                                           decoder_start_token_id=qa_s2s_tokenizer.bos_token_id)
    return [qa_s2s_tokenizer.decode(ans_ids, skip_special_tokens=True).strip() for ans_ids in generated_ids]

def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"):
    q_ls = [q for q, a in qa_list]
    a_ls = [a for q, a in qa_list]
    q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True)
    q_ids, q_mask = (torch.LongTensor(q_toks["input_ids"]).to(device),
                     torch.LongTensor(q_toks["attention_mask"]).to(device))
    a_toks = tokenizer.batch_encode_plus(a_ls, max_length=min(max_len, max_a_len), pad_to_max_length=True)
    a_ids, a_mask = (torch.LongTensor(a_toks["input_ids"]).to(device),
                     torch.LongTensor(a_toks["attention_mask"]).to(device))
    labels = a_ids[:, 1:].contiguous().clone()
    labels[a_mask[:, 1:].contiguous() == 0] = -100
    model_inputs = {"input_ids": q_ids,
                    "attention_mask": q_mask,
                    "decoder_input_ids": a_ids[:, :-1].contiguous(),
                    "labels": labels}
    return model_inputs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
qa_s2s_tokenizer, pre_model = make_qa_s2s_model(model_name="facebook/bart-base",
                                                from_file=r"D:\Work\Baseline_V1\src\Model\bart-base_model.pth",
                                                device="cpu:0")

In [3]:
def retrieve(question, qs_model, qs_tokenizer, device, limit_doc=10):
    question_embd = get_embds_qs(qs_model, qs_tokenizer, question, device=device)
    documents_wiki = query_embd(question_embd, limit_doc=limit_doc)
    return [doc[-1] for doc in documents_wiki]


In [4]:
while(1):
    question = input("\nUSER:")
    if question == "[EXIT]":
        break
    else:
        doc_5 = retrieve(question, pre_model, qa_s2s_tokenizer, "cuda:0", limit_doc=5)
        doc = "<P> " + " <P> ".join([p for p in doc_5])
        question_doc = "question: {} context: {}".format(question, doc)

        # generate an answer with beam search
        answer = qa_s2s_generate(
                question_doc, pre_model, qa_s2s_tokenizer,
                num_answers=1,
                num_beams=6,
                min_len=3,
                max_len=100,
                max_input_length=1024,
                device="cuda:0")[0]
        
        print("\nBOT:", answer.replace("\n", ""))

In [None]:
!tensorboard --logdir=runs