In [26]:
import torch # you sould download the model then if you like you can save it
from transformers import BertForQuestionAnswering,BertTokenizer
# model = BertForQuestionAnswering.from_pretrained  ('bert-large-uncased-whole-word-masking-finetuned-squad') # to download the model
# model.save_pretrained("./BertLSquad/") # to save the model
model = BertForQuestionAnswering.from_pretrained('./BertLSquad/') # to open saved model

KeyboardInterrupt: 

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
question = "How many parameters does BERT-large have?"
answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."

In [4]:
# Apply the tokenizer to the input text, treating them as a text-pair.
input_ids = tokenizer.encode(question, answer_text)
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print('The input has a total of {:} tokens.'.format(len(input_ids)))

The input has a total of 70 tokens.


In [5]:
# Search the input_ids for the first instance of the `[SEP]` token.
sep_index = input_ids.index(tokenizer.sep_token_id)

# The number of segment A tokens includes the [SEP] token istelf.
num_seg_a = sep_index + 1

# The remainder are segment B.
num_seg_b = len(input_ids) - num_seg_a

# Construct the list of 0s and 1s.
segment_ids = [0]*num_seg_a + [1]*num_seg_b

# There should be a segment_id for every input token.
assert len(segment_ids) == len(input_ids)

In [6]:
# Run our example through the model.
outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                             token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                             return_dict=True) 

start_scores = outputs.start_logits
end_scores = outputs.end_logits


Now we can highlight the answer just by looking at the most probable start and end words. 

In [7]:
# Find the tokens with the highest `start` and `end` scores.
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores)

# Combine the tokens in the answer and print it out.
answer = ' '.join(tokens[answer_start:answer_end+1])

print('Answer: "' + answer + '"')

Answer: "340 ##m"


In [8]:
# Start with the first token.
answer = tokens[answer_start]

# Select the remaining answer tokens and join them with whitespace.
for i in range(answer_start + 1, answer_end + 1):
    
    # If it's a subword token, then recombine it with the previous token.
    if tokens[i][0:2] == '##':
        answer += tokens[i][2:]
    
    # Otherwise, add a space then the token.
    else:
        answer += ' ' + tokens[i]

print('Answer: "' + answer + '"')

Answer: "340m"


In [9]:
# Pull the scores out of PyTorch Tensors and convert them to 1D numpy arrays.
s_scores = start_scores.detach().numpy().flatten()
e_scores = end_scores.detach().numpy().flatten()

# We'll use the tokens as the x-axis labels. In order to do that, they all need
# to be unique, so we'll add the token index to the end of each one.
token_labels = []
for (i, token) in enumerate(tokens):
    token_labels.append('{:} - {:>2}'.format(token, i))


Turn the QA process into a function so we can easily try out other examples.

In [10]:
def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token istelf.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example through the model.
    outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True) 

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
        
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    print('Answer: "' + answer + '"')

As our reference text, I've taken the Abstract of the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf).


In [11]:
question = "What does the 'B' in BERT stand for?"
answer_text = """Bidirectional Encoder Representations from Transformers (BERT) is a Transformer-based machine learning technique for natural language processing (NLP) pre-training developed by Google. BERT was created and published in 2018 by Jacob Devlin and his colleagues from Google.[1][2] As of 2019, Google has been leveraging BERT to better understand user searches.[3] The original English-language BERT has two models[1]: (1) the BERTBASE: 12 Encoders with 12 bidirectional self-attention heads, and (2) the BERTLARGE: 24 Encoders with 24 bidirectional self-attention heads. Both models are pre-trained from unlabeled data extracted from the BooksCorpus[4] with 800M words and English Wikipedia with 2,500M words[5]."""
answer_question(question, answer_text)

Query has 189 tokens.

Answer: "bidirectional encoder representations from transformers"


In [12]:
from preprocessing import PDFCorpus
pdf_corpus = PDFCorpus()
pdf_corpus.add_pdf('../data/reports/MGI-The-Age-of-Analytics-Full-report.pdf')

In [13]:
pdf_corpus.get_paragraphs_df()["content"][1710].replace('\n', '').replace('\xa0', ' ')

'Identify root causes for low product yield (e.g., tool-/die-specific issues) in manufacturing'

In [25]:
# pdf_corpus.get_paragraphs_df()
# 30 000 tokens
# INPUT
# Question.. .. .. . . . . . .  .  [sep] text....start....end................
# start - end

In [14]:
question = "Which industries?"
answer_text = pdf_corpus.get_paragraphs_df()["content"][1710].replace('\n', '').replace('\xa0', ' ')
answer_question(question, answer_text)

Query has 29 tokens.

Answer: "manufacturing"


In [15]:
question = "What is main idea?"
answer_text = pdf_corpus.get_paragraphs_df()["content"][1710].replace('\n', '').replace('\xa0', ' ')
answer_question(question, answer_text)

Query has 31 tokens.

Answer: "identify root causes for low product yield"


In [16]:
question = "what kind of application?"
answer_text = pdf_corpus.get_paragraphs_df()["content"][1710].replace('\n', '').replace('\xa0', ' ')
answer_question(question, answer_text)

Query has 31 tokens.

Answer: "manufacturing"


In [17]:
# we can ask one question per one paragraph or per multi paragraphs
paragraphs=5    # how many paragraphs
counter=0 # check relation between answers and our NERs to get meaningfull results
questions=["what is the expectation of artificial intelligence?"]# , "What industries?" , "What is main idea?", "What is the summary?"]
for question in questions:
    for answer_text in pdf_corpus.get_paragraphs_df()["content"]:        
        if paragraphs != counter :
            counter += 1
            print("question = ",question)
            answer_question(question, answer_text)
            print("---------------------------------------")
        else:
            break
# after building a df by answers check df to clear results and
# UI can filter results by sectors or functions

question =  what is the expectation of artificial intelligence?
Query has 23 tokens.

Answer: "competing in a data - driven world"
---------------------------------------
question =  what is the expectation of artificial intelligence?
Query has 13 tokens.

Answer: "december"
---------------------------------------
question =  what is the expectation of artificial intelligence?
Query has 18 tokens.

Answer: "what is the expectation of artificial intelligence ? [SEP]"
---------------------------------------
question =  what is the expectation of artificial intelligence?
Query has 12 tokens.

Answer: "what is the expectation of artificial intelligence ? [SEP]"
---------------------------------------
question =  what is the expectation of artificial intelligence?
Query has 13 tokens.

Answer: "organizational challenges"
---------------------------------------


In [18]:
pdf_corpus.get_paragraphs_df()["content"][1710].replace('\n', '').replace('\xa0', ' ')

'Identify root causes for low product yield (e.g., tool-/die-specific issues) in manufacturing'

In [19]:
# pdf_corpus.get_paragraphs_df()
# pdf_corpus.get_docs_df()
# pdf_corpus.get_sentences_df()
# pdf_corpus.get_corpus_df()