# **Question Answer Models**

## **Import Packages and Data**

In [1]:
import time
from tqdm.auto import tqdm
import pickle

import math
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords as nltk_stopwords

import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from sentence_transformers import SentenceTransformer

2023-10-02 09:59:02.862623: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df_QA = pd.read_csv('/Users/kellyshreeve/Desktop/Data-Sets/Externship/qa_merged_clean.csv',
                    parse_dates=True)

In [3]:
df_QA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987122 entries, 0 to 987121
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             987122 non-null  int64  
 1   id_q                   987122 non-null  int64  
 2   owner_user_id_q        973748 non-null  float64
 3   creation_date_q        987122 non-null  object 
 4   score_q                987122 non-null  int64  
 5   title                  987122 non-null  object 
 6   body_q                 987122 non-null  object 
 7   body_normalized_q      987120 non-null  object 
 8   title_normalized       987122 non-null  object 
 9   body_with_sentences_q  987122 non-null  object 
 10  title_with_sentences   987122 non-null  object 
 11  creation_year_q        987122 non-null  int64  
 12  id_a                   987122 non-null  float64
 13  owner_user_id_a        981755 non-null  float64
 14  creation_date_a        987122 non-nu

In [4]:
print(df_QA.isna().sum())

Unnamed: 0                   0
id_q                         0
owner_user_id_q          13374
creation_date_q              0
score_q                      0
title                        0
body_q                       0
body_normalized_q            2
title_normalized             0
body_with_sentences_q        0
title_with_sentences         0
creation_year_q              0
id_a                         0
owner_user_id_a           5367
creation_date_a              0
parent_id                    0
score_a                      0
body_a                       0
body_normalized_a            7
body_with_sentences_a        5
creation_year_a              0
answer_length                0
question_length              0
dtype: int64


In [5]:
df_QA=df_QA.reset_index(drop=True)

## **QA Models**

### Subset data for questions with answers and scores above 0

In [6]:
# Subset for Q & A with positive scores
df_QA = df_QA[(df_QA['score_a'] >= 0) & (df_QA['score_q'] >= 0)]

print('Answer Score Descriptives')
print(df_QA['score_a'].describe())
print()
print('Question Score Descriptives')
print(df_QA['score_q'].describe())

Answer Score Descriptives
count    913099.000000
mean          3.239337
std          22.090040
min           0.000000
25%           0.000000
50%           1.000000
75%           3.000000
max        8384.000000
Name: score_a, dtype: float64

Question Score Descriptives
count    913099.000000
mean          7.769584
std          65.424836
min           0.000000
25%           0.000000
50%           1.000000
75%           3.000000
max        5524.000000
Name: score_q, dtype: float64


All question and answer scores now have a minimum of 0.

There are 913,099 remaining Q/A pairs where both question and answer have scores > 0 and every question has at least one answer.

### BERT

In [7]:
bert_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Model with specific answer as context
result: got an answer!

In [8]:
# Choose row 10
display(df_QA.loc[10])

Unnamed: 0                                                              10
id_q                                                                   535
owner_user_id_q                                                      154.0
creation_date_q                                  2008-08-02 18:43:54+00:00
score_q                                                                 40
title                    Continuous Integration System for a Python Cod...
body_q                   <p>I'm starting work on a hobby project with a...
body_normalized_q        i'm starting work on a hobby project with a py...
title_normalized         continuous integration system for a python cod...
body_with_sentences_q    i'm starting work on a hobby project with a py...
title_with_sentences     continuous integration system for a python cod...
creation_year_q                                                       2008
id_a                                                               61746.0
owner_user_id_a          

In [9]:
# Ask a relevant question for row 10
question = 'how to do hobby project?'
answer_text = df_QA.loc[10, 'body_a']

In [11]:
# BERT QA with row 10
start_time = time.time()

input_ids = bert_tokenizer.encode(question, answer_text)

attention_mask = [1] * len(input_ids)

output = bert_model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))

start_index = torch.argmax(output[0][0, :len(input_ids) - input_ids.index(bert_tokenizer.sep_token_id)])
end_index = torch.argmax(output[1][0, :len(input_ids) - input_ids.index(bert_tokenizer.sep_token_id)])

answer = bert_tokenizer.decode(input_ids[start_index:end_index + 1], skip_special_tokens=True)

end_time = time.time()

computation_time = end_time - start_time

print(f'Answer: {answer}')
print(f'Computation Time: {computation_time:.2f} seconds')

Answer: build it yourself
Computation Time: 4.39 seconds


#### Find similar questions

Get embedding for first 10,000 Qs

In [3]:
df_QA = df_QA.reset_index(drop=True)

In [4]:
# Get embeddings for data set questions
start_time = time.time()

questions = df_QA.loc[0:10000, 'body_with_sentences_q']

sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

ques_embeddings = sent_model.encode(questions)

end_time = time.time()

computation_time = end_time - start_time

print(f'Question Embeddings Shape: {ques_embeddings.shape}')
print(f'Computation Time: {computation_time:.2f} seconds')

Question Embeddings Shape: (10001, 768)
Computation Time: 1181.21 seconds


In [5]:
with open('/Users/kellyshreeve/Desktop/ques_embeddings', 'wb') as file:
    pickle.dump(ques_embeddings, file)

In [6]:
file = open('/Users/kellyshreeve/desktop/ques_embeddings', 'rb')
pickled_embeddings = pickle.load(file)

In [7]:
pickled_embeddings.shape

(10001, 768)

Find similar questions with cosine similarity

In [None]:
# Use cosine distance to find similar questions
start_time = time.time()

new_question = 'What is pandas?'

model = SentenceTransformer('bert-base-nli-mean-tokens')

new_question_embeddings = model.encode(new_question)

similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

best_index = np.argmin(similarity_scores)

end_time = time.time()

computation_time = end_time - start_time

best_question = df_QA.loc[best_index, 'body_with_sentences_q']
best_answer = df_QA.loc[best_index, 'body_with_sentences_a']

print('Question Posed:')
print(new_question)
print()
print('Question:')
print()
print(best_question)
print()
print('Answer:')
print()
print(best_answer)
print()
print(f'Best Index: {best_index}')
print()
print(f'Computation Time: {computation_time:.2f} seconds')

Question:

so when playing with the development i can just set settings.debug to true and if an error occures i can see it nicely formatted with good stack trace and request information. but on kind of production site i'd rather use debug false and show visitors some standard error page with information that i'm working on fixing this bug at this moment br at the same time i'd like to have some way of logging all those information stack trace and request info to a file on my server so i can just output it to my console and watch errors scroll email the log to me every hour or something like this. what logging solutions would you recomend for a django site that would meet those simple requirements i have the application running as fcgi server and i'm using apache web server as frontend although thinking of going to lighttpd .

Answer:

well when debug false django will automatically mail a full traceback of any error to each person listed in the admins setting which gets you notificatio

Find answer to question from answer to similar question

In [None]:
# Find answer from most relevant question
start_time = time.time()

new_question = 'What is pandas?'

input_ids = bert_tokenizer.encode(new_question, df_QA.loc[5822, 'body_a'])

attention_mask = [1] * len(input_ids)

output = bert_model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))

start_index = torch.argmax(output[0][0, :len(input_ids) - input_ids.index(bert_tokenizer.sep_token_id)])
end_index = torch.argmax(output[1][0, :len(input_ids) - input_ids.index(bert_tokenizer.sep_token_id)])

answer = bert_tokenizer.decode(input_ids[start_index:end_index + 1], skip_special_tokens=True)

end_time = time.time()

computation_time = end_time - start_time

print(f'Question: {new_question}')
print()
print(f'Answer: {answer}')
print()
print(f'Computation Time: {computation_time:.2f} seconds')

Answer: 

Computation Time: 1.97 seconds


BERT did not find an answer in the given answer text.

Check that BERT QA works for a more relevant question

In [None]:
# Find an answer from a more relevant question
start_time = time.time()

relevant_question = 'How to use django?'

input_ids = bert_tokenizer.encode(relevant_question, df_QA.loc[5822, 'body_a'])

attention_mask = [1] * len(input_ids)

output = bert_model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))

start_index = torch.argmax(output[0][0, :len(input_ids) - input_ids.index(bert_tokenizer.sep_token_id)])
end_index = torch.argmax(output[1][0, :len(input_ids) - input_ids.index(bert_tokenizer.sep_token_id)])

answer = bert_tokenizer.decode(input_ids[start_index:end_index + 1], skip_special_tokens=True)

end_time = time.time()

computation_time = end_time - start_time

print(f'Question: {relevant_question}')
print(f'Answer: {answer}')
print()
print(f'Computation Time: {computation_time:.2f} seconds')

Answer: django will automatically mail a full traceback of any error to each person listed in the < code > admins < / code > setting

Computation Time: 2.19 seconds


While not a great answer, BERT did find an answer in the answer text.

In [None]:
# Similar question function
def find_similar_question(question, df, question_column):
    new_question_embeddings = model.encode(new_question)

    similarity_scores = cosine_similarity([new_question_embeddings],
                                        ques_embeddings)

    best_index = np.argmin(similarity_scores)

    print(f'Posed Question: {question}')
    print(f'Most Similar Question: {df.loc[best_index, question_column]})

# Try new questions
find_similar_question('What is python?', df_QA, 'body_with_sentences_q')

### Find similar answers

Get embeddings for first 10,000 answers

In [None]:
# Get embeddings for data set answers
start_time = time.time()

sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')

answers = df_QA.loc[0:10000, 'body_with_sentences_a']

answer_embeddings = sentence_model.encode(answers)

end_time = time.time()

computation_time = end_time - start_time

print(f'Question Embeddings Shape: {ques_embeddings.shape}')
print(f'Computation Time: {computation_time:.2f} seconds')

Question Embeddings Shape: (10001, 768)
Computation Time: 1125.99 seconds


Find answers similar to question

In [None]:
# Use cosine distance to find answer similar to question
start_time = time.time()

new_question = 'What is pandas?'

model = SentenceTransformer('bert-base-nli-mean-tokens')

new_question_embeddings = sentence_model.encode(new_question)

similarity_scores = cosine_similarity([new_question_embeddings],
                                       answer_embeddings)

best_index = np.argmin(similarity_scores)

end_time = time.time()

computation_time = end_time - start_time

best_question = df_QA.loc[best_index, 'body_with_sentences_q']
best_answer = df_QA.loc[best_index, 'body_with_sentences_a']

print('Question Posed:')
print()
print(new_question)
print()
print('Question:')
print()
print(best_question)
print()
print('Answer:')
print()
print(best_answer)
print()
print(f'Best Index: {best_index}')
print()
print(f'Computation Time: {computation_time:.2f} seconds')

Question Posed:

What is pandas?

Question:

i recently discovered the notify extension in mercurial which allows me quickly send out emails whenever i push changes but i'm tty sure i'm still missing out on a lot of functionality which could make my life a lot easier. ul li notify extension http www.selenic.com mercurial wiki index.cgi notifyextension rel nofollow http www.selenic.com mercurial wiki index.cgi notifyextension li ul which mercurial hook or combination of interoperating hooks is the most useful for working in a loosely connected team please add links to non standard parts you use and or add the hook or a description how to set it up so others can easily use it.

Answer:

i really enjoy what i did with my custom hook. i have it post a message to my campfire account campfire is a group based app . it worked out really well. because i had my clients in there and it could show him my progress.

Best Index: 1419

Computation Time: 1.18 seconds


Also not a relevant answer. Difficult to know if it would work better with more embeddings.

### Question Answer Function

In [None]:
def question_answer():
    # Text normaliztion
    def normalize_text(text):
        text = text.lower()
        text = text.replace('<p>', ' ')
        text = text.replace('</p>', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('<a', ' ')
        text = text.replace('</a>', ' ')
        text = text.replace('href=', ' ')
        text = text.replace('</code', ' ')
        text = text.replace('</pre>', ' ')
        text = text.replace('<code>', ' ')
        text = text.replace('jpeg', ' ')
        text = text.replace('jpg', ' ')
        text = text.replace('pre', ' ')
        text = text.replace('pdf', ' ')
        text = re.sub(r"[^a-zA-z']", ' ', text)
        text = text.split()
        text = " ".join(text)
    
        return text
    
    def bert_qa(text):
        # ! DEFINE BERT QA FUNCTION
        text
        
        return text
    
    # User input question 
    question = input('Question:')
    
    # Normalize Question
    question_norm = normalize_text(question)
    
    # Use bert model to answer question
    answer = bert_qa(question_norm)
    
    # Print answer
    print(f'Normalized quesiton: {answer}')
    
    # User vote
    user_vote = 0
    
    vote = input('Was this question helpful? (y/n)')
    
    # ! ADD VOTE TO DATAFRAME COLUMN
    if vote == 'y':
        user_vote += 1
    elif vote =='n':
        user_vote -= 1
    else:
        vote = input('Please choose on of these options (y/n):')
        
    return user_vote