# **QA User Function**

In [1]:
import time
import re
import pickle

import math
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords as nltk_stopwords

import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from sentence_transformers import SentenceTransformer

2023-10-13 14:45:07.622747: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### QA Function v.1 - initial

In [10]:
def question_answer():
    start_time = time.time()
    
    # Take in user question
    posed_question = input('Question:')
    
    # Load data frame
    df_QA = pd.read_csv('/Users/kellyshreeve/Desktop/Data-Sets/Externship/qa_merged_clean.csv')
    
    # Load question embeddings
    file = open('/Users/kellyshreeve/desktop/embeddings', 'rb')
    ques_embeddings = pickle.load(file)
    
    # Initiate Sentence Model
    sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

    # Get new question embeddings
    new_question_embeddings = sent_model.encode(posed_question)

    # Find most similar question index
    similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

    best_index = np.argmax(similarity_scores)
    
    # Extract similar question and answer text from df
    best_question = df_QA.loc[best_index, 'body_with_sentences_q']
    best_answer = df_QA.loc[best_index, 'body_with_sentences_a']
    
    end_time = time.time()
    
    computation_time = end_time - start_time
    
    # Print results
    print(f'Posed Question: {posed_question}')
    print(f'Similar Question: {best_question}')
    print(f'Similar Answer: {best_answer}')
    print(f'Embeddings Shape: {ques_embeddings.shape}')
    print(f'Computation Time: {computation_time}')

In [11]:
question_answer()

KeyboardInterrupt: 

### QA Function v.2 - faster

Pull data, embedding, and model load out of function.

In [None]:
# Load all data and sentence model
start = time.time()

# Load data frame
df_QA = pd.read_csv('/Users/kellyshreeve/Desktop/Data-Sets/Externship/qa_merged_clean.csv')
    
# Load question embeddings
file = open('/Users/kellyshreeve/desktop/ques_embeddings', 'rb')
ques_embeddings = pickle.load(file)

# Initiate Sentence Model
sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

end = time.time()

print(f'Compuation Time: {end - start}')

Compuation Time: 45.44052767753601


In [None]:
# Function to get user input and embeddings and return
# similar Q/A. Does not load data or embeddings.

def question_answer_v2():
    start_time = time.time()
    
    # Take in user question
    posed_question = input('Question:')

    # Get new question embeddings
    new_question_embeddings = sent_model.encode(posed_question)

    # Find most similar question index
    similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

    best_index = np.argmax(similarity_scores)
    
    # Extract similar question and answer text from df
    best_question = df_QA.loc[best_index, 'body_with_sentences_q']
    best_answer = df_QA.loc[best_index, 'body_with_sentences_a']
    
    end_time = time.time()
    
    computation_time = end_time - start_time
    
    # Print results
    print(f'Posed Question: {posed_question}')
    print()
    print(f'Similar Question: {best_question}')
    print()
    print(f'Similar Answer: {best_answer}')
    print()
    print(f'Embeddings Shape: {ques_embeddings.shape}')
    print()
    print(f'Computation Time: {computation_time}')

In [None]:
question_answer()

Posed Question: What is django?

Similar Question: how do i go about specifying and using an enum in a django model

Similar Answer: from the https docs.djangoproject.com en dev ref models fields django.db.models.field.choices rel nofollow django documentation maybechoice 'y' 'yes' 'n' 'no' 'u' 'unknown' and you define a charfield in your model married models.charfield max_len h choices maybechoice you can do the same with integer fields if you don't like to have letters in your db. in that case rewrite your choices maybechoice 'yes' 'no' 'unknown'

Embeddings Shape: (10001, 768)

Computation Time: 4.076124906539917


In [None]:
question_answer()

Posed Question: How to add a column in Pandas?

Similar Question: how do you change the size of figure drawn with matplotlib

Similar Answer: the following seems to work from pylab import rcparams rcparams['figure.figsize'] this makes the figure's width inches and its height b inches b . the figure class then uses this as the default value for one of its arguments.

Embeddings Shape: (10001, 768)

Computation Time: 12.959703922271729


In [None]:
question_answer()

Posed Question: How to find a full path to a font?

Similar Question: does anyone know how to do this i need to add a header of the form value value

Similar Answer: as the question is phrased it's hard to guess what the intention or even the intended semantics is. for setting headers try the following import soappy headers soappy.types.headertype headers.value value or [...] headers.foo value headers.bar value

Embeddings Shape: (10001, 768)

Computation Time: 7.04338002204895


In [None]:
question_answer()

Posed Question: How to find a full path to a font in photoshop javascript?

Similar Question: is there any python module for rendering a html page with javascript and get back a dom object i want to parse a page which generates almost all of its content using javascript.

Similar Answer: only way i know to accomplish this would be to drive real browser for example using http selenium rc.openqa.org rel nofollow selenium rc .

Embeddings Shape: (10001, 768)

Computation Time: 14.047891855239868


### QA Function v.3 - normalize question text

In [2]:
# Load all data and sentence model
start = time.time()

# Load data frame
df_QA = pd.read_csv('/Users/kellyshreeve/Desktop/Data-Sets/Externship/qa_merged_clean.csv')
    
# Load question embeddings
file = open('/Users/kellyshreeve/desktop/ques_embeddings', 'rb')
ques_embeddings = pickle.load(file)

# Initiate Sentence Model
sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

end = time.time()

print(f'Compuation Time: {end - start}')

Compuation Time: 42.505502223968506


In [3]:
def normalize_with_sentences(text):
    text = text.lower()
    text = text.replace('<p>', ' ')
    text = text.replace('</p>', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('<a', ' ')
    text = text.replace('</a>', ' ')
    text = text.replace('href=', ' ')
    text = text.replace('</code', ' ')
    text = text.replace('</pre>', ' ')
    text = text.replace('<code>', ' ')
    text = text.replace('jpeg', ' ')
    text = text.replace('jpg', ' ')
    text = text.replace('pre', ' ')
    text = text.replace('pdf', ' ')
    text = text.replace('gt', ' ')
    text = re.sub(r"[^a-zA-z'.]", ' ', text)
    text = text.split()
    text = " ".join(text)
    
    return text

In [4]:
# Function to get user input and embeddings and return
# similar Q/A. Does not load data or embeddings.
# Normalizes question text

def question_answer_v3():
    start_time = time.time()
    
    # Take in user question
    posed_question = input('Question:')
    
    # Normalize question
    posed_quesiton = normalize_with_sentences(posed_question)

    # Get new question embeddings
    new_question_embeddings = sent_model.encode(posed_question)

    # Find most similar question index
    similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

    best_index = np.argmax(similarity_scores)
    
    # Extract similar question and answer text from df
    best_question = df_QA.loc[best_index, 'body_with_sentences_q']
    best_answer = df_QA.loc[best_index, 'body_with_sentences_a']
    
    end_time = time.time()
    
    computation_time = end_time - start_time
    
    # Print results
    print(f'Posed Question: {posed_question}')
    print()
    print(f'Similar Question: {best_question}')
    print()
    print(f'Similar Answer: {best_answer}')
    print()
    print(f'Embeddings Shape: {ques_embeddings.shape}')
    print()
    print(f'Computation Time: {computation_time}')

In [8]:
question_answer_v3()

Posed Question: What is python?

Similar Question: how do you create a weak reference to an object in python

Similar Answer: import weakref class object ... pass ... o object r weakref.ref o if the reference is still active r will be o otherwise none do_something_with_o r see the http docs.python.org lib module weakref.html wearkref module docs for more details. you can also use weakref.proxy to create an object that proxies o. will throw referenceerror if used when the referent is no longer referenced.

Embeddings Shape: (10001, 768)

Computation Time: 5.623437166213989


In [5]:
question_answer_v3()

Posed Question: What is pandas?

Similar Question: how do i turn a python program into an .egg file

Similar Answer: http peak.telecommunity.com devcenter setuptools setuptools is the software that creates http peak.telecommunity.com devcenter pythoneggs .egg files . it's an extension of the http docs.python.org lib module distutils.html distutils package in the standard library. the process involves creating a setup.py file then python setup.py bdist_egg creates an .egg package.

Embeddings Shape: (10001, 768)

Computation Time: 9.889105796813965


In [73]:
question_answer()

Posed Question: What is Django?

Similar Question: how do i go about specifying and using an enum in a django model

Similar Answer: from the https docs.djangoproject.com en dev ref models fields django.db.models.field.choices rel nofollow django documentation maybechoice 'y' 'yes' 'n' 'no' 'u' 'unknown' and you define a charfield in your model married models.charfield max_len h choices maybechoice you can do the same with integer fields if you don't like to have letters in your db. in that case rewrite your choices maybechoice 'yes' 'no' 'unknown'

Embeddings Shape: (10001, 768)

Computation Time: 8.851870059967041


In [74]:
question_answer()

Posed Question: How to find the full path to a font?

Similar Question: does anyone know how to do this i need to add a header of the form value value

Similar Answer: as the question is phrased it's hard to guess what the intention or even the intended semantics is. for setting headers try the following import soappy headers soappy.types.headertype headers.value value or [...] headers.foo value headers.bar value

Embeddings Shape: (10001, 768)

Computation Time: 8.934815883636475


In [55]:
# Find top 5
x = np.array([2, 5, 9, 3, 10, 7, 2, 11])

# Top 4
print(np.argpartition(x, -4))
print(x[np.argpartition(x, -4)][-4:])

# Sorted Top 4
print(sorted(x[np.argpartition(x, -4)][-4:]))

# Smallest 5
print(np.argpartition(x, 5)[:5])

[6 0 3 1 5 4 2 7]
[ 7 10  9 11]
[7, 9, 10, 11]
[6 0 3 1 5]
