# **QA User Function**

In [1]:
import time
import re
import pickle

import math
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords as nltk_stopwords

import torch
from transformers import BertForQuestionAnswering, BertTokenizer
from sentence_transformers import SentenceTransformer

2023-10-03 13:57:08.226969: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### QA Function v.1 - initial

In [8]:
def question_answer():
    start_time = time.time()
    
    # Take in user question
    posed_question = input('Question:')
    
    # Load data frame
    df_QA = pd.read_csv('/Users/kellyshreeve/Desktop/Data-Sets/Externship/qa_merged_clean.csv')
    
    # Load question embeddings
    file = open('/Users/kellyshreeve/desktop/embeddings', 'rb')
    ques_embeddings = pickle.load(file)
    
    # Initiate Sentence Model
    sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

    # Get new question embeddings
    new_question_embeddings = sent_model.encode(posed_question)

    # Find most similar question index
    similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

    best_index = np.argmax(similarity_scores)
    
    # Extract similar question and answer text from df
    best_question = df_QA.loc[best_index, 'body_with_sentences_q']
    best_answer = df_QA.loc[best_index, 'body_with_sentences_a']
    
    end_time = time.time()
    
    computation_time = end_time - start_time
    
    # Print results
    print(f'Posed Question: {posed_question}')
    print(f'Similar Question: {best_question}')
    print(f'Similar Answer: {best_answer}')
    print(f'Embeddings Shape: {ques_embeddings.shape}')
    print(f'Computation Time: {computation_time}')

In [9]:
question_answer()

Posed Question: What is django?
Similar Question: i'm starting work on a hobby project with a python codebase and would like to set up some form of continuous integration i.e. running a battery of test cases each time a check in is made and sending nag e mails to responsible persons when the tests fail similar to cruisecontrol or teamcity. i realize i could do this with hooks in most vcses but that requires that the tests run on the same machine as the version control server which isn't as elegant as i would like. does anyone have any suggestions for a small user friendly open source continuous integration system suitable for a python codebase
Similar Answer: one possibility is hudson. it's written in java but there's integration with python projects blockquote http redsolo.blogspot.com hudson embraces python.html rel nofollow hudson embraces python blockquote i've never tried it myself however. strong update strong sept. after a trademark dispute hudson has been renamed to http jenkin

### QA Function v.2 - faster

Pull data, embedding, and model load out of function.

In [2]:
# Load all data and sentence model
start = time.time()

# Load data frame
df_QA = pd.read_csv('/Users/kellyshreeve/Desktop/Data-Sets/Externship/qa_merged_clean.csv')
    
# Load question embeddings
file = open('/Users/kellyshreeve/desktop/ques_embeddings', 'rb')
ques_embeddings = pickle.load(file)

# Initiate Sentence Model
sent_model = SentenceTransformer('bert-base-nli-mean-tokens')

end = time.time()

print(f'Compuation Time: {end - start}')

Compuation Time: 45.44052767753601


In [3]:
# Function to get user input and embeddings and return
# similar Q/A. Does not load data or embeddings.

def question_answer_v2():
    start_time = time.time()
    
    # Take in user question
    posed_question = input('Question:')

    # Get new question embeddings
    new_question_embeddings = sent_model.encode(posed_question)

    # Find most similar question index
    similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

    best_index = np.argmax(similarity_scores)
    
    # Extract similar question and answer text from df
    best_question = df_QA.loc[best_index, 'body_with_sentences_q']
    best_answer = df_QA.loc[best_index, 'body_with_sentences_a']
    
    end_time = time.time()
    
    computation_time = end_time - start_time
    
    # Print results
    print(f'Posed Question: {posed_question}')
    print()
    print(f'Similar Question: {best_question}')
    print()
    print(f'Similar Answer: {best_answer}')
    print()
    print(f'Embeddings Shape: {ques_embeddings.shape}')
    print()
    print(f'Computation Time: {computation_time}')

In [65]:
question_answer()

Posed Question: What is django?

Similar Question: how do i go about specifying and using an enum in a django model

Similar Answer: from the https docs.djangoproject.com en dev ref models fields django.db.models.field.choices rel nofollow django documentation maybechoice 'y' 'yes' 'n' 'no' 'u' 'unknown' and you define a charfield in your model married models.charfield max_len h choices maybechoice you can do the same with integer fields if you don't like to have letters in your db. in that case rewrite your choices maybechoice 'yes' 'no' 'unknown'

Embeddings Shape: (10001, 768)

Computation Time: 4.076124906539917


In [66]:
question_answer()

Posed Question: How to add a column in Pandas?

Similar Question: how do you change the size of figure drawn with matplotlib

Similar Answer: the following seems to work from pylab import rcparams rcparams['figure.figsize'] this makes the figure's width inches and its height b inches b . the figure class then uses this as the default value for one of its arguments.

Embeddings Shape: (10001, 768)

Computation Time: 12.959703922271729


In [21]:
display(df_QA.head(20))

Unnamed: 0.1,Unnamed: 0,id_q,owner_user_id_q,creation_date_q,score_q,title,body_q,body_normalized_q,title_normalized,body_with_sentences_q,...,owner_user_id_a,creation_date_a,parent_id,score_a,body_a,body_normalized_a,body_with_sentences_a,creation_year_a,answer_length,question_length
0,0,469,147.0,2008-08-02 15:11:16+00:00,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,i am using the photoshop's javascript api to f...,how can i find the full path to a font from it...,i am using the photoshop's javascript api to f...,...,50.0,2008-08-02 16:56:53+00:00,469.0,4.0,<p>open up a terminal (Applications-&gt;Utilit...,open up a terminal applications utilities term...,open up a terminal applications utilities term...,2008.0,34,83
1,1,469,147.0,2008-08-02 15:11:16+00:00,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,i am using the photoshop's javascript api to f...,how can i find the full path to a font from it...,i am using the photoshop's javascript api to f...,...,153.0,2008-08-02 17:42:28+00:00,469.0,2.0,<p>I haven't been able to find anything that d...,i haven't been able to find anything that does...,i haven't been able to find anything that does...,2008.0,43,83
2,2,469,147.0,2008-08-02 15:11:16+00:00,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,i am using the photoshop's javascript api to f...,how can i find the full path to a font from it...,i am using the photoshop's javascript api to f...,...,457.0,2008-08-06 03:01:23+00:00,469.0,12.0,<p>Unfortunately the only API that isn't depre...,unfortunately the only api that isn't de cated...,unfortunately the only api that isn't de cated...,2008.0,60,83
3,3,469,147.0,2008-08-02 15:11:16+00:00,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,i am using the photoshop's javascript api to f...,how can i find the full path to a font from it...,i am using the photoshop's javascript api to f...,...,745.0,2008-10-12 07:02:40+00:00,469.0,1.0,<p>There must be a method in Cocoa to get a li...,there must be a method in cocoa to get a list ...,there must be a method in cocoa to get a list ...,2008.0,80,83
4,4,502,147.0,2008-08-02 17:01:58+00:00,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,i have a cross platform python application whi...,get a view of a on windows,i have a cross platform python application whi...,...,161.0,2008-08-02 18:49:07+00:00,502.0,9.0,<p>You can use ImageMagick's convert utility f...,you can use imagemagick's convert utility for ...,you can use imagemagick's convert utility for ...,2008.0,108,47
5,5,502,147.0,2008-08-02 17:01:58+00:00,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,i have a cross platform python application whi...,get a view of a on windows,i have a cross platform python application whi...,...,878.0,2008-08-10 07:10:19+00:00,502.0,2.0,<p>Is the PC likely to have Acrobat installed?...,is the pc likely to have acrobat installed i t...,is the pc likely to have acrobat installed i t...,2008.0,80,47
6,6,502,147.0,2008-08-02 17:01:58+00:00,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,i have a cross platform python application whi...,get a view of a on windows,i have a cross platform python application whi...,...,13.0,2008-08-10 08:08:33+00:00,502.0,25.0,<p>ImageMagick delegates the PDF->bitmap conve...,imagemagick delegates the bitmap conversion to...,imagemagick delegates the bitmap conversion to...,2008.0,173,47
7,7,535,154.0,2008-08-02 18:43:54+00:00,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,i'm starting work on a hobby project with a py...,continuous integration system for a python cod...,i'm starting work on a hobby project with a py...,...,156.0,2008-08-02 18:56:56+00:00,535.0,23.0,<p>One possibility is Hudson. It's written in...,one possibility is hudson it's written in java...,one possibility is hudson. it's written in jav...,2008.0,55,109
8,8,535,154.0,2008-08-02 18:43:54+00:00,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,i'm starting work on a hobby project with a py...,continuous integration system for a python cod...,i'm starting work on a hobby project with a py...,...,157.0,2008-08-02 19:06:40+00:00,535.0,20.0,"<p>We run <a href=""http://buildbot.net/trac"">B...",we run http buildbot net trac buildbot trac at...,we run http buildbot.net trac buildbot trac at...,2008.0,47,109
9,9,535,154.0,2008-08-02 18:43:54+00:00,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,i'm starting work on a hobby project with a py...,continuous integration system for a python cod...,i'm starting work on a hobby project with a py...,...,197.0,2008-08-03 12:09:18+00:00,535.0,14.0,<p>Second the Buildbot - Trac integration. You...,second the buildbot trac integration you can f...,second the buildbot trac integration. you can ...,2008.0,151,109


In [68]:
question_answer()

Posed Question: How to find a full path to a font?

Similar Question: does anyone know how to do this i need to add a header of the form value value

Similar Answer: as the question is phrased it's hard to guess what the intention or even the intended semantics is. for setting headers try the following import soappy headers soappy.types.headertype headers.value value or [...] headers.foo value headers.bar value

Embeddings Shape: (10001, 768)

Computation Time: 7.04338002204895


In [69]:
question_answer()

Posed Question: How to find a full path to a font in photoshop javascript?

Similar Question: is there any python module for rendering a html page with javascript and get back a dom object i want to parse a page which generates almost all of its content using javascript.

Similar Answer: only way i know to accomplish this would be to drive real browser for example using http selenium rc.openqa.org rel nofollow selenium rc .

Embeddings Shape: (10001, 768)

Computation Time: 14.047891855239868


### QA Function v.3 - normalize question text

In [6]:
def normalize_with_sentences(text):
    text = text.lower()
    text = text.replace('<p>', ' ')
    text = text.replace('</p>', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('<a', ' ')
    text = text.replace('</a>', ' ')
    text = text.replace('href=', ' ')
    text = text.replace('</code', ' ')
    text = text.replace('</pre>', ' ')
    text = text.replace('<code>', ' ')
    text = text.replace('jpeg', ' ')
    text = text.replace('jpg', ' ')
    text = text.replace('pre', ' ')
    text = text.replace('pdf', ' ')
    text = text.replace('gt', ' ')
    text = re.sub(r"[^a-zA-z'.]", ' ', text)
    text = text.split()
    text = " ".join(text)
    
    return text

In [7]:
# Function to get user input and embeddings and return
# similar Q/A. Does not load data or embeddings.
# Normalizes question text

def question_answer_v3():
    start_time = time.time()
    
    # Take in user question
    posed_question = input('Question:')
    
    # Normalize question
    posed_quesiton = normalize_with_sentences(posed_question)

    # Get new question embeddings
    new_question_embeddings = sent_model.encode(posed_question)

    # Find most similar question index
    similarity_scores = cosine_similarity([new_question_embeddings],
                                       ques_embeddings)

    best_index = np.argmax(similarity_scores)
    
    # Extract similar question and answer text from df
    best_question = df_QA.loc[best_index, 'body_with_sentences_q']
    best_answer = df_QA.loc[best_index, 'body_with_sentences_a']
    
    end_time = time.time()
    
    computation_time = end_time - start_time
    
    # Print results
    print(f'Posed Question: {posed_question}')
    print()
    print(f'Similar Question: {best_question}')
    print()
    print(f'Similar Answer: {best_answer}')
    print()
    print(f'Embeddings Shape: {ques_embeddings.shape}')
    print()
    print(f'Computation Time: {computation_time}')

In [8]:
question_answer_v3()

Posed Question: What is python?

Similar Question: how do you create a weak reference to an object in python

Similar Answer: import weakref class object ... pass ... o object r weakref.ref o if the reference is still active r will be o otherwise none do_something_with_o r see the http docs.python.org lib module weakref.html wearkref module docs for more details. you can also use weakref.proxy to create an object that proxies o. will throw referenceerror if used when the referent is no longer referenced.

Embeddings Shape: (10001, 768)

Computation Time: 5.623437166213989


In [73]:
question_answer()

Posed Question: What is Django?

Similar Question: how do i go about specifying and using an enum in a django model

Similar Answer: from the https docs.djangoproject.com en dev ref models fields django.db.models.field.choices rel nofollow django documentation maybechoice 'y' 'yes' 'n' 'no' 'u' 'unknown' and you define a charfield in your model married models.charfield max_len h choices maybechoice you can do the same with integer fields if you don't like to have letters in your db. in that case rewrite your choices maybechoice 'yes' 'no' 'unknown'

Embeddings Shape: (10001, 768)

Computation Time: 8.851870059967041


In [74]:
question_answer()

Posed Question: How to find the full path to a font?

Similar Question: does anyone know how to do this i need to add a header of the form value value

Similar Answer: as the question is phrased it's hard to guess what the intention or even the intended semantics is. for setting headers try the following import soappy headers soappy.types.headertype headers.value value or [...] headers.foo value headers.bar value

Embeddings Shape: (10001, 768)

Computation Time: 8.934815883636475


In [55]:
# Find top 5
x = np.array([2, 5, 9, 3, 10, 7, 2, 11])

# Top 4
print(np.argpartition(x, -4))
print(x[np.argpartition(x, -4)][-4:])

# Sorted Top 4
print(sorted(x[np.argpartition(x, -4)][-4:]))

# Smallest 5
print(np.argpartition(x, 5)[:5])

[6 0 3 1 5 4 2 7]
[ 7 10  9 11]
[7, 9, 10, 11]
[6 0 3 1 5]
