<a href="https://colab.research.google.com/github/howard-haowen/NLP-demos/blob/main/train_FAQ_model_with_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*   Author: [Haowen Jiang](https://howard-haowen.rohan.tw/)

# Download a model

In [None]:
!pip install -q spacy

In [None]:
# Download the "en_core_web_md" pipeline package that spacy.load() reads in a later cell.
!python -m spacy download en_core_web_md

2023-03-06 13:57:20.596574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-06 13:57:23.063323: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-06 13:57:23.063536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-06 13:57:28.600990: E tensorfl

# Load a model

In [None]:
import spacy



In [None]:
# Load the pipeline once; the helper functions defined later in this notebook read this module-level `nlp`.
nlp = spacy.load("en_core_web_md")
# Show the names of the pipeline components as the cell's output.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# Load a dataset

In [None]:
!pip install -q datasets

In [None]:
from datasets import load_dataset

# Load the "web_questions" dataset; the summary printed in the next cell lists its splits and features.
dataset = load_dataset("web_questions")



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['url', 'question', 'answers'],
        num_rows: 3778
    })
    test: Dataset({
        features: ['url', 'question', 'answers'],
        num_rows: 2032
    })
})


In [None]:
def dataset2df(dataset, split='train'):
    """Return one split of ``dataset`` as a question/answer DataFrame.

    Parameters
    ----------
    dataset
        Mapping from split name to an object exposing ``to_pandas()``
        (e.g. the ``DatasetDict`` loaded earlier in this notebook).
    split : str, default 'train'
        Name of the split to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``'question'`` and ``'answers'``.
        Each ``'answers'`` cell is the row's answer strings joined
        with ``", "``.
    """
    qa = dataset[split].to_pandas()[['question', 'answers']]
    # assign() builds a fresh frame instead of writing into a column subset
    # of another frame, which is what triggers SettingWithCopyWarning.
    return qa.assign(answers=qa['answers'].apply(", ".join))

In [None]:
# Build the question/answer table from the default ('train') split.
df = dataset2df(dataset)
# NOTE(review): this displays the frame itself; `df.head()` would keep the output short.
df

Unnamed: 0,question,answers
0,what is the name of justin bieber brother?,"Jazmyn Bieber, Jaxon Bieber"
1,what character did natalie portman play in sta...,Padmé Amidala
2,what state does selena gomez?,New York City
3,what country is the grand bahama island in?,Bahamas
4,what kind of money to take to bahamas?,Bahamian dollar
...,...,...
3773,where did sir donald bradman live?,Adelaide
3774,what are the holydays of obligation in the cat...,"Name day, Saint Patrick's Day, Maundy Thursday..."
3775,what is the name of the broncos mascot?,Miles
3776,what caused the russian financial crisis of 1998?,Allies of World War II


# Convert the dataset to spaCy objects

In [None]:
from spacy.tokens import Doc

# Register a custom `answer` attribute on Doc (accessed later as `doc._.answer`).
# The guard skips registration when the extension already exists, so the cell can be re-run.
if not Doc.has_extension("answer"):
    Doc.set_extension("answer", default=None)

In [None]:
import pandas as pd

def convert_df2docs(df: pd.DataFrame, qcol: str, acol: str) -> list:
    """Parse every question in ``df`` and attach its answer to the Doc.

    Args:
        df: The dataset as a DataFrame.
        qcol: Name of the column holding the question texts.
        acol: Name of the column holding the answers.

    Returns:
        A list with one spaCy Doc per row, in row order. The row's answer
        is stored on each Doc as ``doc._.answer``.

    Relies on the module-level ``nlp`` pipeline and on the ``answer``
    extension having been registered on ``Doc`` beforehand.
    """
    pairs = zip(df[qcol].to_list(), df[acol].to_list())
    texts_with_context = [
        (question, {"answer": answer}) for question, answer in pairs
    ]

    parsed_docs = []
    # as_tuples=True makes nlp.pipe yield (doc, context) pairs, so every
    # answer travels alongside the question it belongs to.
    for parsed, ctx in nlp.pipe(texts_with_context, as_tuples=True):
        parsed._.answer = ctx["answer"]
        parsed_docs.append(parsed)
    return parsed_docs

In [None]:
# Parse every question with spaCy; each resulting Doc carries its answer in `doc._.answer`.
docs = convert_df2docs(df, 'question', 'answers')

In [None]:
# Pick the Doc at index 1 as a sample to inspect in the following cells.
q1 = docs[1]
q1.text

'what character did natalie portman play in star wars?'

In [None]:
q1._.answer

'Padmé Amidala'

In [None]:
q1.vector

array([-1.6279961 ,  2.4959083 , -2.588559  , -2.332141  ,  1.2626406 ,
        1.716149  ,  0.912531  ,  2.4241168 , -0.06311016, -0.28046352,
        2.943069  ,  0.22578308, -2.2698054 ,  1.412488  ,  1.9777391 ,
       -1.195561  , -0.45647997, -0.198733  ,  1.5510001 ,  0.7685981 ,
        1.0977949 ,  0.6495772 , -0.45109797, -2.036414  , -0.17086908,
        0.910155  , -2.427465  , -0.57002395,  1.0054638 ,  2.96036   ,
        0.6996047 , -0.88964194,  0.5209521 , -1.198046  , -0.9824816 ,
       -0.06232102, -1.866366  ,  0.49036592, -0.95985997,  1.875222  ,
       -1.3045189 ,  0.029563  ,  2.12781   , -1.220001  ,  0.64616704,
        2.3855002 , -1.856546  , -3.2135062 ,  0.37537998,  1.863395  ,
       -1.624996  ,  0.365358  ,  1.2175821 , -2.234212  , -1.577785  ,
       -1.3475698 , -0.22315402, -0.28250003,  1.4990139 ,  1.5962719 ,
       -0.9409919 , -1.441222  , -0.31776   , -1.577894  ,  1.3866299 ,
        1.3931592 , -4.06257   , -4.663712  , -1.2520559 ,  2.27

# Create numpy 2D arrays for questions

In [None]:
import numpy as np

def create_2dvectors_from_docs(docs: list) -> np.ndarray:
    """Stack the ``.vector`` of every doc into one 2-D array.

    Args:
        docs: Sequence of objects exposing a 1-D ``.vector`` attribute
            (spaCy Docs in this notebook).

    Returns:
        An array of shape ``(len(docs), vector_dim)``. For an empty
        ``docs`` the result has shape ``(0, 0)`` so that it is still
        two-dimensional and axis-1 reductions remain valid.
    """
    vectors = [doc.vector for doc in docs]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the promised 2-D shape.
        # float32 is assumed to match the vectors' dtype -- TODO confirm.
        return np.empty((0, 0), dtype=np.float32)
    return np.array(vectors)

In [None]:
# Stack the per-question vectors into one matrix and show its (rows, columns) shape.
doc_vectors2D = create_2dvectors_from_docs(docs)
doc_vectors2D.shape

(3778, 300)

# Compute cosine similarity

In [None]:
from numpy.linalg import norm

def find_most_similar(query: str, docs: list, topK: int):
    """Print the ``topK`` corpus questions most similar to ``query``.

    Similarity is the cosine between the query's vector and each doc's
    vector. A pair whose cosine is undefined (either vector has zero
    norm) is scored 0.0 rather than NaN. ``np.argsort`` places NaN last,
    so the descending reversal used here would otherwise rank exactly
    those undefined entries first.

    Args:
        query: Free-text question to look up.
        docs: Sequence of spaCy Docs exposing ``.vector``, ``.text`` and
            ``._.answer``.
        topK: Maximum number of matches to print.

    Returns:
        None. Results are written to stdout.

    Relies on the module-level ``nlp`` pipeline and on
    ``create_2dvectors_from_docs`` defined earlier in this notebook.
    """
    print(f"Query: {query}")
    print("="*20)
    if not docs:
        # Nothing to compare against; skip the vector maths, which would
        # fail on an empty corpus.
        return

    query_vector = nlp(query).vector
    doc_vectors2D = create_2dvectors_from_docs(docs)

    dot_products = np.dot(doc_vectors2D, query_vector)
    denominators = norm(doc_vectors2D, axis=1) * norm(query_vector)

    # Divide only where the denominator is non-zero; everything else keeps
    # the neutral score 0.0 instead of becoming NaN.
    has_norm = denominators > 0
    cos_sim = np.zeros(
        dot_products.shape,
        dtype=np.result_type(dot_products.dtype, denominators.dtype),
    )
    cos_sim[has_norm] = dot_products[has_norm] / denominators[has_norm]

    sorted_ids = np.argsort(cos_sim)
    top_sim_ids = sorted_ids[::-1][:topK] # in descending order
    for idx in top_sim_ids:
        score = cos_sim[idx]
        doc = docs[idx]
        print(f"Corpus question: {doc.text}")
        print(f"Corpus answer: {doc._.answer}")
        print(f"Score: {score}")
        print("="*10)

In [None]:
# NOTE(review): the query text has a stray apostrophe after "college"; it is left as-is so the recorded output below still matches.
query = "What did Michael Jordan major in college'?"
# Number of best-matching corpus questions to print.
topK = 5
find_most_similar(query, docs, topK)

Query: What did Michael Jordan major in college'?
Corpus question: what did jeff corwin major in?
Corpus answer: Biologist
Score: 0.8347316980361938
Corpus question: what did shawnee smith star in?
Corpus answer: The Grudge 3, Saw IV, Summer School, Saw II, Saw III, The Island, Saw, The Blob, Who's Harry Crumb?
Score: 0.804623544216156
Corpus question: when did michael jordan started playing basketball in the nba?
Corpus answer: 1984
Score: 0.8012731075286865
Corpus question: what shows did kellie martin star in?
Corpus answer: A Pup Named Scooby-Doo, Crisis Center, Father Murphy, Life Goes On, Christy
Score: 0.7991346120834351
Corpus question: what did charles dickens believe in?
Corpus answer: Anglicanism
Score: 0.7923551797866821
