In [1]:
# !python -m spacy download en_core_web_sm

Importing the necessary modules 

In [6]:
import os
import pickle
import pandas as pd
import spacy
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import gc
import torch
VEC_PICKLE_LOC = "/home/student/QA-System/backend/files/data/vectorizer_ver0.22.1.pickle"

This method lemmatize our csv's paragraphs

In [7]:
#lemmas word
def lemmatize(phrase):
    sp = spacy.load("en_core_web_sm")
    return " ".join([word.lemma_ for word in sp(phrase)])


This method reads our whole csv and put it in our paragraphs array

In [8]:
def reading_csv(pathToCSV):
    data = pd.read_csv(pathToCSV)
    paragraphs = []
    for t in data['text']:
        if not pd.isna(t):
            paragraphs.append(t)
    return paragraphs

This method process our question and return an answer. There will be code comments that explain what the code does.

In [9]:
def processing_question(ques, paragraphs, domain_lemma_cache, domain_pickle):   
    #Lemmatizing whole csv text column
    LEMMA_CACHE = domain_lemma_cache
    if not os.path.isfile(LEMMA_CACHE):#if there is no cache of the current domain, we will create a new lemma cache for the domain
        lemmas = [lemmatize(par) for par in tqdm(paragraphs)] 
        df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas})
        df.to_feather(LEMMA_CACHE)#save the lemmatzed corpus in a feather file so we don't have to lemmatize the corpus each time
        
    df = pd.read_feather(LEMMA_CACHE)
    paragraphs = df.context
    lemmas = df.lemmas
    
    if not os.path.isfile(VEC_PICKLE_LOC):#if there is no TDIDF vectorizer model. we will create the model and fit the lemma cache here
        vectorizer = TfidfVectorizer(
            stop_words='english', min_df=5, max_df=.5, ngram_range=(1, 3))
        vectorizer.fit_transform(lemmas)
        pickle.dump(vectorizer, open(VEC_PICKLE_LOC, "wb"))      

    if not os.path.isfile(domain_pickle):#if there is no vector representation of the lemma cache we will vectorize it here
        tfidf = vectorizer.fit_transform(lemmas)
        pickle.dump(tfidf, open(domain_pickle, "wb"))
        
    #if there is cache of the TFIDF model and the cache for the vector representation of the lemma of the current domain we will load it in memory
    vectorizer = pickle.load(open(VEC_PICKLE_LOC, "rb"))
    tfidf = pickle.load(open(domain_pickle, "rb"))
      
    # we lemmatize and vectorize our question here
    question = ques
    query = vectorizer.transform([lemmatize(question)])
    (query > 0).sum(), vectorizer.inverse_transform(query)
    
    #Dot producting the paragraph with our question.
    scores = (tfidf * query.T).toarray()
    results = (np.flip(np.argsort(scores, axis=0)))
    
    #declaring our question answering pipeline
    qapipe = pipeline('question-answering',
                      model='distilbert-base-uncased-distilled-squad', #we are using the distilbert-base-uncased-distilled-squad model to process our answer
                      tokenizer='bert-base-uncased', #using the bert-base-uncased tokenizer to tokenize our paragraph
                      device=0)#device=0 means we are using the gpu instead of the CPU. 0 is the id for our gpu
    
    #getting the top 5 candidates and running those candidates through our pipeline
    THRESH = 0.01
    candidate_idxs = [ (i, scores[i]) for i in results[0:5, 0] ]
    contexts = [ (paragraphs[i],s)
        for (i,s) in candidate_idxs if s > THRESH ]
    
    question_df = pd.DataFrame.from_records([ {
        'question': question,
        'context':  ctx
    } for (ctx,s) in contexts ])
    
    #we process the paragraphs and use dataframe to tidy our data
    preds = qapipe(question_df.to_dict(orient="records"))
    answer_df = pd.DataFrame.from_records(preds)
    answer_df["context"] = question_df["context"]
    answer_df = answer_df.sort_values(by="score", ascending=False)
    return answer_df
 

domains_choices = {
    'op':('/home/student/QA-System/backend/files/data/op/op.csv',
          '/home/student/QA-System/backend/files/data/op/op.feather',
          '/home/student/QA-System/backend/files/data/op/op_tfidf.pickle')
}

domain = 'op'
ques = 'What is the degree i should study if i want to be a chef'
domain_lemma_cache = domains_choices[domain][1]
csvpath = domains_choices[domain][0]
pickle_cache = domains_choices[domain][2]
paragraphs = reading_csv(csvpath)
result_df = processing_question(ques, paragraphs, domain_lemma_cache, pickle_cache)
print(reresult_df.iloc[0])
    

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 7.79 GiB total capacity; 0 bytes already allocated; 85.69 MiB free; 0 bytes reserved in total by PyTorch)

In [None]:
# Referencing https://medium.com/@patonw/question-answering-with-pytorch-transformers-part-1-8736196bf20e