## Q&A System

In [5]:
#gensim 3.8.3 version para word2vec 
from pathlib import Path
import chardet
import es_core_news_sm
from scipy.spatial.distance import cosine
from spacy.lang.es.stop_words import STOP_WORDS
from string import punctuation
import numpy as np
import regex
import joblib
import os
import pandas as pd
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Union
import requests
from wikitextparser import remove_markup
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from typing import List, Optional, Sequence
from functools import reduce
from gensim import models
from gensim import corpora
from gensim.models import Word2Vec
from collections import defaultdict
from enum import Enum
from nltk.tokenize import sent_tokenize
import math

### Function to generate texts from wikipedia

In [422]:
def wikipedia_corpus():
    """Extrae contenido de textos de wikipedia.
    
    Los textos se escogen al azar, de wikipedia en español.
    Para más información revisar la API:
    
    https://www.mediawiki.org/wiki/API:Random
    
    Se utiliza el paquete ``wikitextparser`` para limpiar
    los textos (en formato wikitext):
    
    https://github.com/5j9/wikitextparser#miscellaneous
    """
    url = "https://es.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "generator": "random",
        "prop": "revisions",
        "rvprop": "content",
        "grnnamespace": "0",
    }
    session = requests.Session()
    res = session.get(url=url, params=params)
    res.raise_for_status()
    data = res.json()
    for id, page in data["query"]["pages"].items():
        if "revisions" in page:
            yield remove_markup(page["revisions"][0]["*"])

### Creation of list of texts from wikipedia

In [465]:
cant_textos =1000
textos_wiki = []
id_o = 0
for i in range(0,cant_textos):
    texto = next(wikipedia_corpus())
    textos_wiki.append(texto)

###  Instanciate Class of lemmatizer

In [7]:
LEMMATIZATION_LIST_FILE = Path("lemmatization-es.txt")

class Lemmatizer(ABC):
    """Abstrae lematizador."""
    @abstractmethod
    def lemmatize(self, word: str) -> str:
        pass
    
    
class DummyLemmatizer(Lemmatizer):
    """Lematizador que no hace nada."""
    def lemmatize(self, word: str) -> str:
        return word


class LexiconLemmatizer(Lemmatizer):
    """Implementa un lematizador basado en lexicon."""

    def __init__(self, lemma_dict: Dict[str, str]):
        """Constructor.
        
        Lee un diccionario de reglas y lo utiliza como lematizador.
        
        :param lemma_dict: Diccionario de reglas de lematización.
        :type lemma_dict: Dict[str, str] 
        """
        self.lemma_dict = lemma_dict
        
    @classmethod
    def from_file(cls: "LexiconLemmatizer",
                  filepath: Union[str, Path]) -> "LexiconLemmatizer":
        """Carga un lematizador desde un archivo de texto."""
        if isinstance(filepath, str):
            filepath = Path(filepath)
            
        with open(filepath, "r") as fp:
            lemma_dict = dict()
            for line in fp:
                try:
                    lemma, word = line.strip().split()
                    lemma_dict[word] = lemma
                except Exception:
                    pass
        return cls(lemma_dict)

    def lemmatize(self, word: str) -> str:
        """Lematiza una palabra.

        Si la palabra no se encuentra en el lexicon utilizado para
        inicializar el lematizador, retornará la misma palabra como
        lemma.

        :param word: Palabra a lematizar
        :type word: str
        :return: Lema de la palabra
        :rtype: str
        """
        return self.lemma_dict.get(word, word)


nlp           = es_core_news_sm.load()

In [8]:
# Trying Lemmatizer
lemmatizer = LexiconLemmatizer.from_file(LEMMATIZATION_LIST_FILE)
lemmatizer.lemmatize("manipulaste")

'manipular'

In [2]:
### Create Preprocessing Pipeline

In [426]:
def pre_processing_pipeline(*preprocessing_steps):
    """Implementa composicion de funciones.
    
    En escencia recibe un iterable de funciones, y las aplica
    en el orden entregado, por ejemplo:
    
    - Limpiar texto
    - Tokenizar
    
    Tomará el texto de entrada lo limpiará y lo tokenizará.
    """
    return reduce(
        lambda f, g: lambda x: g(f(x)),
        preprocessing_steps, lambda x: x)


def remove_punctuation(text: str) -> str:
    """Reemplaza signos de puntuación por espacios."""
    return text.translate(
        str.maketrans(string.punctuation, ' '*len(string.punctuation)))


def remove_stopwords(tokens: List[str]) -> Sequence[str]:
    """Filtra palabras que sean stopwords."""
    return filter(lambda x: x not in stopwords.words("spanish"), tokens)
    

def lemmatize(tokens: Sequence[str],
              lemmatizer: Optional[Lemmatizer]=None) -> Sequence[str]:
    """Lematiza tokens."""
    if lemmatizer is None:
        lemmatizer = DummyLemmatizer()
        
    return map(lemmatizer.lemmatize, tokens)

def lowercase(text: str) -> str:
    """Transforma texto a minúsculas."""
    return text.lower()


def tokenize(text: str) -> List[str]:
    """Tokeniza texto."""
    return word_tokenize(text)


# Pipeline de preprocesamiento
pipeline = pre_processing_pipeline(
    remove_punctuation,
    lowercase,
    tokenize,
    remove_stopwords,
    lambda x: lemmatize(x, lemmatizer))


# Ejemplo
list(pipeline("El perro come carne"))

['perro', 'comer', 'carne']

### Count token frequency and remove those that only appear once, they can be considered "noise".  Then train word2vec model

In [518]:
frequency = defaultdict(int)
for text in map(pipeline, textos_wiki):
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in pipeline(document) if frequency[token] > 1]
    for document in textos_wiki
]




# Training a word2vec model from the given data set
w2v_model = Word2Vec(texts, size=200, min_count=2, window=4, sg=1, workers=4)
w2v_model.train(texts, total_examples=len(texts), compute_loss=True, epochs=50)



(18672151, 19393450)

In [519]:
dictionary = [term for term in w2v_model.wv.vocab]  

### Create class for recognize type of question and create function for embedding of the phrases, tokens or text with Word2vec model, to get embedding of the text(vector)

In [520]:
class QuestionType(Enum):
    QUE = "QUÉ"
    QUIEN = "QUIÉN"
    DONDE = "DÓNDE"
    COMO = "CÓMO"    
    CUANDO = "CUANDO"   
    PORQUE = "PORQUÉ"    
    CUAL = "CUÁL"


class QueryClassification(ABC):
    @staticmethod
    @abstractmethod
    def get_query_class(query: str) -> QuestionType:
        pass

    
class RuleQueryClassification(QueryClassification):
    rules = {
        r"^¿Qué (\w\s?)+\?$": QuestionType.QUE,
        r"^¿Quién (\w\s?)+\?$": QuestionType.QUIEN,
        r"^¿Dónde (\w\s?)+\?$": QuestionType.DONDE,
        r"^¿Cómo (\w\s?)+\?$": QuestionType.COMO,        
        r"^¿Cuando (\w\s?)+\?$": QuestionType.CUANDO,        
        r"^¿Porqué (\w\s?)+\?$": QuestionType.PORQUE,        
        r"^¿Cual (\w\s?)+\?$": QuestionType.CUAL,
    }
    
    def get_query_class(query: str) -> QuestionType:
        for rule, query_type in RuleQueryClassification.rules.items():
            if re.match(rule, query, re.IGNORECASE):
                return query_type
        raise ValueError(f"Tipo de pregunta no soportada: {query}") 
        
def ObtenerEmbeddingOracion(modelo, oracion):  
    Lista_vectores = [modelo.wv[w]  for w in pipeline(oracion) if w in dictionary]
    embedding_palabras = np.array(Lista_vectores)
    embedding_oracion = embedding_palabras.mean(axis=0)
    return(embedding_oracion) 

In [521]:
query = "¿Quién es Jose Velazquez?"
vec_query = ObtenerEmbeddingOracion(w2v_model,query)

RuleQueryClassification.get_query_class(query)

<QuestionType.QUIEN: 'QUIÉN'>

### Measure the cosine similarity between the question made against each vector of the text 

In [523]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
i=0
Similitud=[]
nombre=[]
relevant_docs=[]
threshold = 0.70
relevant_index=[]
relevant_sim=[]

for x in tqdm(textos_wiki):
    texto=ObtenerEmbeddingOracion(w2v_model, x)
    similitud = cosine_similarity([texto], [vec_query])
    Similitud.append(similitud)
    if similitud > threshold:
        relevant_index.append(i)
        relevant_sim.append(similitud)
        relevant_docs.append(x)
    i+=1  


100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s]


In [524]:
similitud_indice = pd.DataFrame()
similitud_indice["relevant_index"] = relevant_index
similitud_indice["relevant_docs"] = relevant_docs
similitud_indice["relevant_sim"] = relevant_sim
similitud_indice.set_index("relevant_index").sort_values(by=["relevant_sim"], ascending=False)


Unnamed: 0_level_0,relevant_docs,relevant_sim
relevant_index,Unnamed: 1_level_1,Unnamed: 2_level_1
86,\n\nPacto de fuga es una película chilena del ...,[[0.7163151]]


### With the selected text, measure the cosine similarity, between the query and each sentence from the selected text

In [526]:
for i in tqdm(relevant_index):
    sentences = sent_tokenize(textos_wiki[i])
    relevant_passages = []
    threshold_passage = 0.65
    relevant_sim_pas=[]
    pasajes_seleccionados = pd.DataFrame()
    for sentence in sentences:
        vec_passage = ObtenerEmbeddingOracion(w2v_model,sentence)
        if np.all(np.isnan(vec_passage) != True): #este hay que arreglarlo 
            if cosine_similarity([vec_passage], [vec_query]) > threshold_passage:
                relevant_passages.append(sentence)
                relevant_sim_pas.append(cosine_similarity([vec_passage], [vec_query]))

pasajes_seleccionados["relevant_passages"] = relevant_passages
pasajes_seleccionados["relevant_sim_pas"] = relevant_sim_pas
pasajes_seleccionados = pasajes_seleccionados.sort_values(by=["relevant_sim_pas"], ascending=False)

100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


In [528]:
print('\033[1m' + query)
print("\n")
print('\033[0m' + pasajes_seleccionados.iloc[0]["relevant_passages"])
pasajes_seleccionados

[1m¿Quién es Jose Velazquez?


[0m* Jose Luis Aguilera como gendarme Care'Poker.


Unnamed: 0,relevant_passages,relevant_sim_pas
1,* Jose Luis Aguilera como gendarme Care'Poker.,[[0.7956158]]
0,Simultáneamente el fiscal Ad Hoc Andrade (Mat...,[[0.70753837]]
