In [1]:

import numpy as np
from sklearn import pipeline, decomposition
from sklearn import feature_extraction 
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

import haystack.document_store.memory
import haystack.document_store.elasticsearch
from haystack import document_store

import haystack.retriever.dense
from haystack import retriever
import pandas as pd

02/21/2021 23:23:20 - INFO - faiss -   Loading faiss with AVX2 support.
02/21/2021 23:23:20 - INFO - faiss -   Loading faiss.
02/21/2021 23:23:20 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
n_samples = 1000
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

data_train = dataset.data[:n_samples]
data_test = dataset.data[n_samples:]

In [3]:
pipe = pipeline.make_pipeline(
    feature_extraction.text.TfidfVectorizer(),
    decomposition.TruncatedSVD(n_components=100)
)

In [4]:
%%time
pipe.fit(data_train)

CPU times: user 3.3 s, sys: 9.1 s, total: 12.4 s
Wall time: 476 ms


Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=100,
                              n_iter=5, random_state=Non

In [5]:
import mlutil



In [6]:
def get_embedding_dim(embedder):
    if isinstance(embedder, pipeline.Pipeline):
        return embedder.steps[-1][1].components_.shape[0]
    elif isinstance(embedder, mlutil.feature_extraction.embeddings.PCREmbeddingVectorizer):
        return embedder.dimensionality_
    else:
        return embedder.components_.shape[0]


def make_dicts_with_added_field(dicts, field_name, field_values):
    for d, val in zip(dicts, field_values):
        d[field_name] = val
        yield d
    

def make_sklearn_retriever(df, pipeline, col='text'):
    embedding_dim = get_embedding_dim(pipeline)
    memory_docstring_store = document_store.elasticsearch.ElasticsearchDocumentStore(index=col, embedding_dim=embedding_dim)
    df['text'] = df[col]
    embeddings = pipeline.fit_transform(df['text'])
    documents = df.to_dict('records')
    dicts_with_added_fields = list(make_dicts_with_added_field(documents, 'embedding', embeddings))
    memory_docstring_store.write_documents(dicts_with_added_fields)
    return retriever.dense.SklearnTransformerRetriever(embedding_transformer=pipeline, document_store=memory_docstring_store)

In [7]:
df = pd.DataFrame({'newsgroups_text': data_train})

In [8]:
df = df[~(df['newsgroups_text'].apply(len) > 10000)]

In [9]:
pipe.fit(df['newsgroups_text'])

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=100,
                              n_iter=5, random_state=Non

In [10]:
sklearn_retriever = make_sklearn_retriever(df, pipe, col='newsgroups_text')

02/21/2021 23:23:22 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.004s]
02/21/2021 23:23:22 - INFO - elasticsearch -   HEAD http://localhost:9200/newsgroups_text [status:200 request:0.001s]
02/21/2021 23:23:22 - INFO - elasticsearch -   GET http://localhost:9200/newsgroups_text [status:200 request:0.001s]
02/21/2021 23:23:22 - INFO - elasticsearch -   PUT http://localhost:9200/newsgroups_text/_mapping [status:200 request:0.004s]
02/21/2021 23:23:22 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.001s]
02/21/2021 23:23:23 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.714s]
02/21/2021 23:23:24 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.996s]


In [11]:
sklearn_retriever.retrieve('atheism')

02/21/2021 23:23:24 - INFO - elasticsearch -   POST http://localhost:9200/newsgroups_text/_search [status:200 request:0.006s]


[{'text': "\n: The Bible says there is a God; if that is true then our atheism is\n: mistaken.  What of it?  Seems pretty obvious to me.  Socrates said\n: there were many gods; if that is true then your monotheism (and our\n: atheism) is mistaken, even if Socrates never existed.\n\n\nJim,\n\nI think you must have come in late. The discussion (on my part at\nleast) began with Benedikt's questioning of the historical acuuracy of\nthe NT. I was making the point that, if the same standards are used to\nvalidate secular history that are used here to discredit NT history,\nthen virtually nothing is known of the first century.\n\nYou seem to be saying that the Bible -cannot- be true because it\nspeaks of the existence of God as it it were a fact. Your objection\nhas nothing to do with history, it is merely another statement of\natheism.", 'id': 'bc4b2af5-195b-4dd9-ab9c-b6e546e89d81', 'score': 0.03589999999996962, 'probability': 0.500089749999036, 'question': None, 'meta': {'newsgroups_text': 

## Advanced: PCR embeddings vectorizer

For detailed description of method check out [A Critique of the Smooth Inverse Frequency Sentence Embeddings](https://arxiv.org/pdf/1909.13494.pdf)


In [12]:
from mlutil.feature_extraction import embeddings

word_embeddings = embeddings.load_gensim_embedding_model('glove-twitter-50')

02/21/2021 23:23:24 - INFO - gensim.models.utils_any2vec -   loading projection weights from /home/kuba/gensim-data/glove-twitter-50/glove-twitter-50.gz
02/21/2021 23:24:00 - INFO - gensim.models.utils_any2vec -   loaded (1193514, 50) matrix from /home/kuba/gensim-data/glove-twitter-50/glove-twitter-50.gz


In [13]:
pcr_vectorizer = embeddings.PCREmbeddingVectorizer(word_embeddings)

In [14]:
pcr_vectorizer.fit(df['newsgroups_text'])

In [15]:
df['newsgroups_text_pcr'] = df['newsgroups_text']

In [16]:
vectors = pcr_vectorizer.transform(df['newsgroups_text'])

In [17]:
vectors.shape

(990, 50)

In [18]:
pcr_retriever = make_sklearn_retriever(df, pcr_vectorizer, col='newsgroups_text_pcr')

02/21/2021 23:24:00 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.002s]
02/21/2021 23:24:00 - INFO - elasticsearch -   HEAD http://localhost:9200/newsgroups_text_pcr [status:200 request:0.001s]
02/21/2021 23:24:00 - INFO - elasticsearch -   GET http://localhost:9200/newsgroups_text_pcr [status:200 request:0.001s]
02/21/2021 23:24:00 - INFO - elasticsearch -   PUT http://localhost:9200/newsgroups_text_pcr/_mapping [status:200 request:0.005s]
02/21/2021 23:24:00 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.001s]
02/21/2021 23:24:01 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.326s]
02/21/2021 23:24:02 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.009s]


In [19]:
pcr_retriever.retrieve('atheism')

02/21/2021 23:24:02 - INFO - elasticsearch -   POST http://localhost:9200/newsgroups_text_pcr/_search [status:200 request:0.009s]


[{'text': '\n\n\nAnd organized religion is a religion built from organized values.\nAnd Ford Tempo is a Tempo built from Ford values.\nAnd rational response is response built from rational values.\nAnd unconditional surrender is surrender built from unconditional values.\n    ^^^^^^^^^^^^^^^^^^^^^^^\n          uncle!', 'id': '5b118c6d-9975-4877-8590-cd62f1cab78a', 'score': 4.266399999999976, 'probability': 0.510664382424982, 'question': None, 'meta': {'newsgroups_text_pcr': '\n\n\nAnd organized religion is a religion built from organized values.\nAnd Ford Tempo is a Tempo built from Ford values.\nAnd rational response is response built from rational values.\nAnd unconditional surrender is surrender built from unconditional values.\n    ^^^^^^^^^^^^^^^^^^^^^^^\n          uncle!', 'newsgroups_text': '\n\n\nAnd organized religion is a religion built from organized values.\nAnd Ford Tempo is a Tempo built from Ford values.\nAnd rational response is response built from rational values.\nAnd