In [1]:

import numpy as np
from sklearn import pipeline, decomposition
from sklearn import feature_extraction 
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

import haystack.document_store.memory
import haystack.document_store.elasticsearch
from haystack import document_store

import haystack.retriever.dense
from haystack import retriever
import pandas as pd

02/21/2021 17:16:15 - INFO - faiss -   Loading faiss with AVX2 support.
02/21/2021 17:16:15 - INFO - faiss -   Loading faiss.
02/21/2021 17:16:15 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
n_samples = 1000
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

data_train = dataset.data[:n_samples]
data_test = dataset.data[n_samples:]

In [3]:
pipe = pipeline.make_pipeline(
    feature_extraction.text.TfidfVectorizer(),
    decomposition.TruncatedSVD(n_components=100)
)

In [4]:
%%time
pipe.fit(data_train)

CPU times: user 3.14 s, sys: 8.56 s, total: 11.7 s
Wall time: 451 ms


Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=100,
                              n_iter=5, random_state=Non

In [5]:
def get_embedding_dim(pipeline):
    embedder = pipeline.steps[-1][1]
    return embedder.components_.shape[0]


def make_dicts_with_added_field(dicts, field_name, field_values):
    for d, val in zip(dicts, field_values):
        d[field_name] = val
        yield d
    

def make_sklearn_retriever(df, pipeline, col='text'):
    embedding_dim = get_embedding_dim(pipeline)
    memory_docstring_store = document_store.elasticsearch.ElasticsearchDocumentStore(index=col, embedding_dim=embedding_dim)
    df['text'] = df[col]
    embeddings = pipeline.fit_transform(df['text'])
    documents = df.to_dict('records')
    dicts_with_added_fields = list(make_dicts_with_added_field(documents, 'embedding', embeddings))
    memory_docstring_store.write_documents(dicts_with_added_fields)
    return retriever.dense.SklearnPipelineRetriever(embedding_pipeline=pipeline, document_store=memory_docstring_store)

In [6]:
df = pd.DataFrame({'newsgroups_text': data_train})

In [7]:
df = df[~(df['newsgroups_text'].apply(len) > 10000)]

In [8]:
pipe.fit(df['newsgroups_text'])

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=100,
                              n_iter=5, random_state=Non

In [9]:
sklearn_retriever = make_sklearn_retriever(df, pipe, col='newsgroups_text')

02/21/2021 17:16:17 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.002s]
02/21/2021 17:16:17 - INFO - elasticsearch -   PUT http://localhost:9200/newsgroups_text [status:200 request:0.057s]
02/21/2021 17:16:17 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.001s]
02/21/2021 17:16:18 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.528s]
02/21/2021 17:16:19 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.984s]


In [10]:
sklearn_retriever.retrieve('atheism')

02/21/2021 17:16:19 - INFO - elasticsearch -   POST http://localhost:9200/newsgroups_text/_search [status:200 request:0.006s]


[{'text': "\n: The Bible says there is a God; if that is true then our atheism is\n: mistaken.  What of it?  Seems pretty obvious to me.  Socrates said\n: there were many gods; if that is true then your monotheism (and our\n: atheism) is mistaken, even if Socrates never existed.\n\n\nJim,\n\nI think you must have come in late. The discussion (on my part at\nleast) began with Benedikt's questioning of the historical acuuracy of\nthe NT. I was making the point that, if the same standards are used to\nvalidate secular history that are used here to discredit NT history,\nthen virtually nothing is known of the first century.\n\nYou seem to be saying that the Bible -cannot- be true because it\nspeaks of the existence of God as it it were a fact. Your objection\nhas nothing to do with history, it is merely another statement of\natheism.", 'id': '5989f5a9-2173-4038-8517-d33a92e975dd', 'score': 0.03570000000001983, 'probability': 0.5000892499990521, 'question': None, 'meta': {'newsgroups_text':