In [1]:
import numpy as np
import pandas as pd

from sklearn import pipeline, decomposition
from sklearn import feature_extraction 
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

import haystack.document_store.memory
from haystack import document_store
import haystack.retriever.dense
from haystack import retriever

import mlutil
from mlutil.feature_extraction import embeddings

03/03/2021 16:30:39 - INFO - faiss -   Loading faiss with AVX2 support.
03/03/2021 16:30:39 - INFO - faiss -   Loading faiss.
03/03/2021 16:30:39 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_colwidth", 150) 

# Haystack library with scikit-learn extracted features

Haystack is a library for question-answering and search that is mostly used with features extracted using word embeddings.

Haystack supports basically two modes for using text features:
- sparse, loading text for search using TF-IDF or BM25 using Elasticsearch
- dense, with features extracted using huggingface transformers.

This feature adds another way to add dense features: using scikit-learn.
It might make sense to use this for fast prototyping (features like in Latent Semantic Analysis) or in low-resource environment where transformers would be a burden.

Another motivation is using features extracted with word embeddings (like using PCREmbedding vectorizer from my [lambdaofgod/mlutil](https://github.com/lambdaofgod/mlutil) library).

In [3]:
n_samples = 1000
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

data_train = dataset.data[:n_samples]
data_test = dataset.data[n_samples:]

## Good old Latent Semantic Analysis

We extract features by factorizing TF-IDF matrix with Truncated SVD.
This can be done by creating scikit-learn pipeline.

In [4]:
lsa_pipe = pipeline.make_pipeline(
    feature_extraction.text.TfidfVectorizer(),
    decomposition.TruncatedSVD(n_components=100)
)

In [5]:
%%time
lsa_pipe.fit(data_train)

CPU times: user 3.51 s, sys: 9.69 s, total: 13.2 s
Wall time: 504 ms


Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('truncatedsvd',
                 TruncatedSVD(algorithm='randomized', n_components=100,
                              n_iter=5, random_state=Non

In [6]:
def get_embedding_dim(embedder):
    if isinstance(embedder, pipeline.Pipeline):
        return embedder.steps[-1][1].components_.shape[0]
    elif isinstance(embedder, mlutil.feature_extraction.embeddings.PCREmbeddingVectorizer):
        return embedder.dimensionality_
    else:
        return embedder.components_.shape[0]


def make_dicts_with_added_field(dicts, field_name, field_values):
    for d, val in zip(dicts, field_values):
        d[field_name] = val
        yield d
    

def make_sklearn_retriever(df, pipeline, col='text'):
    embedding_dim = get_embedding_dim(pipeline)
    memory_docstring_store = document_store.memory.InMemoryDocumentStore(index=col, embedding_dim=embedding_dim)
    df['text'] = df[col]
    embeddings = pipeline.fit_transform(df['text'])
    documents = df.to_dict('records')
    dicts_with_added_fields = list(make_dicts_with_added_field(documents, 'embedding', embeddings))
    memory_docstring_store.write_documents(dicts_with_added_fields)
    return retriever.dense.SklearnTransformerRetriever(embedding_transformer=pipeline, document_store=memory_docstring_store)


def prettify_response(response):
    return pd.Series([ 
        doc.text for doc in response
    ]).str.strip()

In [7]:
df = pd.DataFrame({'newsgroups_text': data_train})

In [8]:
df = df[~(df['newsgroups_text'].apply(len) > 10000)]

In [9]:
lsa_pipe.fit(df['newsgroups_text']);

In [10]:
lsa_retriever = make_sklearn_retriever(df, lsa_pipe, col='newsgroups_text')

In [11]:
prettify_response(lsa_retriever.retrieve('atheism'))

0    Well, this is alt.atheism.  I hope you arent here to try to convert anyone.\n\n\nMany would disagree.\n\n[...]\n\nWell, you shouldn't give any par...
1    Yeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs ...
2    As I was created in the image of Gaea, therefore I must\nbe the pinnacle of creation, She which Creates, She which\nBirths, She which Continues.\n...
3    Are you your own master?  Do you have any habits that you cannot break?\nFor one, you seem unable to master your lack of desire to understand\neve...
4                                                                                                                                                         
5    [..]\nReferring to the manual of my motherboard with AMI-BIOS, 10 beeps are a \n'CMOS Shutdown Register Read/Write Error', if the system stops aft...
6    I posted about this a while ago but without code excerpts noone w

## Advanced: PCR embeddings vectorizer

For detailed description of method check out [A Critique of the Smooth Inverse Frequency Sentence Embeddings](https://arxiv.org/pdf/1909.13494.pdf)

PCR Vectorizer [implementation link](https://github.com/lambdaofgod/mlutil/blob/master/mlutil/feature_extraction/embeddings.py#L179)

In [12]:
word_embeddings = embeddings.load_gensim_embedding_model('glove-twitter-50')

03/03/2021 16:30:41 - INFO - gensim.models.utils_any2vec -   loading projection weights from /home/kuba/gensim-data/glove-twitter-50/glove-twitter-50.gz
03/03/2021 16:31:17 - INFO - gensim.models.utils_any2vec -   loaded (1193514, 50) matrix from /home/kuba/gensim-data/glove-twitter-50/glove-twitter-50.gz


In [13]:
pcr_vectorizer = embeddings.PCREmbeddingVectorizer(word_embeddings)

In [14]:
pcr_vectorizer.fit(df['newsgroups_text'])

In [15]:
df['newsgroups_text_pcr'] = df['newsgroups_text']

In [16]:
vectors = pcr_vectorizer.transform(df['newsgroups_text'])

In [17]:
vectors.shape

(990, 50)

In [18]:
pcr_retriever = make_sklearn_retriever(df, pcr_vectorizer, col='newsgroups_text_pcr')

In [19]:
prettify_response(pcr_retriever.retrieve('atheism'))

0    I'm sorry, I thought we were discussing heresy.  I assumed that heresy\nmeant a departure from orthodoxy, in which case generally accepted belief ...
1            True.\n\nAlso read 2 Peter 3:16\n\nPeter warns that the scriptures are often hard to understand by those who\nare not learned on the subject.
2    Exactly.\n\nBut I'll add another observation: if the chip does become a standard,\nthe algorithm won't _remain_ secret.\n\nLeaving the government ...
3    "Put not your trust in princes" is the Biblical proverb.  The modern\nanalog is governments.  At the time of the founding of the US, the\nidea tha...
4    You're admitting a lot more than that.  You are admitting that\nyour morals are situational.   You are admitting that the actions\nof other people...
5    Yes it is, as has been evidenced by the previous two stages\nof withdrawal from the area and by the reductions in troops.\nCurrently the troops ar...
6    Although I realize that principle is not one of your strongest\np