# TODO

## Major
- [x] create term-by-document matrix (calculate words frequncies for each term-document pair)
 - [ ] check that it's actually correct - especially if we don't map terms to wrong documents
- [x] convert term-by-document frequencies to tf-idf (calcualte tf-idf for each term-document pair)
 - [ ] check
- [x] we may need actual (numpy?) matrix?
- [x] LSI magic
- [ ] Put it together
- [ ] GUI

### Minor
- [x] remove numbers from terms - done but not sure if it's good thing to do, maybe it's also important for relevancy of docs,
like for example when there is year written?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import math
from pandas._testing import assert_frame_equal

In [2]:
np.random.seed(42)

In [3]:
bp_data = pd.read_csv("articles.csv", header=0)

In [4]:
bp_data.head(1)

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."


In [5]:
def get_lemmatization_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dictionary = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dictionary.get(tag, wordnet.NOUN)

In [6]:
def preprocess_docs(docs, use_lemmatizer = True):
    '''Tokenize and preprocess documents
    
    Parameters
    ----------
    use_lemmatizer : bool
                     Uses lemmazizer if True, othrerwise uses stemmer.
    '''
    preproccessed_docs = []
    
    # English stop words list
    en_stop = set(stopwords.words('english'))
    
    # Word tokenizer that removes punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    
    # lemmatizer / Stemmer
    if use_lemmatizer:
        lemmatizer = WordNetLemmatizer()
    else:
        stemmer = SnowballStemmer("english")
    
    for row in docs.itertuples(index=True, name='Doc'):
        text = row.text
        
        # remove numbers
        text = re.sub(r'\d+', '', text)
        
        text_words = tokenizer.tokenize(text)
        
        if use_lemmatizer:
            text_words = [lemmatizer.lemmatize(word.lower(), get_lemmatization_pos(word.lower())) for word in text_words
                          if word not in string.punctuation and word.lower() not in en_stop]
        else:
            text_words = [stemmer.stem(word.lower()) for word in text_words
                         if word not in string.punctuation and word.lower() not in en_stop]
        
        preproccessed_docs.append({'words': text_words})
    
    pdocs = pd.DataFrame(preproccessed_docs)
    return pdocs

In [7]:
preproccessed_docs = preprocess_docs(bp_data)
preproccessed_docs

Unnamed: 0,words
0,"[oh, headline, blare, chatbots, next, big, thi..."
1,"[ever, found, look, question, concept, syntax,..."
2,"[machine, learn, increasingly, move, hand, des..."
3,"[understand, machine, learn, big, question, ma..."
4,"[want, learn, apply, artificial, intelligence,..."
...,...
332,"[click, share, article, linkedin, skip, part, ..."
333,"[opinion, deep, neural, network, machine, lear..."
334,"[everyone, remotely, tune, recent, progress, m..."
335,"[one, big, misconception, around, idea, deep, ..."


In [8]:
def get_term_by_document_frequency(preprocessed_docs):
    document_by_term = {}
    
    for index, row in preprocessed_docs.iterrows():
        doc_id = index
        doc_words = row['words']
        
        # computed later, @TODO: move computation here and fix below or remove
#         document_by_term[doc_id] = {
#             'total_words': len(doc_words)
#         }
        document_by_term[doc_id] = {}
        
        for word in set(row['words']):
            document_by_term[doc_id][word] = doc_words.count(word)

    df = pd.DataFrame(document_by_term)
    
    return df

In [9]:
df_frequency = get_term_by_document_frequency(preproccessed_docs)

In [10]:
df_frequency

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
feedback,1.0,,1.0,,,,,,,4.0,...,,,,1.0,,,1.0,,,
dave,2.0,,,,,,,,,,...,,,,,,,,,,
useful,1.0,1.0,2.0,1.0,1.0,3.0,,2.0,,2.0,...,2.0,,,,,2.0,3.0,,,2.0
unpredictability,1.0,,,,,,,,,,...,,,,,,,,,,
thing,7.0,1.0,1.0,4.0,1.0,4.0,4.0,1.0,,7.0,...,8.0,,3.0,3.0,,5.0,1.0,,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
paradox,,,,,,,,,,,...,,,,,,,,,,1.0
starking,,,,,,,,,,,...,,,,,,,,,,1.0
np,,,,,,,,,,,...,,,,,,,,,,2.0
mistaken,,,,,,,,,,,...,,,,,,,,,,1.0


In [11]:
def reduce_terms(df_frequency, max_df=1, min_df=0, max_terms=None):
    '''Remove unimportant terms from term-by-document matrix.
    
    Parameters
    ----------
    df : pd.DataFrame
    max_df : float , between [0, 1]
             Terms that appear in more % of documents will be ignored
    min_df : float , between [0, 1]
             Terms that appear in less % of documents will be ignored
    max_terms : int , None
                If not None, only top `max_terms` terms will be returned.
    '''
    df = df_frequency.copy()
    corpus_size = df.shape[1]

    if 'doc_frequency' not in df:
        df['doc_frequency'] = df_frequency.fillna(0).astype(bool).sum(axis=1) / corpus_size
            
    df = df[df.doc_frequency <= max_df]
    df = df[df.doc_frequency >= min_df]
    
    if max_terms is not None and max_terms < df.shape[0]:
        df['term_count'] = df_frequency.fillna(0).sum(axis=1)
        df = df.sort_values('term_count', ascending=False)
        df = df.head(max_terms)
        df.drop('term_count',axis=1, inplace=True)
    
    return df

In [12]:
reduce_terms(df_frequency).sort_values('doc_frequency', ascending=False).shape

(15584, 338)

In [13]:
reduce_terms(df_frequency, 0.8, 0.1,1000).sort_values('doc_frequency', ascending=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,328,329,330,331,332,333,334,335,336,doc_frequency
way,11.0,3.0,1.0,5.0,4.0,5.0,2.0,2.0,2.0,9.0,...,,14.0,,5.0,1.0,2.0,,1.0,7.0,0.792285
take,7.0,3.0,4.0,5.0,9.0,2.0,,4.0,,5.0,...,6.0,2.0,,6.0,2.0,1.0,4.0,,3.0,0.777448
machine,6.0,2.0,10.0,11.0,2.0,,20.0,3.0,2.0,5.0,...,2.0,8.0,2.0,1.0,6.0,11.0,4.0,4.0,,0.768546
give,1.0,3.0,2.0,2.0,5.0,2.0,1.0,,5.0,8.0,...,1.0,2.0,,7.0,2.0,2.0,3.0,,4.0,0.762611
go,2.0,2.0,2.0,3.0,4.0,2.0,1.0,2.0,,10.0,...,1.0,2.0,3.0,,8.0,5.0,,2.0,2.0,0.756677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
unsupervised,,,,,,,1.0,,1.0,1.0,...,,,,,,2.0,1.0,,,0.100890
emerge,1.0,,,,,,,,,,...,,,,,,,,,,0.100890
apple,,,,,,,1.0,,,,...,,,,,,,,,,0.100890
man,,,,,,,,,,,...,,,,,,,,,1.0,0.100890


In [14]:
df_reduced = reduce_terms(df_frequency, 0.8, 0.1)

In [15]:
def get_tf_idf(df_frequency):
    df = df_frequency.copy()
    # tf := word frequency / total frequency
    df.loc['total_words'] = df.sum()
        
    df = df.drop('total_words')[:] / df.loc['total_words']
    
    # idf := log ( len(all_documents) / len(documents_containing_word) )
    
    corpus_size = df.shape[1]

    # number of non-zero cols
    if 'doc_frequency' not in df_frequency:
        df['doc_frequency'] = df.fillna(0).astype(bool).sum(axis=1)
        
    df['idf'] = np.log( corpus_size / df['doc_frequency'] )
    
    # tf-idf := tf * idf
    _cols = df.columns.difference(['idf', 'doc_frequency'])
    df[_cols] = df[_cols].multiply(df["idf"], axis="index")
    
    df.drop(columns=['doc_frequency', 'idf'], inplace=True)
    
    return df

In [16]:
df_tf_idf = get_tf_idf(df_reduced)
display(df_tf_idf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
feedback,0.017634,,0.016055,,,,,,,0.048283,...,,,,0.048642,,,0.017998,,,
useful,0.016963,0.026781,0.030888,0.026499,0.01419,0.031893,,0.075371,,0.023223,...,0.012863,,,,,0.043478,0.051940,,,0.036066
thing,0.111710,0.025194,0.014529,0.099716,0.01335,0.040004,0.117823,0.035453,,0.076466,...,0.048406,,0.051409,0.132058,,0.102256,0.016288,,,0.033929
benefit,0.017634,0.027840,,,,,,,,0.012071,...,,,,0.097284,0.137253,,,,,
interface,0.124921,,,,,,,,,,...,,,0.019163,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
improvement,,,,,,,,,,,...,,,0.037919,,0.022904,,,,0.067357,
movie,,,,,,,,,,,...,,,,,,,,,,
behavior,,,,,,,,,,,...,,,,,,,,0.050999,0.206094,
chosen,,,,,,,,,,,...,,,,,,,,,,0.019410


In [17]:
def custom_svd(A, full_matrices=True):
    eig_vals, eig_vecs = np.linalg.eig(A @ A.transpose())
    U = eig_vecs.real
    
    eig_vals, eig_vecs = np.linalg.eig(A.transpose() @ A)
    V = eig_vecs.transpose().real
    
    s_eigen = [math.sqrt(abs(x.real)) for x in eig_vals]
    
    if full_matrices == False:
        k = min(A.shape[0], A.shape[1])
        U = U[:, :k]
        V = V[:k, :]
    
    return U, np.array(s_eigen), V

In [18]:
def get_concept_by_document(df_tf_idf, customSVD = False):
    '''Transform data to concept space.
    '''
    values = df_tf_idf.fillna(0).to_numpy()
    
    if customSVD:
        U, s_eigen, V = custom_svd(values, False)
    else:
        U, s_eigen, V = np.linalg.svd(values, full_matrices=False)
    
    S = np.diag(s_eigen)
    
    concept_by_document = S @ V.T
    return pd.DataFrame(concept_by_document)

In [19]:
get_concept_by_document(df_tf_idf, True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
0,4.259916e-01,4.735769e-02,-4.988374e-02,6.605945e-01,-4.057636e-01,2.017974e-01,1.842499e-01,2.013197e-01,-1.133062e-01,2.206752e-01,...,6.344292e-02,-1.965424e-01,-3.437803e-03,-3.437803e-03,3.026468e-02,3.026468e-02,8.216953e-02,8.216953e-02,-6.274620e-02,-3.023712e-02
1,2.795415e-01,1.019208e-01,1.142461e-01,9.587874e-02,-1.670740e-01,-1.105999e-01,-9.245931e-02,-2.916502e-01,-8.074129e-02,-2.322764e-01,...,1.343727e-15,-1.830668e-15,2.233218e-17,2.233218e-17,4.663880e-16,4.663880e-16,5.215178e-16,5.215178e-16,-6.228108e-16,-1.614059e-18
2,2.969608e-01,2.414341e-02,-7.620556e-02,-3.244868e-02,-1.313077e-02,8.612612e-02,2.243156e-01,-2.861063e-01,-1.009107e-01,-3.374352e-01,...,-1.155219e-15,9.388783e-17,2.415961e-16,2.415961e-16,-5.247274e-16,-5.247274e-16,1.010382e-16,1.010382e-16,3.103595e-17,-2.345217e-16
3,2.867078e-01,2.366230e-02,-6.902352e-02,2.197335e-01,-1.269565e-01,-1.081077e-01,1.544402e-03,1.540779e-01,4.869423e-02,-3.573644e-01,...,4.875906e-01,-9.991720e-02,-6.170240e-02,-6.170240e-02,1.570920e-01,1.570920e-01,-3.273377e-02,-3.273377e-02,4.762522e-02,1.178360e-01
4,2.387406e-01,5.002920e-02,1.812412e-01,3.067574e-02,-1.084747e-02,-2.279296e-01,2.142887e-01,-5.804021e-01,2.589619e-01,4.967155e-02,...,-1.934876e-01,-9.638964e-02,1.629875e-02,1.629875e-02,-8.401476e-02,-8.401476e-02,2.247379e-02,2.247379e-02,-2.165160e-02,-1.841158e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,5.180239e-18,2.090961e-18,2.942616e-18,-1.722293e-18,-1.456185e-18,-1.251678e-18,-1.878074e-18,2.391504e-18,5.082309e-19,-9.163757e-19,...,1.215513e-17,1.427951e-18,1.343159e-18,1.343159e-18,8.822446e-18,8.822446e-18,-1.988191e-17,-1.988191e-17,1.376613e-17,1.570356e-17
333,3.622019e-18,1.411248e-18,2.263960e-18,-1.022815e-18,1.642467e-18,-2.802517e-18,-2.328739e-18,3.200219e-18,2.447777e-18,9.672342e-19,...,-2.243655e-32,-3.157051e-33,5.964271e-33,5.964271e-33,-7.562728e-33,-7.562728e-33,5.692346e-33,5.692346e-33,-5.669645e-33,-9.187774e-33
334,3.181885e-18,1.464290e-18,1.127673e-18,-1.740370e-18,6.060973e-19,-1.644318e-18,-1.986018e-18,1.491591e-18,2.307909e-18,4.667620e-19,...,2.315227e-33,-2.496536e-33,-3.853576e-33,-3.853576e-33,4.254467e-33,4.254467e-33,-4.749327e-35,-4.749327e-35,-1.022725e-34,4.256305e-34
335,2.247478e-18,-2.849830e-21,3.957946e-19,-1.005521e-18,-4.197702e-20,-1.836213e-18,1.148703e-18,1.019230e-18,4.613187e-19,1.018405e-18,...,4.297415e-33,1.155903e-33,-2.507104e-35,-2.507104e-35,3.131410e-33,3.131410e-33,1.710355e-33,1.710355e-33,-2.869342e-34,-4.922868e-34


In [20]:
get_concept_by_document(df_tf_idf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
0,4.259916e-01,-4.735769e-02,4.988374e-02,-6.605945e-01,4.057636e-01,-2.017974e-01,1.842499e-01,-2.013197e-01,1.133062e-01,-2.206752e-01,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,8.412690e+00,0.000000e+00
1,2.795415e-01,-1.019208e-01,-1.142461e-01,-9.587874e-02,1.670740e-01,1.105999e-01,-9.245931e-02,2.916502e-01,8.074129e-02,2.322764e-01,...,-9.524168e-16,-3.032755e-16,1.578148e-17,7.169224e-17,-6.919810e-17,2.895468e-17,-4.052101e-17,1.265118e-16,-2.120958e-15,-8.870188e-16
2,2.969608e-01,-2.414341e-02,7.620556e-02,3.244868e-02,1.313077e-02,-8.612612e-02,2.243156e-01,2.861063e-01,1.009107e-01,3.374352e-01,...,3.441197e-16,4.056753e-17,-2.336556e-17,3.213400e-17,-7.303946e-17,3.289132e-17,1.055218e-16,-6.212396e-17,2.127177e-15,1.388218e-15
3,2.867078e-01,-2.366230e-02,6.902352e-02,-2.197335e-01,1.269565e-01,1.081077e-01,1.544402e-03,-1.540779e-01,-4.869423e-02,3.573644e-01,...,-1.507740e-01,-5.016305e-01,-6.036712e-13,8.790665e-17,3.742111e-16,1.443813e-15,-1.852294e-15,4.918550e-16,-3.969495e-03,4.087100e-01
4,2.387406e-01,-5.002920e-02,-1.812412e-01,-3.067574e-02,1.084747e-02,2.279296e-01,2.142887e-01,5.804021e-01,-2.589619e-01,-4.967155e-02,...,5.008683e-01,8.810549e-02,-9.491592e-14,-9.181279e-16,-1.672780e-15,-9.319311e-16,-6.685654e-16,3.258684e-16,4.371110e-03,-9.148043e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,4.791039e-17,-1.933864e-17,-2.721532e-17,1.592894e-17,1.346780e-17,1.157638e-17,-1.736971e-17,-2.211827e-17,-4.700467e-18,8.475268e-18,...,5.180457e-18,1.090054e-17,4.773019e-30,9.948995e-32,-8.134035e-33,-3.628042e-32,-3.342736e-33,3.168764e-32,7.180716e-20,4.310753e-18
333,5.597801e-17,-2.181073e-17,-3.498932e-17,1.580752e-17,-2.538420e-17,4.331268e-17,-3.599048e-17,-4.945913e-17,-3.783020e-17,-1.494853e-17,...,-5.157048e-32,-2.171389e-32,-9.708295e-34,8.000336e-33,-7.149261e-33,-5.200060e-34,2.933033e-33,-8.376918e-33,1.081453e-33,-1.764253e-32
334,4.917577e-17,-2.263049e-17,-1.742810e-17,2.689728e-17,-9.367185e-18,2.541280e-17,-3.069375e-17,-2.305241e-17,-3.566855e-17,-7.213769e-18,...,-2.714236e-32,1.085694e-32,1.075536e-32,-1.346573e-33,-5.627420e-33,-4.096312e-33,9.104031e-34,-2.295244e-33,2.086569e-32,2.714236e-32
335,2.764063e-17,3.504866e-20,-4.867683e-18,1.236641e-17,5.162547e-19,2.258268e-17,1.412733e-17,-1.253501e-17,-5.673531e-18,-1.252487e-17,...,-2.212007e-32,5.421585e-33,-4.783259e-34,2.691328e-33,-5.497437e-33,5.348249e-34,-3.380698e-33,-3.145508e-34,-1.984300e-32,5.231829e-33


In [21]:
df_concept = get_concept_by_document(df_tf_idf)
df_concept

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
0,4.259916e-01,-4.735769e-02,4.988374e-02,-6.605945e-01,4.057636e-01,-2.017974e-01,1.842499e-01,-2.013197e-01,1.133062e-01,-2.206752e-01,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,8.412690e+00,0.000000e+00
1,2.795415e-01,-1.019208e-01,-1.142461e-01,-9.587874e-02,1.670740e-01,1.105999e-01,-9.245931e-02,2.916502e-01,8.074129e-02,2.322764e-01,...,-9.524168e-16,-3.032755e-16,1.578148e-17,7.169224e-17,-6.919810e-17,2.895468e-17,-4.052101e-17,1.265118e-16,-2.120958e-15,-8.870188e-16
2,2.969608e-01,-2.414341e-02,7.620556e-02,3.244868e-02,1.313077e-02,-8.612612e-02,2.243156e-01,2.861063e-01,1.009107e-01,3.374352e-01,...,3.441197e-16,4.056753e-17,-2.336556e-17,3.213400e-17,-7.303946e-17,3.289132e-17,1.055218e-16,-6.212396e-17,2.127177e-15,1.388218e-15
3,2.867078e-01,-2.366230e-02,6.902352e-02,-2.197335e-01,1.269565e-01,1.081077e-01,1.544402e-03,-1.540779e-01,-4.869423e-02,3.573644e-01,...,-1.507740e-01,-5.016305e-01,-6.036712e-13,8.790665e-17,3.742111e-16,1.443813e-15,-1.852294e-15,4.918550e-16,-3.969495e-03,4.087100e-01
4,2.387406e-01,-5.002920e-02,-1.812412e-01,-3.067574e-02,1.084747e-02,2.279296e-01,2.142887e-01,5.804021e-01,-2.589619e-01,-4.967155e-02,...,5.008683e-01,8.810549e-02,-9.491592e-14,-9.181279e-16,-1.672780e-15,-9.319311e-16,-6.685654e-16,3.258684e-16,4.371110e-03,-9.148043e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,4.791039e-17,-1.933864e-17,-2.721532e-17,1.592894e-17,1.346780e-17,1.157638e-17,-1.736971e-17,-2.211827e-17,-4.700467e-18,8.475268e-18,...,5.180457e-18,1.090054e-17,4.773019e-30,9.948995e-32,-8.134035e-33,-3.628042e-32,-3.342736e-33,3.168764e-32,7.180716e-20,4.310753e-18
333,5.597801e-17,-2.181073e-17,-3.498932e-17,1.580752e-17,-2.538420e-17,4.331268e-17,-3.599048e-17,-4.945913e-17,-3.783020e-17,-1.494853e-17,...,-5.157048e-32,-2.171389e-32,-9.708295e-34,8.000336e-33,-7.149261e-33,-5.200060e-34,2.933033e-33,-8.376918e-33,1.081453e-33,-1.764253e-32
334,4.917577e-17,-2.263049e-17,-1.742810e-17,2.689728e-17,-9.367185e-18,2.541280e-17,-3.069375e-17,-2.305241e-17,-3.566855e-17,-7.213769e-18,...,-2.714236e-32,1.085694e-32,1.075536e-32,-1.346573e-33,-5.627420e-33,-4.096312e-33,9.104031e-34,-2.295244e-33,2.086569e-32,2.714236e-32
335,2.764063e-17,3.504866e-20,-4.867683e-18,1.236641e-17,5.162547e-19,2.258268e-17,1.412733e-17,-1.253501e-17,-5.673531e-18,-1.252487e-17,...,-2.212007e-32,5.421585e-33,-4.783259e-34,2.691328e-33,-5.497437e-33,5.348249e-34,-3.380698e-33,-3.145508e-34,-1.984300e-32,5.231829e-33


In [22]:
def cosine_similarity(x, y):
    '''Returns cosine similarity of two vectors.'''
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [29]:
def get_n_nearest(df_concept, i, n=None, min_sim=0):
    '''Returns most similar (column) vectors to `i`-th vector in `arr`.
    
    Parameters
    ----------
    df_concept : pd.DataFrame
    i : index of vector to be compared to
    n : return at most `n` vectors
    '''
    
    src_vector = df_concept[i].copy()
    df = df_concept.apply(func=cosine_similarity, axis=0, args=(src_vector, ))
    
    if n:
        # skip first value - the src_vector itself
        return df.sort_values(ascending=False)[1:n + 1]
    else:
        return df.sort_values(ascending=False)

In [50]:
best_match = get_n_nearest(df_concept, 2, 3)
best_match

3    0.424957
1    0.407875
4    0.309357
dtype: float64

In [56]:
doc = 2
display(bp_data.iloc[doc])
for i, similarity in get_n_nearest(df_concept, doc, 3).iteritems():
    display(bp_data.iloc[i])

author                                           William Koehrsen
claps                                                        2.8K
reading_time                                                   11
link            https://towardsdatascience.com/automated-featu...
title           Automated Feature Engineering in Python – Towa...
text            Machine learning is increasingly moving from h...
Name: 2, dtype: object

author                                               Gant Laborde
claps                                                        1.3K
reading_time                                                    7
link            https://medium.freecodecamp.org/machine-learni...
title           Machine Learning: how to go from Zero to Hero ...
text            If your understanding of A.I. and Machine Lear...
Name: 3, dtype: object

author                                                Conor Dewey
claps                                                        1.4K
reading_time                                                    7
link            https://towardsdatascience.com/python-for-data...
title           Python for Data Science: 8 Concepts You May Ha...
text            If you’ve ever found yourself looking up the s...
Name: 1, dtype: object

author                                           Emmanuel Ameisen
claps                                                         935
reading_time                                                   11
link            https://blog.insightdatascience.com/reinforcem...
title           Reinforcement Learning from scratch – Insight ...
text            Want to learn about applied Artificial Intelli...
Name: 4, dtype: object