### Pipeline 2 - search

In [1]:
import os
os.chdir('../')

In [2]:
import pandas as pd

In [3]:
import lib.database_module as dm
from lib import encoding_module as enc
import lib.wiki_module as wiki

### Parse: Encode search as document vector

In [4]:
# our transformer will come from this pickle
from sklearn.externals import joblib
transformer = joblib.load('data/vectorizer.pkl')

In [5]:
# search term will be 'corvette'
search_term_document_vector = enc.get_searchterm_vector(transformer, 'corvette')

In [8]:
search_term_document_vector['corvette']

array([[  2.55546930e-02,  -1.34660137e-02,   1.20314309e-03,
          6.33055554e-05,   2.80353981e-03,  -4.24428246e-04,
          7.62486237e-04,   4.30924644e-03,  -1.38994415e-03,
         -1.24656545e-02,   1.68548878e-03,   3.54784132e-04,
          1.61818561e-03,  -3.13810516e-03,  -3.38471621e-02,
          1.38734374e-02,   1.56945002e-02,  -2.07773646e-02,
         -1.59068552e-02,  -6.51045051e-03,   3.11099606e-03,
         -3.69780723e-03,   1.05792774e-02,  -4.20017555e-04,
         -5.11306752e-02,   2.24581361e-02,  -2.57011523e-02,
         -7.98875334e-05,   1.19648852e-02,  -8.81497920e-03,
         -1.31379273e-02,  -1.79805479e-02,   1.20017991e-03,
          9.91852576e-03,   1.63893769e-03,   3.59213783e-03,
         -2.21954752e-02,   7.92397248e-03,  -1.28308386e-02,
         -1.49624702e-03,  -2.37494943e-02,   1.47708113e-02,
          1.55827902e-02,   1.63395414e-02,  -2.07372421e-02,
         -4.35050592e-03,   2.03832640e-02,  -6.78920883e-03,
        

### Mine: Select document vectors for all pages from database

In [11]:
page_vectors = dm.select_all_page_vectors()

Connected to server joshuacook.me.


In [19]:
indices = [tup[0] for tup in page_vectors]
vectors = [tup[1] for tup in page_vectors]

In [20]:
page_vectors_df = pd.DataFrame(vectors, index=indices)

In [21]:
page_vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
344083,0.275585,-0.101265,0.007115,-0.001592,0.006335,0.00768,-0.001214,-0.004784,0.002477,-0.007817,...,0.032629,0.004709,-0.005903,0.008758,0.014177,0.010986,0.000801,-0.012796,-0.005982,-0.003761
45260809,0.187593,-0.053228,0.016399,0.002273,-0.007791,0.005352,-0.005889,0.000993,0.014205,-0.000349,...,0.032435,-0.04457,-0.007897,0.013027,0.038891,0.004053,0.028509,-0.046108,-0.014064,-0.002065
4341789,0.093037,-0.03854,-0.000791,0.000941,0.004827,0.003893,0.00315,-0.005049,-0.001734,0.004833,...,0.009803,0.056048,-0.022399,0.02912,-0.022184,-0.02089,0.063562,-0.022862,-0.01002,0.014199
2514975,0.124438,0.166967,-0.215827,-0.247178,0.057838,0.006808,-0.010344,-0.057885,-0.280471,-0.048702,...,-0.024003,0.018549,0.02522,0.016304,-0.009877,-0.017131,-0.001509,-0.017671,-0.001048,-0.003444
35135520,0.191219,-0.031979,0.011209,-0.00091,-0.0051,0.015066,0.00534,0.012332,0.027897,0.030451,...,0.009617,-0.050009,-0.000679,-0.003436,-0.046443,-0.040477,-0.016854,0.018762,0.008802,-0.048635


### Model: Find five most similar documents based on document vectors

In [22]:
# NearestNeighbors will give you the 5 (by default) nearest neighbors
from sklearn.neighbors import NearestNeighbors

In [23]:
this_NN = NearestNeighbors()
this_NN.fit(page_vectors_df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [43]:
Nearest_Neighbors_result = this_NN.kneighbors(search_term_document_vector['corvette'])
Nearest_Neighbors_result

(array([[ 0.65739035,  0.65739035,  0.65739035,  0.65739035,  0.65739035]]),
 array([[1247, 2185, 1869, 2026, 1259]]))

In [44]:
# This will give us an array of the 5 most similar documents, by index number. Recall
# however, that our index numbers are really our page_ids
Nearest_Neighbors_result[1]

array([[1247, 2185, 1869, 2026, 1259]])

### Mine: Retrieve text for the five most similar documents

In [100]:
# Make a table to match the 1) array indices returned from nearest neighbors to our 2) page_id
# indexed DataFrame (page_vectors_df)
lookup_table = pd.DataFrame(page_vectors_df.index, columns=['page_id'])

In [101]:
lookup_table.head()

Unnamed: 0,page_id
0,344083
1,45260809
2,4341789
3,2514975
4,35135520


In [134]:
nn_indices = Nearest_Neighbors_result[1].tolist()
nn_indices

[[1247, 2185, 1869, 2026, 1259]]

In [137]:
vmask = [True if i in nn_indices[0] else False for i in lookup_table.index]

In [141]:
Nearest_Neighbors_pages = lookup_table['page_id'][vmask]
Nearest_Neighbors_pages

1247     1702260
1259    32692637
1869    22928159
2026     5791134
2185    52108912
Name: page_id, dtype: int64

In [154]:
counter = 0
for page_id in Nearest_Neighbors_pages:
    wiki_response = wiki.query_page(str(page_id))
    print wiki_response['text']
    counter += 1 # for test purposes only, just to make sure I tried 5 times to get text
    print "Attempt #", counter, "-", "page_id: ", page_id


Attempt # 1 - page_id:  1702260

Attempt # 2 - page_id:  32692637

Attempt # 3 - page_id:  22928159

Attempt # 4 - page_id:  5791134

Attempt # 5 - page_id:  52108912


In [165]:
# Internal Note for myself: for future reference, instead of the lookup_table and vmask,
# a simpler option to access the page_ids is to do:
# page_vectors_df.index[1247] <- The Nearest_Neighbors_result[1] returned an array of index
# positions (1247, 2185, etc.). This means that we're interested in the page_id at the
# 1247th and 2185th index. Using the page_vectors_df, which has page_ids as it's index,
# we can call page_vectors_df.index to get an array of all the page_ids in order, and then
# use the 1247 to get the 1247th entry in the page_vectors_df.index, which will return a
# page_id
page_vectors_df.index[1247]

1702260