### Pipeline 2 - search

In [1]:
import os
os.chdir('../')

In [2]:
import pandas as pd

In [3]:
import lib.database_module as dm
from lib import encoding_module as enc
import lib.wiki_module as wiki

### Parse: Encode search as document vector

In [4]:
# our transformer will come from this pickle
from sklearn.externals import joblib
transformer = joblib.load('data/vectorizer.pkl')

In [5]:
# search term will be 'corvette'
search_term_document_vector = enc.get_searchterm_vector(transformer, 'corvette')

In [6]:
search_term_document_vector['corvette']

array([[  2.55546930e-02,  -1.34660137e-02,   1.20314309e-03,
          6.33055554e-05,   2.80353981e-03,  -4.24428246e-04,
          7.62486237e-04,   4.30924644e-03,  -1.38994415e-03,
         -1.24656545e-02,   1.68548878e-03,   3.54784132e-04,
          1.61818561e-03,  -3.13810516e-03,  -3.38471621e-02,
          1.38734374e-02,   1.56945002e-02,  -2.07773646e-02,
         -1.59068552e-02,  -6.51045051e-03,   3.11099606e-03,
         -3.69780723e-03,   1.05792774e-02,  -4.20017555e-04,
         -5.11306752e-02,   2.24581361e-02,  -2.57011523e-02,
         -7.98875334e-05,   1.19648852e-02,  -8.81497920e-03,
         -1.31379273e-02,  -1.79805479e-02,   1.20017991e-03,
          9.91852576e-03,   1.63893769e-03,   3.59213783e-03,
         -2.21954752e-02,   7.92397248e-03,  -1.28308386e-02,
         -1.49624702e-03,  -2.37494943e-02,   1.47708113e-02,
          1.55827902e-02,   1.63395414e-02,  -2.07372421e-02,
         -4.35050592e-03,   2.03832640e-02,  -6.78920883e-03,
        

### Mine: Select document vectors for all pages from database

In [7]:
page_vectors = dm.select_all_page_vectors()

Connected to server joshuacook.me.


In [8]:
indices = [tup[0] for tup in page_vectors]
vectors = [tup[1] for tup in page_vectors]

In [9]:
page_vectors_df = pd.DataFrame(vectors, index=indices)

In [10]:
page_vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
4341789,0.092992,-0.040985,0.006952,0.000243,0.001681,0.004999,0.00343,0.003523,-0.005129,-0.001911,...,-0.021003,-0.039528,0.027716,0.021033,-0.034768,-0.022294,0.031537,-0.00971,-0.010331,0.025944
48201744,0.021301,0.027559,-0.003917,-0.010877,0.013055,0.000934,0.000128,0.010317,0.001444,-0.009792,...,-0.05504,0.00011,0.057817,0.005156,0.058477,0.008807,0.043527,-0.011693,0.064738,0.051296
2514975,0.124983,0.153025,0.026264,-0.257191,-0.210183,0.074847,0.003862,-0.009666,-0.065705,-0.280972,...,-0.022703,0.015894,-0.002476,0.004992,-0.006653,-0.017394,-0.008525,0.017188,-0.001596,0.00012
35135520,0.191357,-0.039003,-0.006018,0.006389,-0.001106,-0.004748,0.015829,0.005612,0.011846,0.028248,...,0.000578,-0.004204,0.030916,0.010484,-0.012454,0.02605,0.038744,0.015332,-0.008053,0.009506
27303975,0.034292,0.039499,-0.006095,-0.015156,0.026112,-0.039749,-0.08144,-0.036546,0.005076,0.018881,...,0.030056,-0.026965,-0.002426,0.042282,0.024082,0.004498,-0.009517,-0.005315,-0.003784,-0.010202


### Model: Find five most similar documents based on document vectors

In [11]:
# NearestNeighbors will give you the 5 (by default) nearest neighbors
from sklearn.neighbors import NearestNeighbors

In [12]:
this_NN = NearestNeighbors()
this_NN.fit(page_vectors_df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [13]:
Nearest_Neighbors_result = this_NN.kneighbors(search_term_document_vector['corvette'])
Nearest_Neighbors_result

(array([[ 0.69334471,  0.72892273,  0.74019499,  0.7528829 ,  0.75419463]]),
 array([[1172,  483,  247,  566, 1448]]))

In [14]:
# This will give us an array of the 5 most similar documents, by index number. Recall
# however, that our index numbers are really our page_ids
Nearest_Neighbors_result[1]

array([[1172,  483,  247,  566, 1448]])

### Mine: Retrieve text for the five most similar documents

In [15]:
# Make a table to match the 1) array indices returned from nearest neighbors to our 2) page_id
# indexed DataFrame (page_vectors_df)
lookup_table = pd.DataFrame(page_vectors_df.index, columns=['page_id'])

In [16]:
lookup_table.head()

Unnamed: 0,page_id
0,4341789
1,48201744
2,2514975
3,35135520
4,27303975


In [17]:
nn_indices = Nearest_Neighbors_result[1].tolist()
nn_indices

[[1172, 483, 247, 566, 1448]]

In [18]:
vmask = [True if i in nn_indices[0] else False for i in lookup_table.index]

In [19]:
Nearest_Neighbors_pages = lookup_table['page_id'][vmask]
Nearest_Neighbors_pages

247     18007329
483     13846997
566     38485157
1172     8181732
1448      630169
Name: page_id, dtype: int64

In [20]:
counter = 0
for page_id in Nearest_Neighbors_pages:
    wiki_response = wiki.query_page(str(page_id))
    print wiki_response['text']
    counter += 1 # for test purposes only, just to make sure I tried 5 times to get text
    print "Attempt #", counter, "-", "page_id: ", page_id

Karate terms come almost entirely from Japanese. The following terms are not exclusive to karate. They appear during its study and practice, varying depending on style and school. Karate terms include: C Chito-ryu D Dan – Dojo G Gi – Goju-ryu - H Hajime – Heian K Karate – Kata – Kihon – Kohai – Kumite – Kyū O Osu R Rei S Senpai – Sensei – Shihan – Shotokan – Sōke W Waza Z Zanshin See also References External links
Attempt # 1 - page_id:  18007329
The Covin is a replica kit car of the Porsche 911 Turbo created by Tim Cook and Nick Vincent in the early 1980s. The name Covin came about from CO (Cook) and VIN (Vincent) giving us COVIN Performance Mouldings. Early models of the Covin were based on a shortened Beetle floorpan running gear but later used its own Covin chassis and VW Type 3 running gear. The company was sold in the 1990s to DAX and later moved to new owners GPC and was relocated to County Galway in Ireland where unfortunately up to now the Covin has not been produced again. Th

In [165]:
# Internal Note for myself: for future reference, instead of the lookup_table and vmask,
# a simpler option to access the page_ids is to do:
# page_vectors_df.index[1247] <- The Nearest_Neighbors_result[1] returned an array of index
# positions (1247, 2185, etc.). This means that we're interested in the page_id at the
# 1247th and 2185th index. Using the page_vectors_df, which has page_ids as it's index,
# we can call page_vectors_df.index to get an array of all the page_ids in order, and then
# use the 1247 to get the 1247th entry in the page_vectors_df.index, which will return a
# page_id
page_vectors_df.index[1247]

1702260