# SearchBetter demos

In [21]:
# First, let's get all the imports out of the way...

import sys
from pprint import pprint

import gensim.models.word2vec as word2vec
# `IPython.display` is the public home of these helpers; importing them
# from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML

# Make the parent directory importable. The membership check keeps
# sys.path from collecting a duplicate entry each time this cell is re-run.
if '../' not in sys.path:
    sys.path.append('../')

# reload() re-executes the already-imported local modules so that edits
# made to them are picked up without restarting the kernel.
import searchbetter.search as search
reload(search)
import searchbetter.rewriter as rewriter
reload(rewriter)

# Local module that supplies the dataset/index/model base paths used below.
import secure

## Making a search engine

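SearchBetter provides search engine classes that index a dataset (here, a CSV of course listings) and expose a simple search API.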

In [3]:


# Build a search engine covering every edX course. Under the hood,
# Python's Whoosh library indexes the course data stored in a CSV
# and then runs searches against that index.
index_path = secure.INDEX_PATH_BASE + 'edx'
dataset_path = secure.DATASET_PATH_BASE + 'Master CourseListings - edX.csv'

# NOTE(review): create=False presumably reuses an index already present
# at index_path instead of building one -- confirm in searchbetter.search.
edx_engine = search.EdXSearchEngine(
    dataset_path,
    index_path,
    create=False,
)

# The engine exposes a simple searching API: hand it a query string
# and pretty-print whatever comes back.
edx_results = edx_engine.search("biology")
pprint(edx_results)

# HarvardX resources or Udacity courses can be searched the same way
# with similar classes!

[{'course_id': u'MITx/7.28.1x/1T2015', 'name': u'Molecular Biology'},
 {'course_id': u'course-v1:RICEx+AdvBIOx+3T2016',
  'name': u'AP\xc2\xae Biology'},
 {'course_id': u'course-v1:HarvardX+MCB64.1x+2T2016',
  'name': u'Cell Biology: Mitochondria'},
 {'course_id': u'course-v1:IEEEx+SysBio1x+2016_T1',
  'name': u'Introduction to Systems Biology.'},
 {'course_id': u'course-v1:IEEEx+SysBio1x+3T2016',
  'name': u'Introduction to Systems Biology.'},
 {'course_id': u'MITx/7.QBWx/2T2014',
  'name': u'Quantitative Biology Workshop'},
 {'course_id': u'MITx/7.QBW_1x/1T2015',
  'name': u'Quantitative Biology Workshop'},
 {'course_id': u'course-v1:MITx+20.305x+3T2015',
  'name': u'Principles of Synthetic Biology'},
 {'course_id': u'course-v1:MITx+7.QBWx_3+1T2016',
  'name': u'Quantitative Biology Workshop'},
 {'course_id': u'course-v1:MITx+7.QBWx_4+3T2016',
  'name': u'Quantitative Biology Workshop'}]


In [4]:
## QUERY REWRITING DEMOS

# Query rewriting turns a single search query into several related
# queries. Searching for *all* of them can return more results, and
# more useful ones, than the original query alone would.

# First, a rewriter that uses the Wikipedia category API
# to find terms related to the original term.
wiki_rewriter = rewriter.WikipediaRewriter()
wiki_rewritten_queries = wiki_rewriter.rewrite("newton's laws of motion")
pprint(wiki_rewritten_queries)


# Second, a rewriter that uses Word2Vec to find similar
# words to the entered term. This is a machine learning
# algorithm trained on a large text corpus.

# Set to True to build the model from the corpus; leave False to use the
# model already stored at model_path. This replaces the old practice of
# commenting one constructor call in and the other out.
# NOTE(review): create=True presumably trains and saves a new model --
# confirm in searchbetter.rewriter before flipping the flag.
REBUILD_W2V_MODEL = False

# Prepare the corpus (from Wikipedia) to use for the Word2Vec Rewriter.
corpus = word2vec.LineSentence(secure.DATASET_PATH_BASE + 'wikiclean8')

# Now make the rewriter...
model_path = secure.MODEL_PATH_BASE+'word2vec/word2vec'
if REBUILD_W2V_MODEL:
    w2v_rewriter = rewriter.Word2VecRewriter(model_path, create=True, corpus=corpus, bigrams=True)
else:
    w2v_rewriter = rewriter.Word2VecRewriter(model_path, create=False)
w2v_rewritten_queries = w2v_rewriter.rewrite("socialism")
pprint(w2v_rewritten_queries)

['classical mechanics',
 'commons category with local link different than on wikidata',
 'concepts in physics',
 'copernican revolution',
 'engvarb from july 2014',
 'experimental physics',
 'history of physics',
 "newton's laws of motion"]
[u'communism',
 u'capitalism',
 u'ideology',
 u'fascism',
 u'liberalism',
 u'marxism',
 u'marxist',
 u'laissez faire',
 u'imperialism',
 u'nationalism',
 u'socialism']


In [12]:
## PUTTING IT ALL TOGETHER

# Attaching a query rewriter to a search engine yields an engine that
# runs the rewriter on every incoming search and returns all the
# results it gets from the rewritten queries.

# Example: the Word2Vec rewriter on top of the edX search engine.
# Note that set_rewriter() is called on the existing edx_engine object,
# so searches made with edx_engine after this cell use the rewriter too.
edx_engine.set_rewriter(w2v_rewriter)
rewritten_results = edx_engine.search("calculus")

# Only preview the first 18 hits to keep the output short.
pprint(rewritten_results[:18])

[{'course_id': u'CaltechX/BEM1105x/1T2015',
  'name': u'Pricing Options with Mathematical Models'},
 {'course_id': u'course-v1:CaltechX+BEM1105x+3T2015',
  'name': u'Pricing Options with Mathematical Models'},
 {'course_id': u'course-v1:CaltechX+BEM1105x+1T2016',
  'name': u'Pricing Options with Mathematical Models'},
 {'course_id': u'course-v1:CaltechX+BEM1105x+3T2016',
  'name': u'Pricing Options with Mathematical Models'},
 {'course_id': u'SchoolYourself/GeometryX/1T2015',
  'name': u'Introduction to Geometry'},
 {'course_id': u'course-v1:SchoolYourself+GeometryX+2T2016',
  'name': u'Introduction to Geometry'},
 {'course_id': u'course-v1:TsinghuaX+70240183x+3T2015',
  'name': u'Computational Geometry'},
 {'course_id': u'course-v1:TsinghuaX+70240183x+3T2016',
  'name': u'Computational Geometry'},
 {'course_id': u'course-v1:TeachForAmericaX+HSMATH2.1x+3T2016',
  'name': u'How to Teach High School Geometry'},
 {'course_id': u'SchoolYourself/AlgebraX/1T2015',
  'name': u'Introduction to

In [19]:


# The HTML escaper lives in `html` on Python 3 and in `cgi` on Python 2.
try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape

# Placeholder thumbnail shown next to every result.
tmp_img_url = "https://lh3.googleusercontent.com/s2S7Q8NyH4OlJ8Evfgdm08DDn9xyT6gUsbxZd3eN9Fpr9p_QAnZZfocSbgFG0uwvBQC4vElFS_zJ5btSRg=s0#w=1440&h=780"

# One result entry: thumbnail, then "<rank>. <name>" and "(<course id>)".
# Attribute values are quoted: the image URL contains '=' and '&', and an
# unquoted width=6%/> would absorb the '/' into the attribute value.
template = """
    <div>
        <div>
            <img class="pull-left" src="{}" width="6%"/>
        </div>
        <div style="padding-left: 2cm;">
            <h5>{}. {}</h5>({})
        </div>
    </div>
"""


def _html_text(value):
    """Return `value` as ASCII-only text that is safe to place in HTML.

    Markup characters (&, <, > and quotes) are escaped so the value cannot
    break or inject markup. Non-ASCII characters become numeric character
    references (for example &#233;), which the browser renders as the
    original character, instead of being replaced by '?'.
    """
    escaped = html_escape(value, True)
    # Encode then decode so the result is text (not bytes) on Python 3
    # and still plain ASCII on Python 2.
    return escaped.encode('ascii', 'xmlcharrefreplace').decode('ascii')


display(HTML('<div>Total Number of Results {}</div>'.format(len(rewritten_results))))
for i, r in enumerate(rewritten_results):
    display(HTML(template.format(
        _html_text(tmp_img_url),
        i + 1,
        _html_text(r['name']),
        _html_text(r['course_id']),
    )))

In [13]:
# Sample course title used to see what ASCII encoding does to non-ASCII
# text. The literal holds two non-ASCII code points, \xc3 and \xa1.
# NOTE(review): these look like the UTF-8 bytes of an accented 'a' that
# were decoded one byte per character -- confirm where the text is read.
uni = u'Bases matem\xc3\xa1ticas: Algebra'

In [18]:
# With the 'replace' error handler, every character that ASCII cannot
# represent is substituted with '?', so both non-ASCII code points in
# `uni` come out as '??' and the original characters are lost.
uni.encode('ascii', 'replace')

'Bases matem??ticas: Algebra'