In [1]:
### FORMAL DEMOS

In [4]:
## SEARCH ENGINE DEMOS
from pprint import pprint

import rewriter.search as search
# Re-execute the `rewriter.search` module so that edits made to it since the
# kernel first imported it take effect (`reload` is the Python 2 builtin).
reload(search)

import secure

# Build a search engine covering every edX course. Under the hood it uses
# the Whoosh library to index the course data stored in a CSV file and then
# runs searches against that index. Both locations are formed by appending
# a fixed suffix to the base paths held in the local `secure` module.
dataset_path = secure.DATASET_PATH_BASE + 'Master CourseListings - edX.csv'
index_path = secure.INDEX_PATH_BASE + 'edx'
# NOTE(review): create=False presumably reuses an index that already exists
# on disk instead of rebuilding it -- confirm against EdXSearchEngine.
edx_engine = search.EdXSearchEngine(
    dataset_path,
    index_path,
    create=False)

# The engine exposes a simple search API: hand it a query string and
# pretty-print whatever it returns (the recorded output below is a list of
# dicts with 'course_id' and 'name' keys).
edx_results = edx_engine.search("biology")
pprint(edx_results)

# HarvardX resources and Udacity courses can be searched the same way
# through similar engine classes.

[{'course_id': u'MITx/7.28.1x/1T2015', 'name': u'Molecular Biology'},
 {'course_id': u'course-v1:RICEx+AdvBIOx+3T2016',
  'name': u'AP\xc2\xae Biology'},
 {'course_id': u'course-v1:HarvardX+MCB64.1x+2T2016',
  'name': u'Cell Biology: Mitochondria'},
 {'course_id': u'course-v1:IEEEx+SysBio1x+2016_T1',
  'name': u'Introduction to Systems Biology.'},
 {'course_id': u'course-v1:IEEEx+SysBio1x+3T2016',
  'name': u'Introduction to Systems Biology.'},
 {'course_id': u'MITx/7.QBWx/2T2014',
  'name': u'Quantitative Biology Workshop'},
 {'course_id': u'MITx/7.QBW_1x/1T2015',
  'name': u'Quantitative Biology Workshop'},
 {'course_id': u'course-v1:MITx+20.305x+3T2015',
  'name': u'Principles of Synthetic Biology'},
 {'course_id': u'course-v1:MITx+7.QBWx_3+1T2016',
  'name': u'Quantitative Biology Workshop'},
 {'course_id': u'course-v1:MITx+7.QBWx_4+3T2016',
  'name': u'Quantitative Biology Workshop'}]


In [6]:
## QUERY REWRITING DEMOS

import gensim.models.word2vec as word2vec
import secure
import rewriter

# Query rewriting expands one search query into several related queries.
# Searching for *all* of them can return more results -- and more useful
# ones -- than the original query alone would.
#
# `pprint` is not imported here; this cell relies on the import made in the
# search engine demo cell.

# Rewriter 1: looks up terms related to the original term through the
# Wikipedia category API.
wiki_rewriter = rewriter.WikipediaRewriter()
wiki_rewritten_queries = wiki_rewriter.rewrite("newton's laws of motion")
pprint(wiki_rewritten_queries)


# Rewriter 2: uses Word2Vec, a machine learning algorithm trained on a
# large text corpus, to find words similar to the entered term.
# The corpus (from Wikipedia) is read from under the dataset base path.
corpus = word2vec.Text8Corpus(secure.DATASET_PATH_BASE + 'wikiclean8')
# Build the rewriter on top of that corpus.
# NOTE(review): create=False presumably loads a previously trained model
# rather than training a new one -- confirm against Word2VecRewriter.
w2v_rewriter = rewriter.Word2VecRewriter(
    corpus=corpus,
    create=False)
w2v_rewritten_queries = w2v_rewriter.rewrite("socialism")
pprint(w2v_rewritten_queries)

['classical mechanics', 'commons category with local link different than on wikidata', 'concepts in physics', 'copernican revolution', 'engvarb from july 2014', 'experimental physics', 'history of physics']
classical mechanics commons category with local link different than on wikidata concepts in physics copernican revolution engvarb from july 2014 experimental physics history of physics newton's laws of motion
['classical mechanics',
 'commons category with local link different than on wikidata',
 'concepts in physics',
 'copernican revolution',
 'engvarb from july 2014',
 'experimental physics',
 'history of physics',
 "newton's laws of motion"]
[u'capitalism',
 u'communism',
 u'liberalism',
 u'ideology',
 u'marxism',
 u'marxist',
 u'anarcho',
 u'conservatism',
 u'democracy',
 u'fascism',
 u'socialism']


In [9]:
## PUTTING IT ALL TOGETHER

# A query rewriter can be layered on top of a search engine. The result is
# a new search engine that runs the rewriter on every incoming search and
# returns all the results it gets back for the rewritten queries.
#
# This cell depends on globals created by the earlier demo cells:
# `search`, `pprint`, `edx_engine` and `w2v_rewriter`.

# Example: the Word2Vec rewriter applied to the edX search engine.
w2v_edx_engine = search.RewritingSearchEngine(
    w2v_rewriter,
    edx_engine)
rewritten_results = w2v_edx_engine.search("calculus")
pprint(rewritten_results)

[{'course_id': u'SchoolYourself/GeometryX/1T2015',
  'name': u'Introduction to Geometry'},
 {'course_id': u'course-v1:SchoolYourself+GeometryX+2T2016',
  'name': u'Introduction to Geometry'},
 {'course_id': u'course-v1:TsinghuaX+70240183x+3T2015',
  'name': u'Computational Geometry'},
 {'course_id': u'course-v1:TsinghuaX+70240183x+3T2016',
  'name': u'Computational Geometry'},
 {'course_id': u'course-v1:TeachForAmericaX+HSMATH2.1x+3T2016',
  'name': u'How to Teach High School Geometry'},
 {'course_id': u'BUx/Math226.1x/1T2015',
  'name': u'Introduction to Differential Equations'},
 {'course_id': u'course-v1:BUx+Math226.2x+2T2015',
  'name': u'Linear Differential Equations'},
 {'course_id': u'course-v1:BUx+Math226.1x+1T2016',
  'name': u'Introduction to Differential Equations'},
 {'course_id': u'course-v1:BUx+Math226.2x+1T2016',
  'name': u'Linear Differential Equations'},
 {'course_id': u'course-v1:BUx+Math226.3x+3T2015',
  'name': u'Nonlinear Differential Equations: Order and Chaos'},