In [1]:
import sys
sys.path.append('../')

from lxml import etree
import abc
import re
import requests
import pandas
import scipy
import numpy as np
from pprint import pprint

from gensim.models.phrases import *

import gensim.models as models
import gensim.models.word2vec as word2vec
import secure

# Local project modules. reload() (a Python 2 builtin) re-imports each one so
# that edits made to searchbetter on disk are picked up without restarting
# the kernel.
import searchbetter.search as search
reload(search)
import searchbetter.rewriter as rewriter
reload(rewriter)

# Plotting: graph_objs builds traces/layouts, plotly.offline renders them.
import plotly
import plotly.graph_objs as go
import plotly.offline as py

# Used further down to turn hex color codes into RGB triples.
import webcolors

# NOTE(review): presumably needed so py.iplot() can render inline - confirm.
py.init_notebook_mode()


<module 'searchbetter.rewriter' from '../searchbetter/rewriter.pyc'>

In [2]:
# Build the word2vec-based query rewriter from the model files that live
# under secure.MODEL_PATH_BASE.
# NOTE(review): create=False presumably loads an existing model instead of
# training a new one - confirm against Word2VecRewriter.
model_path = ''.join([secure.MODEL_PATH_BASE, 'word2vec/word2vec'])
w2v_rewriter = rewriter.Word2VecRewriter(
    model_path,
    create=False
)

In [3]:
# Sanity check: ask the word2vec rewriter for alternatives to a sample query.
w2v_rewriter.rewrite("New York")

[u'philadelphia',
 u'los angeles',
 u'london',
 u'boston',
 u'chicago',
 u'princeton',
 u'seattle',
 u'chicago illinois',
 u'san francisco',
 u'ny',
 u'New York']

In [4]:
# Search engine over the full edX course catalogue. Internally it relies on
# Python's Whoosh library: the course listings CSV is indexed and queries
# are then answered from that index.
dataset_path = ''.join([secure.DATASET_PATH_BASE, 'Master CourseListings - edX.csv'])
index_path = ''.join([secure.INDEX_PATH_BASE, 'edx'])
edx_engine = search.EdXSearchEngine(
    dataset_path,
    index_path,
    create=False
)

In [5]:
# Smoke test: run a sample query against the edX index.
edx_engine.search("chemistry")

[{'course_id': u'DavidsonX/D.001x/3T2014', 'name': u'Medicinal Chemistry'},
 {'course_id': u'course-v1:DavidsonX+D001x+3T2015',
  'name': u'Medicinal Chemistry'},
 {'course_id': u'course-v1:CooperUnion+Chem.1x+2T2016',
  'name': u'Preparing for CLEP Chemistry: Part 1'},
 {'course_id': u'course-v1:CooperUnion+Chem.2x+2T2016',
  'name': u'Preparing for CLEP Chemistry: Part 2'},
 {'course_id': u'MITx/3.091x_2/1T2014',
  'name': u'Introduction to Solid State Chemistry'},
 {'course_id': u'MITx/3.091x_3/3T2014',
  'name': u'Introduction to Solid State Chemistry'},
 {'course_id': u'course-v1:MITx+3.091x_4+1T2015',
  'name': u'Introduction to Solid State Chemistry'},
 {'course_id': u'course-v1:MITx+3.091x_5+3T2015',
  'name': u'Introduction to Solid State Chemistry'},
 {'course_id': u'course-v1:MITx+3.091x+3T2016',
  'name': u'Introduction to Solid State Chemistry'},
 {'course_id': u'CooperUnion/Chem.1x/1T2015',
  'name': u'Preparing for the AP* Chemistry Exam - Part 1'}]

## Testing

In [6]:
# Rewriters to compare against each other. The order matters: it has to line
# up with the "control", "wiki", "word2vec" columns of the results table.
control_rewriter = rewriter.ControlRewriter()
wiki_rewriter = rewriter.WikipediaRewriter()
rewriters = [control_rewriter, wiki_rewriter, w2v_rewriter]

In [7]:
def stats(term):
    """Return one results row for ``term``.

    The row is ``[term, n_1, n_2, ...]`` where ``n_i`` is the number of hits
    obtained with the i-th entry of the global ``rewriters`` list.
    """
    row = [term]
    for current_rewriter in rewriters:
        row.append(num_results(term, current_rewriter))
    return row
    
    
def num_results(term, rw):
    """Count the hits the edX engine returns for ``term`` using rewriter ``rw``.

    Side effect: ``rw`` is installed on the shared global ``edx_engine`` via
    ``set_rewriter`` and stays installed after this function returns.
    """
    edx_engine.set_rewriter(rw)
    return len(edx_engine.search(term))


# Load the benchmark queries, one per line. Surrounding whitespace (including
# a stray '\r' left by Windows line endings) is stripped, and blank lines are
# skipped so that an empty string is never sent to the search engine.
with open('../test/test-search-terms/generic.txt', 'r') as f:
    terms = [line.strip() for line in f if line.strip()]

# Run the searches only after the file has been closed. Each row is
# [term, control hits, wiki hits, word2vec hits], matching the columns below.
data = [stats(term) for term in terms]

df = pandas.DataFrame(columns=["term","control","wiki","word2vec"],data=data)

In [9]:
# Number of hits per search term for each rewriter (one row per term).
df

Unnamed: 0,term,control,wiki,word2vec
0,chemical reaction,0,10,6
1,psychology,10,10,58
2,cognitive bias,0,4,8
3,machine learning,10,20,10
4,cognition,1,11,18
5,einstein,3,3,4
6,biochemistry,1,9,47
7,statistics,10,20,16
8,cultural capital,0,0,0
9,lorentz,0,0,0


In [10]:
# Hits per term without rewriting; used as the x values of the scatter plot.
xs = list(df.loc[:, 'control'])

In [11]:
# Hits per term with the Wikipedia rewriter.
ys = list(df.loc[:, 'wiki'])

In [12]:
# Hits per term with the word2vec rewriter.
y2 = list(df.loc[:, 'word2vec'])

In [13]:

# Colors plotly uses, as hex codes:
# blue, orange, green, red, purple.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

# Convert every hex code into an 'rgb(r,g,b)' string,
# e.g. #FF0000 becomes 'rgb(255,0,0)'.
rgb_colors = list(map(webcolors.hex_to_rgb, colors))
color_strings = ['rgb(%s,%s,%s)' % tuple(rgb) for rgb in rgb_colors]

In [25]:
# reference: https://plot.ly/python/reference/#scattergl

# Baseline trace: the control hit counts plotted against themselves, which
# draws the y = x line (where a point lands if rewriting changes nothing).
traceControl = go.Scattergl(
    x=xs,
    y=xs,
    mode='lines',
    name='Control (no rewriting)',
    hoverinfo='text+name',
    line={'color': color_strings[0]}
)

def plotSeriesWithRegression(xs, ys, name, color):
    """Build the plotly traces for one data series and its line of best fit.

    Despite the name, nothing is drawn here: the traces are returned so the
    caller can combine them into a figure.

    Args:
        xs: sequence of numeric x values.
        ys: sequence of numeric y values, same length as ``xs``.
        name: legend label of the scatter trace; the regression trace is
            labelled ``name + ' linear regression'``.
        color: color applied to both the markers and the regression line.

    Returns:
        ``[scatter_trace, regression_trace]``. When ``xs`` contains fewer
        than two distinct values no line can be fitted, and only
        ``[scatter_trace]`` is returned.
    """
    # Imported locally and under an alias because (a) a bare ``import scipy``
    # is not guaranteed to load the ``scipy.stats`` submodule on older SciPy
    # releases, and (b) this notebook defines its own global ``stats``.
    from scipy import stats as scipy_stats

    # setting legend group lets user toggle the series AND its
    # line of best fit together
    legendgroup_name = '%s group' % name

    # the raw data points
    traceScatter = go.Scattergl(
        x = xs,
        y = ys,
        mode = 'markers',
        name = name,
        legendgroup = legendgroup_name,
        marker = dict(
            color = color,
            opacity = 0.5,
            size = 12,
            line = dict(
                width = 2,
                color = 'rgb(0, 0, 0)'
            )
        )
    )

    # A regression line needs at least two distinct x positions; without
    # them the slope is undefined, so only the scatter trace is returned.
    unique_xs = np.unique(xs)
    if unique_xs.size < 2:
        return [traceScatter]

    # Only slope and intercept (the first two result fields) are needed.
    slope, intercept = scipy_stats.linregress(xs, ys)[:2]
    line_of_best_fit = np.poly1d([slope, intercept])

    # Evaluate the fitted line once per distinct x (np.unique also sorts),
    # so the line is drawn left to right without doubling back.
    traceRegression = go.Scattergl(
        x = unique_xs,
        y = line_of_best_fit(unique_xs),
        mode = 'lines',
        name = name + ' linear regression',
        legendgroup = legendgroup_name,
        hoverinfo = 'text+name',
        line = dict(
            color = color
        )
    )

    return [traceScatter, traceRegression]


# Scatter + regression traces for each rewriter, both plotted against the
# control hit counts.
wikiTraces = plotSeriesWithRegression(xs, ys, name='Wikipedia Categories', color=color_strings[1])
w2vTraces = plotSeriesWithRegression(xs, y2, name='Word2Vec', color=color_strings[2])

# Trace order: control line first, then word2vec, then Wikipedia.
plot = [traceControl]
plot.extend(w2vTraces)
plot.extend(wikiTraces)

layout = go.Layout(
    title='Effect of query rewriting on search engine hits (edX)',
    xaxis={'title': '# hits before rewriting'},
    yaxis={'title': '# hits after rewriting'}
)

fig = go.Figure(data=plot, layout=layout)

# Render the figure inline in the notebook.
py.iplot(fig)

In [31]:
# more stats
rewriter_names =[
    'control',
    'wiki',
    'word2vec'
]
# series containing # of hits for each search term
data_series = [df[name] for name in rewriter_names]
average_hits = [s.mean() for s in data_series]

# now filter on just those terms where the control gives nothing

df_where_no_hits = df[df['control'] == 0]
data_series_zero = [df_where_no_hits[name] for name in rewriter_names]
average_hits_zero = [s.mean() for s in data_series_zero]


# bar chart of hits

# display labels, in the same order as rewriter_names
rewriter_fancy_names = [
    'Control (no rewriting)',
    'Wikipedia Categories',
    'Word2Vec'
]

# first trace: all search terms
traceAllTerms = go.Bar(
    x=rewriter_fancy_names,
    y=average_hits,
    name='All terms'
)
# second trace: only the terms the control finds nothing for
traceJustMisses = go.Bar(
    x=rewriter_fancy_names,
    y=average_hits_zero,
    name='Terms where no hits by default'
)

traces = [traceAllTerms, traceJustMisses]
layout = go.Layout(
    barmode='group',
    title='Comparing rewriter efficacy (edX)',
    # The bars take the rewriter names as x and the average hit counts as y,
    # so the axis titles have to follow that assignment (they were swapped).
    xaxis=dict(
        title='Query rewriter'
    ),
    yaxis=dict(
        title='Average # hits'
    )
)

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)