In [1]:
import re

import pandas as pd
import numpy as np

# gensim
from gensim import corpora, models, similarities, matutils
import gensim

# sklearn
from sklearn.decomposition import PCA

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import plotly as py
import plotly.graph_objs as go
from plotly import offline
py.offline.init_notebook_mode(connected=True)

import colorlover as cl

import pickle

In [2]:
packages = (('Gensim', gensim), ('Regex', re),('Plotly',py))

for package in packages:
    print('{0} version: {1}'.format(package[0],package[1].__version__))
    
!Python -V

Gensim version: 3.3.0
Regex version: 2.2.1
Plotly version: 2.2.3
Python 3.6.4 :: Anaconda custom (64-bit)


# Get Models

In [3]:
# Load one pickled model per subject (statistics, mathematics, computer
# science). Presumably these are gensim Word2Vec models, going by the file
# names and the `.wv` attribute used later -- confirm against the notebook
# that produced them.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load .pkl files that come from a trusted source.
with open('Word2Vec_Stats.pkl','rb') as pickle_in:
    stats_model = pickle.load(pickle_in)
    
with open('Word2Vec_Math.pkl','rb') as pickle_in:
    math_model = pickle.load(pickle_in)
    
with open('Word2Vec_CS.pkl','rb') as pickle_in:
    cs_model = pickle.load(pickle_in)

In [4]:
def subject_similarities(model, word1, word2):
    '''
        Returns the similarity between word1 and word2 based on
        the given model. If one of the words is not in the model
        vocabulary the function will return None.

        model: object exposing `wv.similarity(word1, word2)`.
        word1, word2: the two words to compare.

        Only the KeyError raised for an out-of-vocabulary word is
        turned into None; any other error is left to propagate so
        real problems are not silently hidden.
    '''

    try:
        return model.wv.similarity(word1, word2)

    except KeyError:
        # at least one of the words is missing from the vocabulary
        return None

In [5]:
def in_top10(model, word1, word2):
    '''
        Sees if word2 is in the top ten most similar words for 
        word1.

        model: object exposing `wv.most_similar(word, topn=...)`, which
        returns a sequence of entries whose first item is the word.

        Returns 1 if word2 is one of those ten words and 0 otherwise.
        0 is also returned when word1 is not in the model's vocabulary
        (the lookup raises KeyError in that case), so a word that is
        missing from one subject does not abort a multi-subject comparison.
    '''
    try:
        top10 = model.wv.most_similar(word1, topn=10)
    except KeyError:
        # word1 is unknown to this model, so it has no neighbours at all
        return 0

    return int(any(word2 == entry[0] for entry in top10))

In [6]:
def get_similarities(word1, word2):
    '''
        Finds the similarities between word1 and word2 for all topics.

        Returns a 3x3 object array with one row per topic, in the order
        math, stats, cs. Each row holds:
            [0] the topic name,
            [1] the similarity score as a float, or None if one of the
                words is not in that model's vocabulary,
            [2] 1 if word2 is in the top ten most similar words for
                word1 in that model, else 0.

        The array uses dtype=object so the scores stay numeric instead
        of being converted to strings alongside the topic names.
    '''

    individual_models = {'math':math_model, 'stats':stats_model, 'cs':cs_model}

    rows = []

    for topic, model in individual_models.items():
        sim = subject_similarities(model, word1, word2)
        is_in_top10 = in_top10(model, word1, word2)

        # plain Python float keeps the value serialisable whatever numpy
        # float type the model returns
        score = None if sim is None else float(sim)

        rows.append([topic, score, is_in_top10])

    # build the array once rather than growing it with np.append
    return np.array(rows, dtype=object)

In [7]:
def word1_plot_title(word1):
    '''
        Replaces underscore with a space and capitalizes all words.

        word1: string such as 'step_size'.

        Returns the prettified string, e.g. 'Step Size'. Only the first
        character of each underscore-separated part is upper-cased; the
        rest of the part is left untouched. Empty parts (an empty string,
        doubled or leading/trailing underscores) are kept as empty parts
        instead of raising an IndexError.
    '''

    # part[:1] is '' for an empty part, where part[0] would raise
    return ' '.join(part[:1].upper() + part[1:] for part in word1.split('_'))

In [8]:
def similarity_plots(word1, test_words, word1_title):
    '''
        Plots the similarity scores for a set of words (test_words) against
        word1 for each topic.

        word1: the word every test word is compared with.
        test_words: non-empty sequence of words; one coloured trace is
            drawn per word.
        word1_title: text appended to the figure title.

        For each test word the rows returned by get_similarities are drawn
        as a line with markers: a star where the test word is in word1's
        top ten for that topic, a circle otherwise. The word itself is
        written next to the 'Computer Science' point, using the score in
        the third row of the get_similarities result.

        The figure is displayed with py.offline.iplot; nothing is returned.

        Raises ValueError if test_words is empty.
    '''

    if len(test_words) == 0:
        raise ValueError('test_words must contain at least one word')

    max_length = max(len(word) for word in test_words)

    # The 'Set1' qualitative scale is only available for 3 to 9 colours:
    # clamp the requested size to that range and cycle through the palette
    # when there are more words than colours.
    n_words = len(test_words)
    palette = cl.scales[str(min(max(n_words, 3), 9))]['qual']['Set1']
    colors = [palette[index % len(palette)] for index in range(n_words)]

    # topic key -> axis label; built once, it does not depend on the word
    pretty_words = {'math':'Mathematics', 'stats':'Statistics', 'cs':'Computer Science'}
    pw = np.vectorize(pretty_words.get)

    data = []
    annotations = []

    for index, word2 in enumerate(test_words):

        model_similarities = get_similarities(word1, word2)
        model_similarities[:,0] = pw(model_similarities[:,0])

        if len(word2) > 1:
            pretty_word2 = word2.replace('_', ' ')
        else:
            pretty_word2 = word2

        # third column is the 0/1 "in top ten" flag
        symbols = ['star' if int(flag) else 'circle'
                   for flag in model_similarities[:,2]]

        points = go.Scatter(
            x = model_similarities[:,0],
            y = model_similarities[:,1],
            mode = 'markers+lines',
            marker = dict(
                color = colors[index],
                symbol = symbols,
                size = 18
            ),
            line = dict(
                width = 3
            )
        )

        # label the trace to the right of its last (Computer Science) point
        annot = dict(
            showarrow = False,
            x = 'Computer Science',
            y = model_similarities[2,1],
            text = pretty_word2,
            xanchor = "left",
            xshift = max_length*1.1,
            font = dict(
                size = 18,
                color = colors[index]
            )
          )

        data.append(points)
        annotations.append(annot)

    layout = go.Layout(
        autosize=False,
        width=600,
        height=700,
        title='Similarities Scores for ' + word1_title,
        showlegend = False,
        titlefont=dict(
            size=25
        ),
        xaxis=dict(
            range = [-0.5,2.5],
            showgrid=True,
            zeroline=False,
            showticklabels=True,
            tickvals = [0,1,2],
            tickfont = dict(
                size = 18
            )
        ),
        yaxis=dict(
            range = [-0.1,1.1],
            showgrid=True,
            zeroline=True,
            showticklabels=True,
            tickfont = dict(
                size = 15
            )
        ),
        annotations = annotations
    )

    fig = go.Figure(data=data, layout=layout)

    py.offline.iplot(fig)

In [9]:
# Compare 'model' against a handful of modelling terms in each subject;
# the title is the prettified form of word1.
word1 = 'model'
test_words = ['regression','classification','cluster','fit','glm']
word1_title = word1_plot_title(word1)

similarity_plots(word1, test_words, word1_title)

2018-04-12 14:44:26,208 : INFO : precomputing L2-norms of word weight vectors


In [10]:
# Compare the bigram 'step_size' against two other underscore-joined tokens.
word1 = 'step_size'
test_words = ['learning_rate','parameter_λ']
word1_title = word1_plot_title(word1)
similarity_plots(word1, test_words, word1_title)

In [11]:
# Compare the symbol 'µ' against two candidate meanings. The raw symbol is
# passed as the title here; word1_plot_title is not applied.
word1 = 'µ'
test_words = ['measure','mean']

similarity_plots(word1, test_words, word1)

In [12]:
# Compare the symbol '∂' against related calculus terms; the raw symbol is
# used as the title.
word1 =  '∂'
test_words = ['partial','derivative','jacobian','rate']

similarity_plots(word1, test_words, word1)

In [13]:
# Compare 'log' against terms it commonly appears with; word1 is used as
# the title unchanged.
word1 =  'log'
test_words = ['loss','loglikelihood','likelihood','entropy','odds']

similarity_plots(word1, test_words, word1)

# Comparing Results Across Fields

## First Level

In [14]:
def get_top10_all(word1):
    '''
        Returns a DataFrame with the ten most similar words to word1,
        and their similarity scores, for each topic model side by side.

        For every model that knows word1 the result has two columns: one
        named after the topic ('math', 'stats' or 'cs') holding the words
        and one named 'similarity' holding the scores. A model whose
        vocabulary does not contain word1 (the lookup raises KeyError)
        contributes no columns. If no model knows the word the result is
        an empty DataFrame.
    '''
    topic_models = (('math', math_model), ('stats', stats_model), ('cs', cs_model))

    frames = []

    for topic, model in topic_models:
        try:
            neighbours = model.wv.most_similar(word1, topn=10)
        except KeyError:
            # word1 is not in this model's vocabulary: keep an empty frame
            # so the topic simply has no columns in the result
            frames.append(pd.DataFrame())
            continue

        frames.append(pd.DataFrame(neighbours, columns = [topic,'similarity']))

    return pd.concat(frames, axis=1)

In [16]:
# Mathematical symbols, Greek letters (plain and styled variants) and a few
# short tokens ('xi', 'st', 'rm×n') collected in one list.
# '⊤' and '𝜓' are separate entries: without the comma between them Python
# would join the two adjacent literals into the single string '⊤𝜓'.
# NOTE(review): '∀' appears twice; left as is in case the length matters.
symbols = ['𝜞', '𝝏', '∍', '𝛐', '𝝑', 'ι', 'Ν', '𝝌', '𝚪', 'Υ', '𝜑', 'Π', '𝜅', '𝜸', 'ϑ', '𝝆', '⎿', '𝛊', '𝜂',
              '𝛘', '𝛻', '𝝄', '𝝁', '𝝘', '𝜉', '𝛌', 'β', '𝝐', '𝜷', '≤', '𝜃', '𝜹', 'Λ', '∀', '𝛝', 'ς', '𝛿', '𝛏',
              '∀', '𝝍', 'Ω', '𝛛', '𝝙', '𝛄', '𝛚', '∈', '𝛞', '≪', '⇏', '𝛾', '𝝊', 'ǫ', 'θ', '∞', 'ζ', '𝜌',
              '∅', 'Γ', 'φ', '𝜿', '𝛽', '𝝕', '𝜚', '𝛟', '𝝓', 'ρ', 'λ', 'σ', '∪', '<', '𝛖', '𝛆', 'α', '𝛷', 'ο', 'χ',
              '𝜊', '𝝈', '≩', '𝛍', '↔', '𝛡', 'η', 'Μ', '⟾', '⊤', '𝜓', 'xi', '∴', '⊆', 'Η', '𝛃', 'Ζ', '𝜘',
              '⤃', '𝛠', 'κ', 'ξ', 'ω', '≨', 'ℝ', '𝜕', 'Ξ', 'Ψ', '≈', 'τ', '𝛎', '𝛑', '𝝂', '𝜽', '𝛥', '⇔', '𝛇', '∬',
              '⊃', '𝜒', '𝛂', '𝜙', '𝜼', 'δ', '𝜶', '𝛬', '>', '𝝒', '𝜈', '𝛈', '𝛹', 'Σ', 'ψ', '𝛋', '𝜏', 'Ι', '≡', 'μ',
              'ν', 'Α', '𝝔', '𝝀', '𝜐', '⤇', '𝛔', '⊇', '⇒', '𝝃', '𝜻', '𝛜', '⊉', '⊈', '∉', '≃', 'st', 'Ρ', '𝛓', '𝜋',
              '𝜆', 'Τ', '𝜇', '𝜀', 'µ', '𝜾', 'π', '∋', '⊂', '∫', 'Β', '⊅', 'Χ', '⊄', '𝜍', '∵', '𝜛', '±', '·', '𝛉',
              '𝜁', '𝝎', '𝛙', '∮', '𝝉', 'γ', '𝛼', '∆', 'Δ', '𝜄', '𝛗', 'Ο', '∩', 'Κ', '𝜺', '𝜗', 'rm×n', '𝜎', 'Θ',
              '∂', 'Φ', '∧', '𝜖', '𝛤', '𝝋', '𝝇', 'ε', 'Ε', 'υ', '∑', '𝛅', '𝛒', '≫', '𝝅', '𝛕' ]

In [18]:
# Nearest neighbours of 'linear_model' per subject. The stored output has no
# math columns, which is what get_top10_all produces when the math lookup fails.
get_top10_all('linear_model')

Unnamed: 0,stats,similarity,cs,similarity.1
0,example_consider,0.995987,dynamics_model,0.994234
1,piecewise_constant,0.995442,one_might,0.992155
2,using_empirical,0.993772,regression_model,0.99176
3,worth_mentioning,0.99371,given_dataset,0.991612
4,one_way,0.993283,simple_linear,0.991302
5,distribution_see,0.992212,paper_use,0.991263
6,probability_model,0.992202,figure_left,0.991066
7,general_model,0.992166,linear_regression,0.991006
8,starting_point,0.991965,small_subset,0.990673
9,scaling_factor,0.991549,much_like,0.990643


In [19]:
# Nearest neighbours of the token 'pλ' in each of the three subject models.
get_top10_all('pλ')

Unnamed: 0,math,similarity,stats,similarity.1,cs,similarity.2
0,qλ,0.831288,δµ,0.705,rλ,0.730541
1,aλ,0.803907,λσ,0.680993,fλ,0.717076
2,lλ,0.766888,πλ,0.664533,qλ,0.711838
3,˜λ,0.753195,´ωjh,0.664143,sλ,0.642061
4,sλ,0.750431,f·,0.655204,∃t,0.613401
5,χλ,0.742127,σλ,0.652292,♥,0.592571
6,hλ,0.740442,cλ,0.642184,dλ,0.589962
7,cλ,0.734365,λ−,0.641564,ψθ,0.580467
8,λ,0.733722,γnc,0.636235,λv,0.58
9,λd,0.731858,γkxi,0.63384,gλ,0.578712


In [20]:
# Similarity of 'linear_regression' and 'linear_model' in the statistics model.
subject_similarities(stats_model,'linear_regression','linear_model')

0.9799580440045753

In [21]:
# Same pair of words, this time in the computer science model.
subject_similarities(cs_model,'linear_regression','linear_model')

0.9910060688677611