In [2]:
# This notebook looks at a simple way to use pre-trained Neural Network embeddings using the PyMagnitude package

# BEFORE YOU START
# You need to have gone to this URL : https://github.com/plasticityai/magnitude#pre-converted-magnitude-formats-of-popular-embeddings-models
# downloaded some vectors of your choice (these files can be massive so it might take a while), and stored
# them somewhere you can access (e.g. with the file path I specify below).

# The results below are using the fastText 'Medium' set of 1M words, with 300-dimensional embeddings,
# trained on English wikipedia

# YOU MAY ALSO
# need to have run the data_preparation.R script if you lack the `sentences.csv` file

In [3]:
from pymagnitude import *
import re
import numpy as np
import pandas as pd
from sklearn import preprocessing
from nltk.corpus import stopwords
import time

In [4]:
# Widen the column display limit so long sentences are not cut off in DataFrame output.
# The full option key is used: pandas resolves shorthand keys by pattern matching,
# which can break if a similarly named option is added in a later version.
pd.set_option('display.max_colwidth', 500)

In [5]:
%%time
# Load the pre-trained embeddings from a local .magnitude file.
# Replace the placeholder path below with the location of the file you downloaded.
vectors = Magnitude("/here/you/put/path/to/magnitude_file.magnitude")

CPU times: user 787 ms, sys: 21.8 ms, total: 809 ms
Wall time: 810 ms


In [None]:
#some things we can do with magnitude vectors for words

In [9]:
# Euclidean distance between the two (unit-length) word vectors: the value shown is
# consistent with sqrt(2 - 2 * cosine similarity) for the similarity reported further down
vectors.distance("biscuit","macaroon")

0.8671641

In [22]:
# Passing a list of words returns one distance per word in the list
vectors.distance("biscuit",["macaroon", "cabbage"])

[0.8671641, 1.0519347]

In [11]:
# Cosine similarity between the two word vectors (larger = more similar)
vectors.similarity("biscuit","macaroon")

0.62401325

In [21]:
# Passing a list of words returns one similarity per word in the list
vectors.similarity("biscuit",["macaroon", "cabbage"])

[0.62401325, 0.4467167]

In [20]:
# Pick whichever candidate in the list is most similar to the first word (using cosine similarity)
vectors.most_similar_to_given("biscuit", ["macaroon", "cabbage"])

'macaroon'

In [19]:
# Odd one out: the word in the list that fits least well with the others
vectors.doesnt_match(["biscuit", "macaroon", "cabbage"])

'cabbage'

In [23]:
# The topn (here 10) most similar words in the vocabulary, each paired with its similarity score
vectors.most_similar("biscuit", topn = 10)

[('biscuits', 0.8173666000366211),
 ('biscuity', 0.6973364353179932),
 ('cake', 0.6754744052886963),
 ('chocolate', 0.6582432985305786),
 ('oatcake', 0.6574128866195679),
 ('teacake', 0.6451709866523743),
 ('loaf', 0.6432216167449951),
 ('cornflake', 0.6331499814987183),
 ('flapjack', 0.6270235776901245),
 ('shortbread', 0.6263179779052734)]

In [6]:
# Analogy by vector arithmetic: Paris + Germany - France = Berlin (the top hit in the output)
vectors.most_similar(positive=['Paris', 'Germany'], negative=['France'])

[('Berlin', 0.7935939431190491),
 ('Munich', 0.7534011006355286),
 ('Frankfurt', 0.7376378774642944),
 ('Cologne', 0.7260650992393494),
 ('Stuttgart', 0.7239525318145752),
 ('Leipzig', 0.7191416025161743),
 ('Vienna', 0.7057973146438599),
 ('Hamburg', 0.7021979093551636),
 ('Frankfurt-am-Main', 0.6996399164199829),
 ('Düsseldorf', 0.6979357004165649)]

In [7]:
# Same analogy in the other direction (capital -> country): England + Baghdad - London = Iraq
vectors.most_similar(positive=['England', 'Baghdad'], negative=['London'])

[('Iraq', 0.7341831922531128),
 ('Kuwait', 0.6448562145233154),
 ('Mosul', 0.6440438628196716),
 ('Basra', 0.6172256469726562),
 ('Iraq--and', 0.6111494302749634),
 ('Al-Anbar', 0.6111178398132324),
 ('Baghdady', 0.6097682118415833),
 ('Al-Basrah', 0.6095267534255981),
 ('Al-Najaf', 0.6082150936126709),
 ('Basrah', 0.6069812774658203)]

In [8]:
# It doesn't always work: Canberra + Germany - Australia = East Germany
# (the expected answer, Berlin, only comes second in the output)
vectors.most_similar(positive=['Canberra', 'Germany'], negative=['Australia'])

[('East-Germany', 0.684023916721344),
 ('Berlin', 0.6813000440597534),
 ('Bonn', 0.6783359050750732),
 ('West-Germany', 0.6640989780426025),
 ('Germany-', 0.6633050441741943),
 ('Stuttgart', 0.6480668187141418),
 ('Dresden', 0.6451014280319214),
 ('Munich', 0.643798828125),
 ('Potsdam-Babelsberg', 0.6396905183792114),
 ('Freiburg', 0.6354193687438965)]

In [9]:
# This embeds a search phrase or report sentence in our 300-dimensional vector space
# by simply averaging over the words in the phrase
# I have no doubt there are better ways of doing this, e.g. incorporating tf-idf weightings
def embed_phrase(phrase):
    """Embed a phrase as the L2-normalised mean of its word vectors.

    The phrase is stripped of punctuation, lower-cased and split on
    whitespace. Words that are not in ``vectors`` or that appear in NLTK's
    English stopword list are dropped, and the vectors of the remaining
    words are averaged.

    Parameters
    ----------
    phrase : str
        Text to embed. Anything that is not a string (e.g. a missing value
        in a DataFrame column) is mapped to the zero vector.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vectors.dim``. It is scaled to unit length so
        that the dot product of two embeddings is their cosine similarity;
        when no usable word remains it is all zeros instead.
    """
    dim = vectors.dim  # follow the loaded model instead of assuming 300 dimensions

    # Non-string input cannot be tokenised: fall back to the zero vector
    if not isinstance(phrase, str):
        return np.zeros(dim)

    input_nopunc = re.sub(r'[^\w\s]', '', phrase)  # take out punctuation
    input_lower = input_nopunc.lower().split()  # make lower case and split by word

    # Build the stopword set once per call rather than re-reading the list for
    # every word. The corpus file id is lowercase "english"; a capitalised
    # spelling only resolves on case-insensitive filesystems.
    stop_words = set(stopwords.words("english"))

    # Keep only words that are not stopwords and that the embedding model knows
    # (the cheap set lookup runs first so stopwords never hit the model)
    clean_search = [word for word in input_lower
                    if word not in stop_words and word in vectors]
    if not clean_search:
        return np.zeros(dim)  # no words remain, so there is nothing to average

    # Mean of the vectors of the words that remain
    unnorm_vector = np.mean(vectors.query(clean_search), axis=0)
    # Normalise to length 1 so we can use dot products for cosine similarity
    norm_array = preprocessing.normalize(unnorm_vector.reshape(1, -1), norm='l2')
    return norm_array[0]

# this allows you to search a phrase and compare it to a set of comparison sentences
# again, no doubt this could be greatly improved.
def search_phrase(phrase, comparison_set, topn=None):
    """Rank the rows of ``comparison_set`` by similarity to ``phrase``.

    The phrase is embedded with ``embed_phrase`` and its dot product is taken
    with every entry of the ``'vectors'`` column (larger dot product = more
    similar).

    Parameters
    ----------
    phrase : str
        The search phrase.
    comparison_set : pandas.DataFrame
        Must have a ``'vectors'`` column holding one embedding per row, as
        produced by ``embed_phrase``. It is not modified.
    topn : int, optional
        If given, only the ``topn`` best matches are returned. The default
        (``None``) returns every row.

    Returns
    -------
    pandas.DataFrame
        A copy of ``comparison_set`` with an added ``'search_results'`` column
        of scores, sorted from most to least similar.
    """
    # Nothing to compare against: return an empty result with the score column
    if len(comparison_set) == 0:
        return comparison_set.assign(search_results=np.array([], dtype=float))

    query_vector = embed_phrase(phrase)
    # Stack the per-row embeddings into a matrix with one row per sentence
    sentence_matrix = np.array(comparison_set['vectors'].tolist())
    scores = sentence_matrix @ query_vector

    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain (or have overwritten) a 'search_results' column on every search
    ranked = (comparison_set
              .assign(search_results=scores)
              .sort_values(by='search_results', ascending=False))
    return ranked if topn is None else ranked.head(topn)

In [49]:
# Example: embed a single phrase (the printed array is cut short in the saved output below)
embed_phrase("jaffa cakes are my favourite and I like them the best")

array([-6.28026947e-03, -2.43652239e-02,  1.32176944e-03, -9.69787687e-03,
       -3.74229029e-02, -8.92745480e-02, -2.56897439e-03, -1.03370681e-01,
       -6.57415316e-02,  1.29279522e-02,  8.24017625e-04,  7.51992390e-02,
        2.91189346e-02, -2.35687736e-02,  5.70440330e-02,  4.56786007e-02,
        1.60301358e-01,  1.96943642e-03,  1.33410349e-01,  3.72173935e-02,
        2.48401333e-03,  7.24656135e-02,  6.67128386e-03,  4.35236953e-02,
        3.62036712e-02, -1.57978833e-02,  5.85241467e-02, -2.75252834e-02,
        4.37867455e-02, -3.61571088e-02,  1.13285063e-02, -2.19578911e-02,
       -1.92831922e-02,  7.66812358e-03,  3.62293907e-02, -5.31622693e-02,
        9.04731266e-03, -8.52390379e-03,  1.10962684e-03,  4.68735360e-02,
       -2.17354670e-02, -1.00070961e-01, -9.81302857e-02, -9.39529669e-03,
        4.40335227e-03, -2.58727856e-02,  1.92488786e-02, -5.05256131e-02,
       -1.49174258e-02,  3.19914962e-03, -1.35877803e-02,  1.67603744e-03,
        2.21653134e-02, -

In [10]:
# Read in all sentences from our NLP guide (if you lack 'sentences.csv' then run `data_preparation.R`).
# NOTE(review): the 'Unnamed: 0' column in the output looks like row numbers saved with the CSV - confirm.
corpus = pd.read_csv("sentences.csv")
corpus.head()

Unnamed: 0.1,Unnamed: 0,ID,sentence
0,1,FeatureSelection.md_1,* Decisions about how to do these things are usually made by trial and error.
1,2,FeatureSelection.md_2,"This is usually easily accomplished - in R, for example, there are various different ways of doing it but it's always just one line of code, or a parameter set in a function."
2,3,FeatureSelection.md_3,"Despite the practical ease of achieving this, it still needs thought."
3,4,FeatureSelection.md_4,"You can choose to replace the punctuation marks with a space, or remove them altogether."
4,5,FeatureSelection.md_5,"On the other hand, removing punctuation altogether can lead to problems in cases where someone has hyphenated a word."


In [11]:
# Embed every sentence in the corpus: this adds a 'vectors' column to `corpus` in place,
# holding one embedding array per sentence
corpus['vectors'] = np.array(corpus['sentence'].apply(embed_phrase))
corpus.head()

Unnamed: 0.1,Unnamed: 0,ID,sentence,vectors
0,1,FeatureSelection.md_1,* Decisions about how to do these things are usually made by trial and error.,"[-0.0080329105, 0.021076743, 0.0016090235, 0.015615334, 0.021448795, -0.02895572, -0.017227316, -0.16568217, -0.015879735, -0.045628887, -0.027818492, -0.08743536, 0.015253517, -0.01839747, -0.00083485665, 0.021957552, 0.14317317, -0.002015149, 0.10568141, 0.014507441, -0.00872977, 0.017498098, -0.012503439, 0.116140865, -0.048947755, 0.02268431, -0.009576736, 0.0074026776, 0.11766725, -0.012650943, -0.007363485, 0.0065476163, -0.03638299, -0.106922016, 0.036052905, 0.020986998, -0.002827829..."
1,2,FeatureSelection.md_2,"This is usually easily accomplished - in R, for example, there are various different ways of doing it but it's always just one line of code, or a parameter set in a function.","[0.013527644, -0.012912958, 0.05678807, -0.0023535287, -0.031319316, -0.03373001, 0.0012797157, -0.18278837, -0.038471613, -0.054697327, -0.031186085, -0.09244956, -0.019320857, 0.009983517, -0.02046032, 0.045013547, 0.10419684, 0.03064537, 0.11806083, 0.02091864, 0.0040642424, 0.015343196, -0.010522759, 0.11600212, 0.011793437, 0.023224153, -0.008294441, 0.039412495, 0.08627377, 0.016949164, -0.039776232, -0.01686298, -0.0023626043, -0.10091015, 0.03590936, -0.046496466, -0.01643459, 0.0212..."
2,3,FeatureSelection.md_3,"Despite the practical ease of achieving this, it still needs thought.","[-0.020053409, -0.0069089155, -0.0060613644, -0.022930084, -0.034667023, -0.008358551, -0.036972728, -0.17744663, -0.033453833, 0.00873448, -0.048085753, -0.0862697, 0.014174514, -0.016834512, -0.0024806417, -0.00915952, 0.1015824, 0.02559705, 0.102265805, 0.033969667, -0.021966875, 0.002373617, 0.06554223, 0.12357432, -0.025039257, -0.012214568, 0.009082507, -0.011551237, 0.09926328, 0.00316169, -0.023433667, -0.017739069, 0.004649915, -0.10267782, 0.021853343, 0.029765343, 0.0019963442, -0..."
3,4,FeatureSelection.md_4,"You can choose to replace the punctuation marks with a space, or remove them altogether.","[0.03999276, 0.03196642, 0.05058594, 0.005573745, -0.01834618, 0.02007063, 0.028969927, -0.17134686, -0.02729807, -0.054477774, -0.04154321, -0.033670098, -0.018778903, 0.02571134, -0.020630216, 0.043483913, 0.04301242, 0.038266823, 0.10943258, 0.014354443, 0.004463497, 0.00018632211, 0.008865451, 0.10899704, -0.036367856, 0.0109212445, 0.01170619, 0.03975891, 0.025309594, 0.024196852, -0.07005064, -0.0064878585, 0.030491501, -0.03255476, 0.021842735, -0.081583425, -0.00013621786, 0.01781558..."
4,5,FeatureSelection.md_5,"On the other hand, removing punctuation altogether can lead to problems in cases where someone has hyphenated a word.","[0.0051377737, 0.006850067, 0.021656906, 0.0019045039, -0.021693604, 0.030433074, -0.004772003, -0.18376674, -0.045084, -0.01355956, -0.013582825, -0.04761095, 0.016942099, -0.0065597547, -0.0031950774, 0.026664509, 0.046471883, 0.062809065, 0.094349496, -0.0068967273, 0.012712798, -0.0002191907, 0.020930024, 0.11781901, -0.046308387, 0.012336634, 0.0047920467, 0.07330619, 0.11551752, 0.0053112754, -0.005649141, -0.024537914, 0.018353172, -0.079487145, 0.01152881, -0.06505355, -0.08125301, 0..."


In [13]:
# Now we can search: rank the corpus against a query phrase and show the
# ID and sentence of (at most) the 20 best matches
search_phrase("this is how we can search through our documents", corpus)[['ID','sentence']][:20]

Unnamed: 0,ID,sentence
38,Glossary.md_21,"If you are doing work on *Search* or *Topics*, the *document*s will be the objects which you will be finding similarities between in order to group them topically."
25,Glossary.md_8,The set of text *document*s that you are analysing.
34,Glossary.md_17,The vector for a *document* points in the directions of the concepts that *document* contains.
21,Glossary.md_4,A catch-all term for a group of algorithms that aim to collect *documents* into clusters.
65,Glossary.md_48,Words routinely removed from *document*s at an early stage of the analysis.
92,Intro.md_8,Finding parts of the text that are about a particular topic of interest (e.g. allow the user to search for parts of the text that are about biscuits).
198,NNmodels.md_31,"We can search the parameter (and meta-parameter) space, compute the test results, and use the model that gives us the best answers."
209,README.md_2,"* [Search through a set of documents](Search.md) * [Find topics in a set of documents](Topics.md) * [Feature Selection](FeatureSelection.md) * [Latent Semantic Analysis (LSA)](LSA.md) * [Latent Dirichlet Allocation (LDA)](LDA.md) * [Word2Vec, Doc2Vec, fastText (Neural Network models)](NNmodels.md) * There is also R code for LSA and LDA accessible in `code/NLP-guidance."
79,Glossary.md_62,The scheme by which we go from a vector of counts of each word in the *vocabulary* for a given document to an *embedding*.
73,Glossary.md_56,"This function claims that its input must be > ...a document-term matrix ... containing *documents in colums, terms in rows*... (emphasis mine)."
