In [51]:
import os
import json

import array
import collections
import io
import itertools
import cPickle as pickle

import scipy.sparse as sp
import numbers

import pandas as pd
import numpy as np

import gensim
from gensim.models import Word2Vec
from glove import Glove
import nltk

from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model

import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Path to the pre-trained GloVe vectors, relative to the notebook's working
# directory (judging by the file name: the 6B-token corpus, 50 dimensions).
glove_model_filepath = os.path.join(
    "../models",
    "glove.6B.50d.txt",
)

In [3]:
class GloveExtended(Glove):
    """
    Glove model extended with helpers for pre-trained Stanford vectors.

    Adds three things on top of ``Glove``:

    * ``load_stanford`` -- build an instance from the plain-text vector
      file written by the Stanford GloVe C code.
    * ``most_similar_to_word_vector`` -- nearest-neighbour query that
      takes a raw vector instead of a word.
    * ``similarity`` -- cosine similarity between two words of the model.
    """

    @classmethod
    def load_stanford(cls, filename):
        """
        Load model from the output files generated by
        the C code from http://nlp.stanford.edu/projects/glove/.
        The entries of the word dictionary will be of type
        unicode in Python 2 and str in Python 3.

        Each line is split on single spaces: the first token is the word,
        the remaining tokens are parsed as floats. A word is mapped to the
        (0-based) number of the line it was read from.

        Parameters:
        - filename: path of the UTF-8 encoded text file to read.

        Returns an instance of ``cls`` with ``no_components``,
        ``word_vectors``, ``word_biases`` (all zeros) and the word
        dictionary populated.

        Raises ValueError if the file contains no lines at all.
        """

        dct = {}
        vectors = array.array('d')
        entries = None

        # Read in the data.
        with io.open(filename, 'r', encoding='utf-8') as savefile:
            for i, line in enumerate(savefile):
                tokens = line.split(' ')

                word = tokens[0]
                entries = tokens[1:]

                dct[word] = i
                vectors.extend(float(x) for x in entries)

        # Without this guard an empty file would surface as a NameError
        # on ``entries`` below instead of a meaningful error.
        if entries is None:
            raise ValueError('No word vectors found in %r' % (filename,))

        # Infer word vectors dimensions (taken from the last line read).
        no_components = len(entries)
        no_vectors = len(dct)

        # Set up the model instance. ``cls()`` rather than a hard-coded
        # class name, so subclasses get an instance of their own type.
        instance = cls()
        instance.no_components = no_components
        instance.word_vectors = (np.array(vectors)
                                 .reshape(no_vectors,
                                          no_components))
        instance.word_biases = np.zeros(no_vectors)
        instance.add_dictionary(dct)

        return instance

    def most_similar_to_word_vector(self, word_vector, number=5):
        """
        Run a similarity query against the model with a raw vector.

        Parameters:
        - word_vector: the query vector, forwarded unchanged to
          ``_similarity_query``.
        - number: forwarded unchanged to ``_similarity_query``.

        Returns the result of ``_similarity_query(word_vector, number)``
        with its first element removed.

        Raises Exception if the model has no word vectors or no
        dictionary.

        NOTE(review): dropping the first hit mirrors a word-based query,
        where the best match is presumably the query word itself. For an
        arbitrary vector (e.g. a mean of several words) the first hit may
        be a legitimate neighbour -- confirm this is intended.
        """
        if self.word_vectors is None:
            raise Exception('Model must be fit before querying')

        if self.dictionary is None:
            raise Exception('No word dictionary supplied')

        return self._similarity_query(word_vector, number)[1:]

    def similarity(self, word1, word2):
        """
        Cosine similarity between the vectors of two words of this model.

        Parameters:
        - word1, word2: words looked up in ``self.dictionary``.

        Returns the single value produced by ``cosine_similarity`` for
        the two vectors.

        A word missing from the dictionary makes the lookup fail
        (presumably with KeyError, the dictionary being a plain dict).
        """
        # Look the words up on *this* instance; relying on a notebook
        # global would break (or silently use another model) for any
        # instance bound to a different name.
        vector1 = self.word_vectors[self.dictionary[word1]]
        vector2 = self.word_vectors[self.dictionary[word2]]
        # cosine_similarity expects 2-D inputs: one row per sample.
        vector1 = vector1.reshape(1, -1)
        vector2 = vector2.reshape(1, -1)
        return cosine_similarity(vector1, vector2)[0][0]

In [4]:
%%time 
# Parse the Stanford text file into an in-memory model; this is the slow step
# of the notebook, which is why the cell is timed.
glv = GloveExtended.load_stanford(filename=glove_model_filepath)

CPU times: user 39.8 s, sys: 1.16 s, total: 41 s
Wall time: 41.5 s


In [5]:
def glove_mean_vector(word_list, model=None):
    """
    Average the word vectors of the given words.

    Parameters:
    - word_list: iterable of words; each one must be a key of
      ``model.dictionary``.
    - model: object exposing ``dictionary`` (word -> row index) and
      ``word_vectors`` (indexable by row index). Defaults to the
      notebook-level ``glv`` model, which keeps existing one-argument
      calls working.

    Returns a 1-D numpy array: the element-wise mean of the vectors.

    Raises ValueError if ``word_list`` is empty. A word missing from the
    dictionary makes the lookup fail (KeyError for a plain dict).
    """
    if model is None:
        # Fall back to the model loaded earlier in the notebook.
        model = glv

    vector_list = [model.word_vectors[model.dictionary[word]]
                   for word in word_list]
    if not vector_list:
        raise ValueError('word_list must contain at least one word')

    return np.mean(np.stack(vector_list), axis=0)

In [6]:
# Average the vectors of a one-word list, then query the model with that
# vector; the bare last expression is what the cell displays.
mean_vector = glove_mean_vector(word_list=["mexico"])
glv.most_similar_to_word_vector(word_vector=mean_vector, number=10)

[(u'mexican', 0.8550674735062006),
 (u'venezuela', 0.84968989901002756),
 (u'colombia', 0.849031767897101),
 (u'peru', 0.84464826419836103),
 (u'chile', 0.84392901369166462),
 (u'puerto', 0.83626278721957459),
 (u'rico', 0.81946955303500257),
 (u'cuba', 0.81252054023253284),
 (u'guatemala', 0.81138109161210081),
 (u'panama', 0.80967559123222543),
 (u'brazil', 0.80768006021078875),
 (u'costa', 0.80469082550556659),
 (u'bolivia', 0.79278940477367765),
 (u'ecuador', 0.79041663921967487),
 (u'argentina', 0.7792276485929357),
 (u'rica', 0.77619745108727212),
 (u'honduras', 0.77205183034845315),
 (u'nicaragua', 0.76766200804852081),
 (u'salvador', 0.7545959658101794),
 (u'spain', 0.75137645100707895),
 (u'philippines', 0.75002143680699229),
 (u'dominican', 0.73882111230848269),
 (u'san', 0.73853777921996433),
 (u'paraguay', 0.73149171475091945),
 (u'california', 0.73001444734832066),
 (u'uruguay', 0.72562287128822289),
 (u'chilean', 0.71203996543230963),
 (u'francisco', 0.71055504193806407),

In [143]:
def get_all_connections_query(model,query_word,number=5,similarity_cut = 0.25):
    """
    Build a node/link graph of a word and its nearest neighbours.

    Parameters:
    - model: object exposing ``most_similar(word, number=...)`` (returning
      items whose first element is a word) and ``similarity(word1, word2)``.
    - query_word: the word the graph is built around.
    - number: forwarded to ``model.most_similar``.
    - similarity_cut: despite the name, an upper bound on the *distance*
      ``1 - similarity``; only pairs strictly below it are linked.

    Returns a dict with two keys:
    - "nodes": list of ``{"name": word}``, neighbours first (in query
      order), the query word last.
    - "links": list of ``{"source": i, "target": j, "value": distance}``
      where ``i`` and ``j`` are positions in the "nodes" list.
    """
    query = model.most_similar(query_word, number=number)
    all_words = [item[0] for item in query] + [query_word]

    # Assign each distinct word its position in ``nodes_list``. The links
    # refer to nodes by position, so both structures must be filled in the
    # same order (iterating a dict to build the nodes does not guarantee
    # that), and repeated words must not leave gaps in the numbering.
    all_words_dict = {}
    nodes_list = []
    for word in all_words:
        if word not in all_words_dict:
            all_words_dict[word] = len(nodes_list)
            nodes_list.append({"name": word})

    unique_words = [node["name"] for node in nodes_list]

    all_links_list = []
    for word1, word2 in itertools.combinations(unique_words, r=2):
        # Plain float so the result serializes with json whatever numeric
        # type the model returns.
        distance = float(1 - model.similarity(word1, word2))

        if distance < similarity_cut:
            all_links_list.append({"source": all_words_dict[word1],
                                   "target": all_words_dict[word2],
                                   "value": distance
                                  })

    final_dict = {"nodes": nodes_list,
                  "links": all_links_list
                 }
    return final_dict

In [146]:
# Graph around "people": its neighbours from the model, linked wherever the
# cosine distance is below 0.15.
word_distance_pairs = get_all_connections_query(
    model=glv,
    query_word="people",
    number=30,
    similarity_cut=0.15,
)

In [147]:
# Serialize the node/link graph to the JSON file in the d3_visualization
# directory.
output_file = os.path.join(
    "../d3_visualization",
    "word_distance.json",
)
with open(output_file, 'w') as fp:
    fp.write(json.dumps(word_distance_pairs))