In [None]:
import random

import gensim
from gensim import matutils
import numpy as np
from numpy import dot
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the Google News word2vec vectors from disk (stored in the binary
# word2vec format, hence binary=True).
# NOTE(review): newer gensim releases deprecate this loader in favour of
# gensim.models.KeyedVectors.load_word2vec_format -- confirm against the
# gensim version this notebook is pinned to.
google_model = gensim.models.Word2Vec.load_word2vec_format(
    '../models/google-vectors.w2v',
    binary=True,
)

In [None]:
# What do we know about this model?  The shape of the raw vector matrix is
# unpacked as (number of vocabulary entries, vector dimensionality).
vocab_size, dim = google_model.syn0.shape
print("vocab: {}\ndepth: {}".format(vocab_size, dim))

In [None]:
# useful functions
def concept_distance(term1, term2):
    """Return the cosine distance between two terms' model vectors.

    The distance is ``1 - cosine_similarity`` of the vectors that
    ``google_model`` holds for `term1` and `term2`, rounded to five
    decimal places.
    """
    term_vectors = [google_model[term1], google_model[term2]]
    # cosine_similarity returns the full 2x2 pairwise matrix; the
    # off-diagonal entry [0, 1] is the term1-vs-term2 value.
    pairwise = 1 - cosine_similarity(term_vectors)
    return np.round(pairwise[0][1], 5)

# return vocab index
def vidx(term):
    """Return the ``index`` attribute stored on `term`'s vocabulary entry."""
    vocab_entry = google_model.vocab[term]
    return vocab_entry.index

# rank the whole vocabulary by similarity to the queried term
def get_distances(term):
    """Return all vocabulary indices ordered from most to least similar to `term`.

    Similarity is the dot product of every row of ``google_model.syn0norm``
    with the row belonging to `term`.  Despite the function's name, the
    returned array holds vocabulary indices (the descending argsort of those
    dot products), not the similarity or distance values themselves.
    """
    # gensim only fills in syn0norm once init_sims() has run (most_similar
    # triggers it implicitly).  Calling it here keeps this helper usable on a
    # fresh kernel before any most_similar query has been issued.
    google_model.init_sims()
    query_vector = google_model.syn0norm[vidx(term)]
    similarities = dot(google_model.syn0norm, query_vector)
    return matutils.argsort(similarities, reverse=True)

# locating binary terms?
def get_binary(term):
    """Return the vocabulary word that `get_distances` ranks last for `term`.

    Because `get_distances` sorts in descending order, the last index is the
    entry with the smallest dot product against `term`.
    """
    ranked_indices = get_distances(term)
    last_index = ranked_indices[-1]
    return google_model.index2word[last_index]

In [None]:
# concept cluster: top ranked words from the labMT sentiment lexicon
# (word,score):

#laughter,8.50
#happiness,8.44
#love,8.42
#happy,8.30
#laughed,8.26
#laugh,8.22
#laughing,8.20
#excellent,8.18
#laughs,8.18
#joy,8.16
#successful,8.16

# NOTE(review): "laugh" and "joy" appear in the list above but are not part
# of the query below -- confirm whether that omission is intentional.
google_model.most_similar(
    positive=[
        "laughter", "happiness", "love", "happy", "laughed",
        "laughing", "excellent", "laughs", "successful",
    ],
    topn=25,
)

In [None]:
# Gather the twenty-five nearest neighbours of the positive seed terms, along
# with their vectors, as input for the plots and clustering that follow.
response = google_model.most_similar(
    positive=[
        "laughter", "happiness", "love", "happy", "laughed",
        "laughing", "excellent", "laughs", "successful",
    ],
    topn=25,
)

# The first element of each result is the neighbouring word.
words = [hit[0] for hit in response]

# Look vectors up with google_model[...] like every other cell in this
# notebook, instead of going through `.wv`, so the notebook relies on a single
# access path to the model.
neighbor_list = [google_model[word] for word in words]

In [None]:
from sklearn.manifold import MDS
# Select the style *before* creating the figure: style settings are picked up
# when a figure/axes is created, so switching afterwards leaves the first
# plot in the default look.
plt.style.use('ggplot')

# Project the neighbours into two dimensions with MDS over their pairwise
# cosine distances (random_state fixed so the layout is reproducible).
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
dist_matrix = 1 - cosine_similarity(neighbor_list)

pos = mds.fit_transform(dist_matrix)

xs, ys = pos[:, 0], pos[:, 1]

fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("MDS: Neighboring Terms")
ax.scatter(xs, ys, marker='^')
# Label every point with its word, nudged 3 points away from the marker.
for i, w in enumerate(words):
    ax.annotate(w, xy=(xs[i], ys[i]), xytext=(3, 3),
                textcoords='offset points', ha='left', va='top')
plt.show()

In [None]:
# Select the style *before* creating the figure so that this plot is styled
# even when the cell is the first plotting cell run in the kernel.
plt.style.use('ggplot')

# Project the neighbour vectors onto their first two principal components.
pca = PCA(n_components=2)

plot_data = pca.fit_transform(neighbor_list)
xs, ys = plot_data[:, 0], plot_data[:, 1]

fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("PCA: Neighboring Terms")
ax.scatter(xs, ys, marker='^')
# Label every point with its word, nudged 3 points away from the marker.
for i, w in enumerate(words):
    ax.annotate(w, xy=(xs[i], ys[i]), xytext=(3, 3),
                textcoords='offset points', ha='left', va='top')
plt.show()

In [None]:
# kmeans clustering of terms into three groups (why 3? I don't know)
import nltk
from nltk.cluster import KMeansClusterer
from sklearn import cluster
from sklearn import metrics

In [None]:
# Cluster the neighbour vectors into three groups with NLTK's k-means using
# cosine distance, keeping the best of 25 randomised trials.  A seeded rng is
# passed so the cluster assignments are reproducible across kernel restarts.
kmeans = KMeansClusterer(3, distance=nltk.cluster.util.cosine_distance,
                         repeats=25, rng=random.Random(1))
clusters = kmeans.cluster(neighbor_list, assign_clusters=True)

In [None]:
# Print each neighbouring term alongside the cluster id it was assigned.
words = list(words)
for i, word in enumerate(words):
    print("{}:{}".format(word, clusters[i]))
 

In [None]:
# Fit scikit-learn's k-means with three clusters on the neighbour vectors.
# random_state is fixed so labels and centroids are reproducible.
kmeans = cluster.KMeans(n_clusters=3, random_state=1)
kmeans.fit(neighbor_list)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# asarray avoids making a needless copy of the centroid matrix.
centers = np.asarray(centroids)

# Only the first two coordinates of each centroid are drawn here; the
# centroids live in the full vector space of the model, so this is a slice of
# that space rather than a projection of it.
plt.scatter(centers[:, 0], centers[:, 1], marker="x", color='r')
plt.title("KMeans centroids (first two vector components)")
plt.xlabel("vector component 0")
plt.ylabel("vector component 1")
plt.show()

In [None]:
# now extract the least happy terms from labMT (word,score):
#died,1.56
#kill,1.56
#killed,1.56
#cancer,1.54
#death,1.54
#murder,1.48
#terrorism,1.48
#rape,1.44
#suicide,1.30
#terrorist,1.30

# Same positive seed terms that were used for the most_similar queries.
pos_terms = [
    "laughter", "happiness", "love", "happy", "laughed",
    "laughing", "excellent", "laughs", "successful",
]

neg_terms = [
    "died", "kill", "killed", "cancer", "death", "murder",
    "terrorism", "rape", "suicide", "terrorist",
]

# One model vector per term: all positives first, then all negatives.
vectors = [google_model[term] for term in pos_terms + neg_terms]

In [None]:
# Split the positive and negative seed terms into two groups with NLTK's
# k-means using cosine distance, keeping the best of 25 randomised trials.
# A seeded rng is passed so the cluster assignments are reproducible.
kmeans = KMeansClusterer(2, distance=nltk.cluster.util.cosine_distance,
                         repeats=25, rng=random.Random(1))
clusters = kmeans.cluster(vectors, assign_clusters=True)

# `vectors` was built from pos_terms + neg_terms, so the same concatenation
# lines each word up with its cluster id.
words = pos_terms + neg_terms
for i, word in enumerate(words):
    print(word + ":" + str(clusters[i]))

In [None]:
# Fit scikit-learn's k-means with two clusters on the positive/negative term
# vectors.  random_state is fixed so labels and centroids are reproducible.
kmeans = cluster.KMeans(n_clusters=2, random_state=1)
kmeans.fit(vectors)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# asarray avoids making a needless copy of the centroid matrix.
centers = np.asarray(centroids)

# Only the first two coordinates of each centroid are drawn here; the
# centroids live in the full vector space of the model, so this is a slice of
# that space rather than a projection of it.
plt.scatter(centers[:, 0], centers[:, 1], marker="x", color='r')
plt.title("KMeans centroids (first two vector components)")
plt.xlabel("vector component 0")
plt.ylabel("vector component 1")
plt.show()