In [1]:
from gensim.models import Word2Vec

import numpy as np
import os
from random import shuffle
import re


import urllib.request
import zipfile
import lxml.etree


# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve(
        "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip",
        filename="ted_en-20160408.zip",
    )

# For now, we're only interested in the subtitle text, so let's extract that from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    # Open the archive member in its own `with` so its handle is closed as soon
    # as parsing finishes instead of lingering until garbage collection.
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)

# Concatenate the text of every <content> element, one element per line.
input_text = '\n'.join(doc.xpath('//content/text()'))
# The parsed tree is no longer needed once the text has been extracted.
del doc

# Drop every parenthesised span from the transcript text.
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# Optional prefix of at most 20 non-colon characters followed by ':'
# (presumably a speaker label -- confirm against the data); the rest of the
# line is captured as `postcolon`. Every group is optional, so the pattern
# matches any single line.
line_pattern = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')

# Split each line's post-colon text on '.' and keep the non-empty pieces.
sentences_strings_ted = [
    sent
    for line in input_text_noparens.split('\n')
    for sent in line_pattern.match(line).group('postcolon').split('.')
    if sent
]

# Release the large intermediate strings.
del input_text, input_text_noparens

# Tokenise: lowercase each sentence, turn every run of characters outside
# [a-z0-9] into a single space, then split on whitespace.
non_alnum_run = re.compile(r"[^a-z0-9]+")
sentences_ted = [
    non_alnum_run.sub(" ", sent_str.lower()).split()
    for sent_str in sentences_strings_ted
]

# Train word vectors on the tokenised sentences with min_count=10.
model_ted = Word2Vec(sentences_ted, min_count=10)
# NOTE(review): init_sims(replace=True) presumably overwrites the raw vectors
# with their normalised form -- confirm against the installed gensim version.
model_ted.init_sims(replace=True)

In [16]:
animals = ['cat', 'dog', 'elephant', 'giraffe', 'bear', 'wolf', 'fox', 'penguin', 'snake', 'chicken']
furniture = ['table', 'chair', 'sofa', 'stool', 'bed', 'closet', 'shelf', 'desk', 'mirror', 'cabinet']
places = ['zoo', 'park', 'cinema', 'cafe', 'restaurant', 'hospital', 'gallery', 'museum', 'library', 'pharmacy']


class PCA:
    """Minimal numpy stand-in for ``matplotlib.mlab.PCA``.

    ``matplotlib.mlab.PCA`` no longer exists in current matplotlib releases,
    so importing it makes this cell fail. This class is modelled on that
    helper and keeps the attributes the notebook reads (``Wt`` and ``Y``).

    Parameters
    ----------
    a : 2-D array with at least as many rows as columns.
    standardize : when true (default) every column is divided by its standard
        deviation after the column mean is subtracted.

    Attributes
    ----------
    mu, sigma : per-column mean and standard deviation of ``a``.
    a : the centred (and optionally standardised) data.
    Wt : the ``Vh`` factor of the SVD of ``self.a``.
    Y : ``self.a`` projected onto the rows of ``Wt``.
    s : squared singular values.
    fracs : each entry of ``s`` divided by their sum.

    Raises
    ------
    RuntimeError if ``a`` has fewer rows than columns.
    """

    def __init__(self, a, standardize=True):
        n, m = a.shape
        if n < m:
            raise RuntimeError('we assume data in a is organized with numrows>numcols')
        self.numrows, self.numcols = n, m
        self.mu = a.mean(axis=0)
        self.sigma = a.std(axis=0)
        self.standardize = standardize

        a = self.center(a)
        self.a = a

        U, s, Vh = np.linalg.svd(a, full_matrices=False)
        self.Wt = Vh
        self.Y = np.dot(Vh, a.T).T
        self.s = s ** 2
        variances = self.s / len(s)
        self.fracs = variances / variances.sum()

    def center(self, x):
        """Subtract the stored column means; also divide by sigma when standardising."""
        if self.standardize:
            return (x - self.mu) / self.sigma
        return x - self.mu

    def project(self, x, minfrac=0.):
        """Project ``x`` onto the components whose ``fracs`` entry is >= ``minfrac``."""
        x = np.asarray(x)
        if x.shape[-1] != self.numcols:
            raise ValueError('Expected an array with dims[-1]==%d' % self.numcols)
        Y = np.dot(self.Wt, self.center(x).T).T
        mask = self.fracs >= minfrac
        if x.ndim == 2:
            return Y[:, mask]
        return Y[mask]


# Despite the "animals_" prefix these hold all three categories, in the order
# animals, furniture, places. A KeyError here means a word is missing from the
# model's vocabulary.
animals_words = animals + furniture + places
animals_vec = [model_ted.wv[word] for word in animals_words]

# After the transpose each column is one word and each row one embedding
# dimension, which satisfies the rows >= columns requirement of PCA above.
# NOTE(review): with this layout Wt[k][i] is the weight of word i in component
# k, not a projected coordinate; a conventional word-level PCA would keep the
# words as rows -- confirm this is the intended quantity.
animals_vec = np.array(animals_vec).transpose()

animals_pca = PCA(animals_vec)

# Number of columns of the projected data (one per word).
print(len(animals_pca.Y[0]))

# One line per word: weight in the first component, weight in the second, word.
for i, word in enumerate(animals_words):
    print("%f,%f,%s" % (animals_pca.Wt[0][i], animals_pca.Wt[1][i], word))
    

30
-4.308300,-0.197988,cat
-0.420302,1.015423,dog
-0.503546,0.874350,elephant
0.834504,1.603340,giraffe
-1.244697,0.482636,bear
-1.102225,-1.979993,wolf
-0.713723,0.116548,fox
0.243910,-0.682727,penguin
0.122817,-0.106765,snake
0.023230,0.412090,chicken
0.364310,-0.817988,table
-0.541521,0.238706,chair
0.451682,0.938970,sofa
0.543496,0.534135,stool
-0.074774,-0.442796,bed
-0.094137,-0.001414,closet
0.115469,-0.296658,shelf
-0.071789,-0.305716,desk
-0.253398,-0.068996,mirror
0.190186,-0.152916,cabinet
0.232463,0.298612,zoo
0.102019,0.623228,park
0.270629,0.418343,cinema
-0.306541,-0.344101,cafe
0.135607,0.298225,restaurant
0.024670,-0.322040,hospital
-0.295718,0.007113,gallery
-0.070559,-0.156286,museum
0.085344,-0.138745,library
0.002489,0.070077,pharmacy


In [4]:

# Print the embedding stored in the model for each furniture word.
for word in furniture:
    vector = model_ted.wv[word]
    print(vector)

[ 0.00665514 -0.0028701   0.05502357 -0.14490736  0.05870106 -0.0631328
  0.0886133   0.14035547  0.21570417  0.02439884  0.02796276 -0.04826117
  0.01067871  0.09645253  0.02635578  0.09659922 -0.01476734 -0.09966331
  0.13639861 -0.07045158  0.04819423  0.10813646  0.02191059  0.04858215
  0.11819333  0.05445026 -0.0365449   0.07541605 -0.03601131  0.00562567
 -0.00902637  0.0756612   0.08831648  0.07851404 -0.04497873  0.06505337
 -0.07409434  0.0165261  -0.10363711 -0.04261329 -0.04933982  0.22601244
 -0.00267923 -0.00663213  0.10986363 -0.0612139   0.18130834 -0.03070197
 -0.22674231  0.09326417 -0.08689286  0.1163154   0.04644994  0.06241805
  0.11929404  0.12035665  0.05713857 -0.06968801  0.03625325 -0.08344513
 -0.06093274 -0.07574762 -0.10644704 -0.12487313 -0.12700571 -0.16066277
  0.11508939  0.20786861 -0.04713384  0.0121372   0.17050338  0.11672831
 -0.04735322  0.16035362  0.22718887  0.02068787 -0.1981729   0.12470064
 -0.0530601  -0.1157311   0.11903319 -0.15480831  0.

In [5]:
# Print the embedding stored in the model for each place word.
for word in places:
    vector = model_ted.wv[word]
    print(vector)

[ 0.01161407 -0.00035326 -0.03374523 -0.11664262  0.06043397  0.22825627
  0.07185744  0.14239599  0.03316464  0.01670026 -0.08343898  0.0248838
 -0.08612535  0.18227522 -0.05806027  0.12881733 -0.02462071 -0.07309113
  0.06080913  0.03440526 -0.01082236  0.05575469 -0.03431833  0.04023211
  0.05074368 -0.06915589 -0.0866803   0.06724066 -0.12042639  0.02855667
 -0.00561599  0.02091489 -0.02721581 -0.04567772  0.03405954 -0.02091208
 -0.10714971 -0.12113775 -0.07622553 -0.21601093  0.05267151  0.20943236
 -0.10224131  0.03557533 -0.07926216 -0.04398584 -0.07714873  0.193914
 -0.12076356  0.15387025 -0.0746684   0.14002222 -0.04869927  0.06844931
  0.14967136  0.12528184  0.10688824 -0.08248647  0.11164425 -0.10406178
  0.02741563 -0.05176911 -0.05356438 -0.11696988 -0.06953131 -0.08629781
  0.10162694  0.10484894 -0.01257708  0.03855427  0.2286578   0.01761858
  0.11014634  0.05490702  0.00747268 -0.08589087 -0.16630724  0.18818976
 -0.05007906 -0.05457372  0.10777244 -0.11515334  0.04