## Word Embedding Examples
- Word embedding examples based on a pretrained Wikipedia2Vec model.
- Pororo word embedding returns two different kinds of output keys, as described below.
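  - 'something' (word) : word2vec result (the term appears as plain text, not as a hyperlink, in Wikipedia documents)
  - 'something' (other) : entity2vec result (the term appears as a hyperlink in Wikipedia documents); `other` is the entity's type label, e.g. `film` or `taxon`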

In [1]:
from pororo import Pororo

In [2]:
word2vec = Pororo("word2vec", lang="ko")

In [3]:
# return vectors
word2vec("사과")

OrderedDict([('사과 (word)',
              tensor([-0.2660, -0.2157, -0.3058, -0.5231,  0.1120,  0.7693,  0.2678, -0.1033,
                      -0.0479,  0.0664, -0.3498,  0.2202, -0.0234, -0.1349,  0.4098,  0.8039,
                       0.0413,  0.6941,  0.1307, -0.3302,  0.4336,  0.3558, -0.0167, -0.3845,
                       0.4276,  0.0262,  0.2969,  0.2929,  0.2493,  0.4035, -0.2013,  0.8023,
                      -0.0409, -0.4895, -0.3734,  0.3468,  0.2544,  0.0825,  0.1088, -0.2386,
                       0.1706, -0.0170,  0.0328, -0.7536,  0.4117,  0.2149,  1.0250, -0.1620,
                      -0.4702,  0.0747,  0.1141,  0.0617, -0.4757,  0.6541, -0.2435, -0.0269,
                       0.2096,  0.1726,  0.3799, -0.0632, -0.7104,  0.2563,  0.8391,  0.3343,
                      -0.5193,  0.2105,  0.6058,  0.2799, -0.3542, -0.1775, -0.2572, -0.0785,
                      -0.2138,  0.3131,  0.8374,  0.1755, -0.2771, -0.0209, -0.5831, -0.0366,
                      -0.6521, -0

### method `find_similar_words`
- You can find words and entities similar to a query word.

In [4]:
# find similar words
word2vec.find_similar_words("사과")

OrderedDict([('사과 (word)',
              ['사죄 (word)',
               '해명 (word)',
               '드린다고 (word)',
               '사과드린다 (word)',
               '고소 (word)']),
             ('사과 (pome;fruit of Maloideae;fruit)',
              ['분류:사과 (Wikimedia category)',
               '배나무속 (taxon)',
               '딸기 (taxon)',
               '자두나무아속 (taxon)',
               '오이 (taxon)']),
             ('사향사과 (religious concept)',
              ['아라한과 (word)', '2승 (misc)', '무상정 ()', '삼도 ()', '阿羅漢 (word)']),
             ('사과 (교육) (liberal arts education)',
              ['분류:학문 분야 (Wikimedia category)',
               '삼학 (liberal arts education)',
               '자유과 (word)',
               '교육철학 (philosophy;branch of philosophy)',
               '분류:고등 교육 (Wikimedia category)']),
             ('사과 (영화) (film)',
              ['미인 (영화) (film)',
               '빗자루, 금붕어 되다 (film)',
               '쉐어 더 비전 (film)',
               '두 얼굴의 여친 (film)',
               '트라이앵글 (2009년 대한민국 영화

### argument `top_n`
- You can control the number of similar entries returned for each key using the `top_n` argument.

In [5]:
# find similar words
word2vec.find_similar_words("사과", top_n=3)

OrderedDict([('사과 (word)', ['사죄 (word)', '해명 (word)', '드린다고 (word)']),
             ('사과 (pome;fruit of Maloideae;fruit)',
              ['분류:사과 (Wikimedia category)', '배나무속 (taxon)', '딸기 (taxon)']),
             ('사향사과 (religious concept)',
              ['아라한과 (word)', '2승 (misc)', '무상정 ()']),
             ('사과 (교육) (liberal arts education)',
              ['분류:학문 분야 (Wikimedia category)',
               '삼학 (liberal arts education)',
               '자유과 (word)']),
             ('사과 (영화) (film)',
              ['미인 (영화) (film)', '빗자루, 금붕어 되다 (film)', '쉐어 더 비전 (film)']),
             ('사과 (행위) (intentional human action)',
              ['분류:영향력 (Wikimedia category)',
               '공감 (social skills;psychology terminology)',
               '태도 (disposition;opinion)'])])

### argument `group`
- If you want the output entities grouped by their type label, set `group=True`.

In [6]:
# find similar words
word2vec.find_similar_words("사과", top_n=3, group=True)

OrderedDict([('사과 (word)', OrderedDict([('word', ['사죄', '해명', '드린다고'])])),
             ('사과 (pome;fruit of Maloideae;fruit)',
              OrderedDict([('Wikimedia category', ['분류:사과']),
                           ('taxon', ['배나무속', '딸기'])])),
             ('사향사과 (religious concept)',
              OrderedDict([('word', ['아라한과']),
                           ('misc', ['2승']),
                           ('', ['무상정'])])),
             ('사과 (교육) (liberal arts education)',
              OrderedDict([('Wikimedia category', ['분류:학문 분야']),
                           ('liberal arts education', ['삼학']),
                           ('word', ['자유과'])])),
             ('사과 (영화) (film)',
              OrderedDict([('film', ['미인 (영화)', '빗자루, 금붕어 되다', '쉐어 더 비전'])])),
             ('사과 (행위) (intentional human action)',
              OrderedDict([('Wikimedia category', ['분류:영향력']),
                           ('social skills', ['공감']),
                           ('psychology terminology', ['공감']),
     

In [7]:
word2vec = Pororo("word2vec", lang="en")

In [8]:
word2vec("apple")

OrderedDict([('apple (word)',
              tensor([-1.8115e-01,  1.1258e+00, -3.3197e-01,  1.6572e-01,  1.3385e-01,
                      -1.9124e-01, -2.9481e-01,  9.4349e-02, -1.0922e-01, -4.2748e-01,
                      -4.6422e-02, -2.7049e-01,  2.5298e-01,  4.7728e-01, -4.9263e-01,
                      -8.2164e-01,  3.4270e-01, -9.3955e-02,  3.1955e-02,  9.3517e-02,
                       1.1431e-01, -3.0355e-01, -4.0796e-01, -5.7379e-01, -4.4223e-01,
                      -3.8100e-01,  2.2550e-01, -3.5723e-01, -8.1266e-02,  6.4951e-01,
                       7.1121e-02, -2.2219e-01,  7.4239e-01,  7.4116e-01,  2.8685e-01,
                      -3.6707e-01, -1.5618e-01, -3.6777e-02,  1.2592e+00,  4.8464e-01,
                       6.7569e-03, -4.0327e-01, -4.0716e-01, -7.5113e-02,  1.3573e-01,
                      -2.0801e-01, -3.0632e-02,  1.6976e-01, -6.2413e-02,  1.6968e-01,
                       3.3180e-02,  5.4541e-01, -5.0439e-01,  1.6068e-02,  3.8434e-02,
             

In [9]:
word2vec.find_similar_words("apple")

OrderedDict([('apple (word)',
              ['blackberry (word)',
               'silentype (word)',
               'Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)',
               'paulared (word)',
               'trueimage (word)']),
             ('Apple (fruit;pome;fruit of Maloideae)',
              ['Pear (taxon)',
               'Apricot (fruit)',
               'Plum (taxon)',
               'Peach (taxon)',
               'Cherry (fruit;drupe)']),
             ('Muggsy Bogues (human)',
              ['Tom Gugliotta (human)',
               'Billy Owens (human)',
               'David Wingate (basketball) (human)',
               '1995–96 Cleveland Cavaliers season (basketball team season)',
               ':1989–90 Denver Nuggets season (misc)']),
             ('Ariane Passenger Payload Experiment (communications satellite)',
              ['INSAT-3E (communications satellite)',
               'INSAT-3B (communications satellite)',
 

In [10]:
word2vec.find_similar_words("apple", top_n=3, group=True)

OrderedDict([('apple (word)',
              OrderedDict([('word', ['blackberry', 'silentype']),
                           ('business', ['Apple Inc.']),
                           ('enterprise', ['Apple Inc.']),
                           ('NASDAQ-100', ['Apple Inc.']),
                           ('giants of the web', ['Apple Inc.']),
                           ('Dow Jones Industrial Average', ['Apple Inc.'])])),
             ('Apple (fruit;pome;fruit of Maloideae)',
              OrderedDict([('taxon', ['Pear', 'Plum']),
                           ('fruit', ['Apricot'])])),
             ('Muggsy Bogues (human)',
              OrderedDict([('human',
                            ['Tom Gugliotta',
                             'Billy Owens',
                             'David Wingate (basketball)'])])),
             ('Ariane Passenger Payload Experiment (communications satellite)',
              OrderedDict([('communications satellite',
                            ['INSAT-3E', 'INSAT-3B',

In [11]:
word2vec = Pororo("word2vec", lang="ja")

In [12]:
word2vec("リンゴ")

OrderedDict([('リンゴ (word)',
              tensor([ 0.1310, -0.1558,  0.8368,  0.3689,  0.4422, -0.0418,  0.1641,  0.4188,
                      -0.2567, -0.3033,  0.6205, -0.4926,  0.5230,  0.0518, -0.0300, -0.1409,
                       0.1582, -0.2530, -0.5454, -0.2452,  0.0099,  0.6365,  0.4313, -0.1550,
                      -0.0535,  0.7163,  0.0013,  0.4618, -0.4046,  0.2726,  0.3179,  0.0376,
                      -0.4326,  0.0054, -0.2747,  0.7512,  0.0274, -0.4402,  0.2850, -0.2233,
                      -0.1414, -0.3110,  0.8709,  0.2841,  0.1205, -0.0464, -0.5412, -0.1070,
                       0.4027, -0.2069,  0.0165,  0.0256,  0.2051, -0.0647, -0.3629,  0.0649,
                      -0.3896, -0.1622, -0.0697,  0.1292,  0.1048, -0.2001, -0.0260,  0.3329,
                       0.0640, -0.0474,  0.2108,  0.5559, -0.5122,  1.0869,  0.1343, -0.3058,
                      -0.6324,  0.0446,  0.6196,  0.7900, -0.6034,  0.0059,  0.0678,  0.5734,
                      -0.0020, -

In [13]:
word2vec.find_similar_words("リンゴ")

OrderedDict([('リンゴ (word)',
              ['サクランボ (word)',
               'イチゴ (word)',
               'スターキングデリシャス (word)',
               'ジュース (word)',
               'アスパラガス (word)']),
             ('リンゴ (fruit;fruit of Maloideae;pome)',
              ['イチゴ (taxon)',
               'モモ (taxon)',
               'ブドウ (grape juice;berry)',
               'ナシ (taxon)',
               'サクランボ (drupe;fruit)']),
             ('リンゴ (アルバム) (album)',
              ['グッドナイト・ウィーン (album;studio album)',
               '想い出のフォトグラフ (Ringo;single;song)',
               '明日への願い (single)',
               "オール・シングス・マスト・パス (George Harrison's albums in chronological order;triple album;studio album)",
               'バック・オフ・ブーガルー (Stop and Smell the Roses;single;song)'])])

In [14]:
word2vec.find_similar_words("リンゴ", top_n=3, group=True)

OrderedDict([('リンゴ (word)',
              OrderedDict([('word', ['サクランボ', 'イチゴ', 'スターキングデリシャス'])])),
             ('リンゴ (fruit;fruit of Maloideae;pome)',
              OrderedDict([('taxon', ['イチゴ', 'モモ']),
                           ('grape juice', ['ブドウ']),
                           ('berry', ['ブドウ'])])),
             ('リンゴ (アルバム) (album)',
              OrderedDict([('album', ['グッドナイト・ウィーン']),
                           ('studio album', ['グッドナイト・ウィーン']),
                           ('Ringo', ['想い出のフォトグラフ']),
                           ('single', ['想い出のフォトグラフ', '明日への願い']),
                           ('song', ['想い出のフォトグラフ'])]))])

In [15]:
word2vec = Pororo("word2vec", lang="zh")

In [16]:
word2vec("苹果")

OrderedDict([('苹果 (word)',
              tensor([-0.1839,  0.5122, -0.1008,  0.0722,  0.4457,  0.9738,  0.4418, -0.6516,
                       0.8440,  0.4250,  0.4523,  0.9163, -0.3826, -0.1576,  0.0421, -0.0268,
                      -0.6435, -0.5359, -0.5615, -0.0412, -0.0995,  0.0274,  0.6562, -0.7448,
                       0.1082,  0.1580, -0.2407,  0.2441, -0.2375, -0.5885, -0.1168, -0.2187,
                       0.3831, -0.2568, -0.3288,  0.3200,  0.3182, -0.2485,  0.5260, -0.3296,
                       0.1426,  0.1314,  0.3923, -0.8193,  0.3724, -0.3045, -0.5827,  0.2696,
                      -0.2920, -1.2506,  0.5526,  0.5085,  0.2361, -0.0078,  0.3751,  0.4210,
                       0.3462,  0.0346,  0.8090,  0.1075,  0.5013,  0.4497,  0.1830, -0.4328,
                      -0.3204,  0.1639, -0.4527,  0.3798,  0.8444,  0.8128,  0.5646,  0.5998,
                       0.9775, -0.5508, -0.0043, -0.3345,  0.0428,  0.1791, -0.0371,  0.2009,
                      -0.0159,  0

In [17]:
word2vec.find_similar_words("苹果")

OrderedDict([('苹果 (word)',
              ['苹果公司 (word)',
               '黑莓 (word)',
               '苹果皮 (word)',
               '树莓 (word)',
               'ibookstore (word)']),
             ('苹果 (fruit;fruit of Maloideae;pome)',
              ['杏仁 (apricot;stone;culinary nuts)',
               '梨 (taxon)',
               '無花果 (taxon)',
               '葡萄 (grape juice;berry)',
               '桃 (taxon)']),
             ('苹果 (电影) (film)',
              ['盲山 (film)',
               '我的父親母親 (misc)',
               '闯关东 (电视剧) (television program)',
               '摇摆de婚约 (misc)',
               '北京遇上西雅圖 (misc)']),
             ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)',
              ['苹果公司 (word)',
               'IOS 9 (iOS;operating system;mobile operating system)',
               '苹果公司 (misc)',
               'MacBook Air (Ultrabook;computer model;MacBook;Apple Macintosh)',
               'WWDC (misc)'])])

In [18]:
word2vec.find_similar_words("苹果", top_n=3, group=True)

OrderedDict([('苹果 (word)', OrderedDict([('word', ['苹果公司', '黑莓', '苹果皮'])])),
             ('苹果 (fruit;fruit of Maloideae;pome)',
              OrderedDict([('apricot', ['杏仁']),
                           ('stone', ['杏仁']),
                           ('culinary nuts', ['杏仁']),
                           ('taxon', ['梨', '無花果'])])),
             ('苹果 (电影) (film)',
              OrderedDict([('film', ['盲山']),
                           ('misc', ['我的父親母親']),
                           ('television program', ['闯关东 (电视剧)'])])),
             ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)',
              OrderedDict([('word', ['苹果公司']),
                           ('iOS', ['IOS 9']),
                           ('operating system', ['IOS 9']),
                           ('mobile operating system', ['IOS 9']),
                           ('misc', ['苹果公司'])]))])