In [27]:
import tensorflow as tf
import math
import numpy as np

In [28]:
text =['he is the king','the king is royal','she is the royal queen']

In [29]:
def __is_bounded(direction,range,index,tokens_leng):
    cover = range*direction
    if cover+index<0 or cover+index >= tokens_leng:
        return True
    else:
        return False

def get_context(tokens, window_size):
    context_pair = []
    for i, token in enumerate(tokens):
        for j in range(1, window_size+1):
            if not __is_bounded(1,j,i,len(tokens)):
                context_pair.append((tokens[i],tokens[i+j]))
            if not __is_bounded(-1,j,i,len(tokens)):
                context_pair.append((tokens[i],tokens[i-j]))
    return context_pair

def __get_word_set(tokens):
    word_set = set()
    for token in tokens:
        word_set.add(token)
    return word_set

def __get_word_index(word_set):
    word_index_dic = dict()
    inverse_word_dic = dict()
    for i,word in enumerate(word_set):
        word_index_dic[word] = i
        inverse_word_dic[i] = word
    return word_index_dic,inverse_word_dic

def generate_batch(context_pair,batch_size):
    batch_list =[]
    batch=[]
    for i,pair in enumerate(context_pair):

        if i %batch_size==0 and i !=0:
            batch_list.append(batch)
            batch = []

        batch.append(pair)
    return batch_list

def get_vec(word,session):
    return session.run(embeddings[word_index_dic[word]])

def __dis(vec1, vec2):
    dis = 0.0
    for i in range(0,len(vec1)):
        dis+=math.pow((vec1[i]-vec2[i]),2)
    return dis

def get_cos_similarity(vec1, vec2):
    vec1_leng=0
    for value in vec1:
        vec1_leng+=(value*value)
    vec1_leng=math.sqrt(vec1_leng)
    vec2_leng=0
    for value in vec2:
        vec2_leng+=(value*value)
    vec2_leng=math.sqrt(vec2_leng)
    product=np.dot(vec1,vec2)

    return product/(vec1_leng*vec2_leng)

def __sim(vec1, vec2):
    return (1 - math.acos(get_cos_similarity(vec1,vec2)) / math.pi)

def one_hot(data, label_size):
    vector = np.zeros((len(data),label_size),dtype='f')
    for i,single in enumerate(data):
        vector[i][single]=1.0
    return vector

def find_cloest_word(word_set,session,target_word):
    sim = 0.0
    vec1 = get_vec(target_word,session)
    result = ''
    for word in word_set:
        if word == target_word:
            continue
        vec2 = get_vec(word,session)
        tmp_sim=__sim(vec1, vec2)
        print('%s : %s : %s' %(target_word,word,tmp_sim))
        if tmp_sim>sim:
            sim = tmp_sim
            result = word
    return result

In [30]:
window_size = 2
embedding_size = 5
if __name__ == '__main__':
    context_pair=[]
    word_set = set()

    for sentence in text:
        tokens = sentence.lower().split(' ')
        context_pair += get_context(tokens,window_size)
        tmp_word_set = __get_word_set(tokens)
        for word in tmp_word_set:
            word_set.add(word)

In [31]:
print(word_set)

{'royal', 'the', 'queen', 'is', 'king', 'she', 'he'}


In [32]:
print(context_pair)

[('he', 'is'), ('he', 'the'), ('is', 'the'), ('is', 'he'), ('is', 'king'), ('the', 'king'), ('the', 'is'), ('the', 'he'), ('king', 'the'), ('king', 'is'), ('the', 'king'), ('the', 'is'), ('king', 'is'), ('king', 'the'), ('king', 'royal'), ('is', 'royal'), ('is', 'king'), ('is', 'the'), ('royal', 'is'), ('royal', 'king'), ('she', 'is'), ('she', 'the'), ('is', 'the'), ('is', 'she'), ('is', 'royal'), ('the', 'royal'), ('the', 'is'), ('the', 'queen'), ('the', 'she'), ('royal', 'queen'), ('royal', 'the'), ('royal', 'is'), ('queen', 'royal'), ('queen', 'the')]


In [33]:
word_index_dic,inverse_word_dic=__get_word_index(word_set)
word_size = len(word_set)
batch_size = len(context_pair)

In [34]:
print(word_index_dic)

{'royal': 0, 'the': 1, 'queen': 2, 'is': 3, 'king': 4, 'she': 5, 'he': 6}


In [35]:
print(inverse_word_dic)

{0: 'royal', 1: 'the', 2: 'queen', 3: 'is', 4: 'king', 5: 'she', 6: 'he'}


In [36]:
print(batch_size)

34


In [37]:
print(word_size)

7


In [38]:
inputs = [word_index_dic[x[0]] for x in context_pair]
labels = [word_index_dic[x[1]] for x in context_pair]

In [39]:
print(inputs)

[6, 6, 3, 3, 3, 1, 1, 1, 4, 4, 1, 1, 4, 4, 4, 3, 3, 3, 0, 0, 5, 5, 3, 3, 3, 1, 1, 1, 1, 0, 0, 0, 2, 2]


In [40]:
print(labels)

[3, 1, 1, 6, 4, 4, 3, 6, 1, 3, 4, 3, 3, 1, 0, 0, 4, 1, 3, 4, 3, 1, 1, 5, 0, 0, 3, 2, 5, 2, 1, 3, 0, 1]


In [41]:
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size,])

In [42]:
embeddings = tf.Variable(tf.random_uniform([word_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [43]:
nce_weights = tf.Variable(tf.truncated_normal([word_size,embedding_size],stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([word_size]))

In [44]:
prediction = tf.add(tf.matmul(embed, tf.transpose(nce_weights)), nce_biases)
train_labels_vector = tf.one_hot(train_labels,word_size)

In [45]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=train_labels_vector))

In [46]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
session = tf.Session()

In [47]:
init = tf.global_variables_initializer()
session.run(init)

In [48]:
for iteration in range(0,10000):
        total_loss = 0

        feed_dict = {train_inputs: inputs, train_labels: labels}
        _, cur_loss,pred= session.run([optimizer, loss, prediction], feed_dict=feed_dict)
        print('%s: loss: %s' %(iteration,cur_loss))

0: loss: 2.109287
1: loss: 1.9693857
2: loss: 1.8767495
3: loss: 1.8127109
4: loss: 1.7667828
5: loss: 1.7325444
6: loss: 1.7058764
7: loss: 1.6841049
8: loss: 1.6655138
9: loss: 1.6490179
10: loss: 1.6339417
11: loss: 1.619867
12: loss: 1.6065382
13: loss: 1.5938026
14: loss: 1.5815722
15: loss: 1.5697992
16: loss: 1.5584625
17: loss: 1.5475535
18: loss: 1.537071
19: loss: 1.5270144
20: loss: 1.5173818
21: loss: 1.5081682
22: loss: 1.4993658
23: loss: 1.4909633
24: loss: 1.4829484
25: loss: 1.4753069
26: loss: 1.4680251
27: loss: 1.4610885
28: loss: 1.4544842
29: loss: 1.448199
30: loss: 1.4422203
31: loss: 1.4365367
32: loss: 1.4311363
33: loss: 1.4260081
34: loss: 1.4211411
35: loss: 1.4165246
36: loss: 1.4121478
37: loss: 1.408
38: loss: 1.4040709
39: loss: 1.4003505
40: loss: 1.3968283
41: loss: 1.393495
42: loss: 1.3903412
43: loss: 1.3873572
44: loss: 1.3845348
45: loss: 1.3818653
46: loss: 1.3793403
47: loss: 1.3769522
48: loss: 1.3746934
49: loss: 1.3725568
50: loss: 1.3705356

471: loss: 1.3218031
472: loss: 1.3217993
473: loss: 1.3217955
474: loss: 1.3217915
475: loss: 1.3217878
476: loss: 1.321784
477: loss: 1.3217802
478: loss: 1.3217765
479: loss: 1.321773
480: loss: 1.3217692
481: loss: 1.3217655
482: loss: 1.321762
483: loss: 1.3217584
484: loss: 1.3217549
485: loss: 1.3217512
486: loss: 1.3217477
487: loss: 1.3217441
488: loss: 1.3217404
489: loss: 1.3217369
490: loss: 1.3217335
491: loss: 1.32173
492: loss: 1.3217266
493: loss: 1.321723
494: loss: 1.3217196
495: loss: 1.3217164
496: loss: 1.321713
497: loss: 1.3217096
498: loss: 1.3217062
499: loss: 1.321703
500: loss: 1.3216996
501: loss: 1.3216963
502: loss: 1.3216927
503: loss: 1.3216896
504: loss: 1.3216865
505: loss: 1.3216832
506: loss: 1.32168
507: loss: 1.3216767
508: loss: 1.3216735
509: loss: 1.3216702
510: loss: 1.3216672
511: loss: 1.3216639
512: loss: 1.3216608
513: loss: 1.3216577
514: loss: 1.3216547
515: loss: 1.3216515
516: loss: 1.3216485
517: loss: 1.3216454
518: loss: 1.3216423
51

937: loss: 1.3209968
938: loss: 1.320996
939: loss: 1.3209952
940: loss: 1.3209944
941: loss: 1.3209938
942: loss: 1.3209931
943: loss: 1.3209924
944: loss: 1.3209915
945: loss: 1.3209906
946: loss: 1.3209898
947: loss: 1.3209891
948: loss: 1.3209884
949: loss: 1.3209877
950: loss: 1.3209869
951: loss: 1.3209863
952: loss: 1.3209854
953: loss: 1.3209847
954: loss: 1.320984
955: loss: 1.3209832
956: loss: 1.3209825
957: loss: 1.3209819
958: loss: 1.320981
959: loss: 1.3209804
960: loss: 1.3209795
961: loss: 1.3209789
962: loss: 1.3209782
963: loss: 1.3209774
964: loss: 1.3209767
965: loss: 1.320976
966: loss: 1.3209753
967: loss: 1.3209745
968: loss: 1.3209739
969: loss: 1.3209732
970: loss: 1.3209723
971: loss: 1.3209716
972: loss: 1.320971
973: loss: 1.3209703
974: loss: 1.3209696
975: loss: 1.3209689
976: loss: 1.320968
977: loss: 1.3209676
978: loss: 1.3209667
979: loss: 1.320966
980: loss: 1.3209654
981: loss: 1.3209648
982: loss: 1.3209641
983: loss: 1.3209633
984: loss: 1.3209627

1426: loss: 1.3207628
1427: loss: 1.3207625
1428: loss: 1.3207622
1429: loss: 1.3207618
1430: loss: 1.3207616
1431: loss: 1.3207613
1432: loss: 1.3207608
1433: loss: 1.3207606
1434: loss: 1.3207604
1435: loss: 1.32076
1436: loss: 1.3207598
1437: loss: 1.3207595
1438: loss: 1.3207592
1439: loss: 1.3207589
1440: loss: 1.3207585
1441: loss: 1.3207583
1442: loss: 1.3207581
1443: loss: 1.3207577
1444: loss: 1.3207575
1445: loss: 1.3207572
1446: loss: 1.3207569
1447: loss: 1.3207566
1448: loss: 1.3207562
1449: loss: 1.3207561
1450: loss: 1.3207558
1451: loss: 1.3207556
1452: loss: 1.3207551
1453: loss: 1.3207549
1454: loss: 1.3207545
1455: loss: 1.3207542
1456: loss: 1.320754
1457: loss: 1.3207538
1458: loss: 1.3207535
1459: loss: 1.3207532
1460: loss: 1.3207529
1461: loss: 1.3207525
1462: loss: 1.3207524
1463: loss: 1.3207521
1464: loss: 1.3207518
1465: loss: 1.3207515
1466: loss: 1.3207513
1467: loss: 1.320751
1468: loss: 1.3207506
1469: loss: 1.3207502
1470: loss: 1.32075
1471: loss: 1.32

1856: loss: 1.3206675
1857: loss: 1.3206674
1858: loss: 1.3206671
1859: loss: 1.3206669
1860: loss: 1.3206669
1861: loss: 1.3206667
1862: loss: 1.3206666
1863: loss: 1.3206663
1864: loss: 1.3206662
1865: loss: 1.3206661
1866: loss: 1.320666
1867: loss: 1.3206657
1868: loss: 1.3206656
1869: loss: 1.3206654
1870: loss: 1.3206652
1871: loss: 1.3206651
1872: loss: 1.3206648
1873: loss: 1.3206646
1874: loss: 1.3206645
1875: loss: 1.3206644
1876: loss: 1.3206643
1877: loss: 1.320664
1878: loss: 1.3206639
1879: loss: 1.3206638
1880: loss: 1.3206636
1881: loss: 1.3206635
1882: loss: 1.3206632
1883: loss: 1.3206632
1884: loss: 1.320663
1885: loss: 1.3206626
1886: loss: 1.3206626
1887: loss: 1.3206625
1888: loss: 1.3206624
1889: loss: 1.3206623
1890: loss: 1.3206619
1891: loss: 1.3206619
1892: loss: 1.3206617
1893: loss: 1.3206615
1894: loss: 1.3206613
1895: loss: 1.3206612
1896: loss: 1.320661
1897: loss: 1.3206608
1898: loss: 1.3206606
1899: loss: 1.3206606
1900: loss: 1.3206604
1901: loss: 1.

2314: loss: 1.3206084
2315: loss: 1.3206083
2316: loss: 1.320608
2317: loss: 1.320608
2318: loss: 1.3206078
2319: loss: 1.3206078
2320: loss: 1.3206077
2321: loss: 1.3206075
2322: loss: 1.3206075
2323: loss: 1.3206073
2324: loss: 1.3206072
2325: loss: 1.3206072
2326: loss: 1.3206071
2327: loss: 1.3206071
2328: loss: 1.320607
2329: loss: 1.3206068
2330: loss: 1.3206068
2331: loss: 1.3206065
2332: loss: 1.3206065
2333: loss: 1.3206065
2334: loss: 1.3206062
2335: loss: 1.3206062
2336: loss: 1.3206061
2337: loss: 1.320606
2338: loss: 1.3206059
2339: loss: 1.3206056
2340: loss: 1.3206056
2341: loss: 1.3206055
2342: loss: 1.3206054
2343: loss: 1.3206054
2344: loss: 1.3206053
2345: loss: 1.3206053
2346: loss: 1.3206052
2347: loss: 1.3206052
2348: loss: 1.3206049
2349: loss: 1.3206049
2350: loss: 1.3206047
2351: loss: 1.3206047
2352: loss: 1.3206046
2353: loss: 1.3206043
2354: loss: 1.3206043
2355: loss: 1.3206042
2356: loss: 1.3206041
2357: loss: 1.320604
2358: loss: 1.320604
2359: loss: 1.32

2827: loss: 1.3205667
2828: loss: 1.3205667
2829: loss: 1.3205665
2830: loss: 1.3205664
2831: loss: 1.3205663
2832: loss: 1.3205662
2833: loss: 1.3205662
2834: loss: 1.3205662
2835: loss: 1.3205659
2836: loss: 1.320566
2837: loss: 1.3205659
2838: loss: 1.3205659
2839: loss: 1.3205659
2840: loss: 1.3205658
2841: loss: 1.3205656
2842: loss: 1.3205656
2843: loss: 1.3205656
2844: loss: 1.3205655
2845: loss: 1.3205653
2846: loss: 1.3205653
2847: loss: 1.3205652
2848: loss: 1.3205652
2849: loss: 1.3205652
2850: loss: 1.3205652
2851: loss: 1.3205652
2852: loss: 1.3205651
2853: loss: 1.3205649
2854: loss: 1.3205649
2855: loss: 1.3205649
2856: loss: 1.3205647
2857: loss: 1.3205646
2858: loss: 1.3205646
2859: loss: 1.3205646
2860: loss: 1.3205645
2861: loss: 1.3205644
2862: loss: 1.3205644
2863: loss: 1.3205644
2864: loss: 1.3205642
2865: loss: 1.320564
2866: loss: 1.320564
2867: loss: 1.320564
2868: loss: 1.320564
2869: loss: 1.3205639
2870: loss: 1.3205638
2871: loss: 1.3205638
2872: loss: 1.3

3319: loss: 1.3205397
3320: loss: 1.3205397
3321: loss: 1.3205397
3322: loss: 1.3205396
3323: loss: 1.3205395
3324: loss: 1.3205395
3325: loss: 1.3205394
3326: loss: 1.3205394
3327: loss: 1.3205394
3328: loss: 1.3205394
3329: loss: 1.3205391
3330: loss: 1.3205392
3331: loss: 1.3205391
3332: loss: 1.3205391
3333: loss: 1.320539
3334: loss: 1.320539
3335: loss: 1.3205389
3336: loss: 1.3205389
3337: loss: 1.3205389
3338: loss: 1.3205389
3339: loss: 1.3205386
3340: loss: 1.3205386
3341: loss: 1.3205385
3342: loss: 1.3205385
3343: loss: 1.3205385
3344: loss: 1.3205385
3345: loss: 1.3205385
3346: loss: 1.3205384
3347: loss: 1.3205384
3348: loss: 1.3205384
3349: loss: 1.3205383
3350: loss: 1.3205384
3351: loss: 1.3205383
3352: loss: 1.3205383
3353: loss: 1.3205382
3354: loss: 1.3205382
3355: loss: 1.3205382
3356: loss: 1.320538
3357: loss: 1.320538
3358: loss: 1.3205379
3359: loss: 1.3205378
3360: loss: 1.3205378
3361: loss: 1.3205378
3362: loss: 1.3205378
3363: loss: 1.3205377
3364: loss: 1.

3790: loss: 1.3205211
3791: loss: 1.3205209
3792: loss: 1.3205209
3793: loss: 1.3205209
3794: loss: 1.3205209
3795: loss: 1.3205209
3796: loss: 1.3205209
3797: loss: 1.3205209
3798: loss: 1.3205208
3799: loss: 1.3205206
3800: loss: 1.3205208
3801: loss: 1.3205206
3802: loss: 1.3205206
3803: loss: 1.3205206
3804: loss: 1.3205206
3805: loss: 1.3205206
3806: loss: 1.3205205
3807: loss: 1.3205204
3808: loss: 1.3205203
3809: loss: 1.3205203
3810: loss: 1.3205203
3811: loss: 1.3205203
3812: loss: 1.3205203
3813: loss: 1.3205203
3814: loss: 1.32052
3815: loss: 1.32052
3816: loss: 1.32052
3817: loss: 1.32052
3818: loss: 1.32052
3819: loss: 1.3205199
3820: loss: 1.32052
3821: loss: 1.32052
3822: loss: 1.3205199
3823: loss: 1.3205198
3824: loss: 1.3205198
3825: loss: 1.3205198
3826: loss: 1.3205198
3827: loss: 1.3205198
3828: loss: 1.3205197
3829: loss: 1.3205194
3830: loss: 1.3205196
3831: loss: 1.3205194
3832: loss: 1.3205194
3833: loss: 1.3205194
3834: loss: 1.3205194
3835: loss: 1.3205194
38

4289: loss: 1.3205061
4290: loss: 1.3205061
4291: loss: 1.3205061
4292: loss: 1.3205061
4293: loss: 1.3205061
4294: loss: 1.320506
4295: loss: 1.3205059
4296: loss: 1.3205059
4297: loss: 1.3205059
4298: loss: 1.3205059
4299: loss: 1.3205059
4300: loss: 1.3205057
4301: loss: 1.3205057
4302: loss: 1.3205056
4303: loss: 1.3205057
4304: loss: 1.3205056
4305: loss: 1.3205056
4306: loss: 1.3205056
4307: loss: 1.3205055
4308: loss: 1.3205055
4309: loss: 1.3205055
4310: loss: 1.3205055
4311: loss: 1.3205055
4312: loss: 1.3205055
4313: loss: 1.3205055
4314: loss: 1.3205054
4315: loss: 1.3205053
4316: loss: 1.3205053
4317: loss: 1.3205053
4318: loss: 1.3205053
4319: loss: 1.3205053
4320: loss: 1.3205051
4321: loss: 1.3205053
4322: loss: 1.3205053
4323: loss: 1.3205053
4324: loss: 1.3205051
4325: loss: 1.320505
4326: loss: 1.320505
4327: loss: 1.320505
4328: loss: 1.320505
4329: loss: 1.320505
4330: loss: 1.3205049
4331: loss: 1.3205049
4332: loss: 1.3205049
4333: loss: 1.3205049
4334: loss: 1.32

4754: loss: 1.320495
4755: loss: 1.3204951
4756: loss: 1.320495
4757: loss: 1.3204949
4758: loss: 1.3204949
4759: loss: 1.320495
4760: loss: 1.3204949
4761: loss: 1.3204949
4762: loss: 1.3204949
4763: loss: 1.3204949
4764: loss: 1.3204949
4765: loss: 1.3204949
4766: loss: 1.3204949
4767: loss: 1.3204949
4768: loss: 1.3204949
4769: loss: 1.3204948
4770: loss: 1.3204947
4771: loss: 1.3204949
4772: loss: 1.3204947
4773: loss: 1.3204947
4774: loss: 1.3204947
4775: loss: 1.3204947
4776: loss: 1.3204947
4777: loss: 1.3204947
4778: loss: 1.3204947
4779: loss: 1.3204945
4780: loss: 1.3204947
4781: loss: 1.3204947
4782: loss: 1.3204944
4783: loss: 1.3204947
4784: loss: 1.3204945
4785: loss: 1.3204947
4786: loss: 1.3204944
4787: loss: 1.3204944
4788: loss: 1.3204944
4789: loss: 1.3204944
4790: loss: 1.3204944
4791: loss: 1.3204944
4792: loss: 1.3204944
4793: loss: 1.3204944
4794: loss: 1.3204944
4795: loss: 1.3204944
4796: loss: 1.3204943
4797: loss: 1.3204944
4798: loss: 1.3204942
4799: loss: 1

5221: loss: 1.3204863
5222: loss: 1.3204863
5223: loss: 1.3204862
5224: loss: 1.3204863
5225: loss: 1.3204863
5226: loss: 1.3204862
5227: loss: 1.3204862
5228: loss: 1.3204862
5229: loss: 1.3204862
5230: loss: 1.3204862
5231: loss: 1.3204861
5232: loss: 1.3204861
5233: loss: 1.3204861
5234: loss: 1.3204861
5235: loss: 1.3204861
5236: loss: 1.3204861
5237: loss: 1.320486
5238: loss: 1.320486
5239: loss: 1.320486
5240: loss: 1.3204858
5241: loss: 1.3204858
5242: loss: 1.3204858
5243: loss: 1.3204858
5244: loss: 1.3204858
5245: loss: 1.3204858
5246: loss: 1.3204857
5247: loss: 1.3204858
5248: loss: 1.3204857
5249: loss: 1.3204858
5250: loss: 1.3204857
5251: loss: 1.3204857
5252: loss: 1.3204857
5253: loss: 1.3204857
5254: loss: 1.3204856
5255: loss: 1.3204856
5256: loss: 1.3204856
5257: loss: 1.3204856
5258: loss: 1.3204856
5259: loss: 1.3204856
5260: loss: 1.3204856
5261: loss: 1.3204856
5262: loss: 1.3204856
5263: loss: 1.3204856
5264: loss: 1.3204855
5265: loss: 1.3204856
5266: loss: 1

5679: loss: 1.3204792
5680: loss: 1.320479
5681: loss: 1.320479
5682: loss: 1.320479
5683: loss: 1.320479
5684: loss: 1.320479
5685: loss: 1.320479
5686: loss: 1.320479
5687: loss: 1.320479
5688: loss: 1.320479
5689: loss: 1.320479
5690: loss: 1.320479
5691: loss: 1.3204789
5692: loss: 1.320479
5693: loss: 1.3204789
5694: loss: 1.3204789
5695: loss: 1.3204789
5696: loss: 1.3204789
5697: loss: 1.3204788
5698: loss: 1.320479
5699: loss: 1.3204788
5700: loss: 1.3204788
5701: loss: 1.3204788
5702: loss: 1.3204788
5703: loss: 1.3204788
5704: loss: 1.3204788
5705: loss: 1.3204788
5706: loss: 1.3204788
5707: loss: 1.3204788
5708: loss: 1.3204788
5709: loss: 1.3204788
5710: loss: 1.3204788
5711: loss: 1.3204788
5712: loss: 1.3204788
5713: loss: 1.3204786
5714: loss: 1.3204784
5715: loss: 1.3204784
5716: loss: 1.3204786
5717: loss: 1.3204784
5718: loss: 1.3204784
5719: loss: 1.3204784
5720: loss: 1.3204784
5721: loss: 1.3204784
5722: loss: 1.3204784
5723: loss: 1.3204784
5724: loss: 1.3204784
5

6172: loss: 1.3204726
6173: loss: 1.3204726
6174: loss: 1.3204726
6175: loss: 1.3204726
6176: loss: 1.3204726
6177: loss: 1.3204726
6178: loss: 1.3204726
6179: loss: 1.3204726
6180: loss: 1.3204726
6181: loss: 1.3204725
6182: loss: 1.3204725
6183: loss: 1.3204726
6184: loss: 1.3204725
6185: loss: 1.3204724
6186: loss: 1.3204724
6187: loss: 1.3204725
6188: loss: 1.3204724
6189: loss: 1.3204725
6190: loss: 1.3204725
6191: loss: 1.3204724
6192: loss: 1.3204724
6193: loss: 1.3204724
6194: loss: 1.3204724
6195: loss: 1.3204724
6196: loss: 1.3204724
6197: loss: 1.3204724
6198: loss: 1.3204724
6199: loss: 1.3204724
6200: loss: 1.3204722
6201: loss: 1.3204724
6202: loss: 1.3204724
6203: loss: 1.3204724
6204: loss: 1.3204722
6205: loss: 1.3204722
6206: loss: 1.3204721
6207: loss: 1.3204722
6208: loss: 1.3204721
6209: loss: 1.3204722
6210: loss: 1.3204722
6211: loss: 1.3204721
6212: loss: 1.3204721
6213: loss: 1.3204721
6214: loss: 1.3204721
6215: loss: 1.3204721
6216: loss: 1.3204721
6217: loss

6602: loss: 1.320468
6603: loss: 1.320468
6604: loss: 1.320468
6605: loss: 1.320468
6606: loss: 1.320468
6607: loss: 1.320468
6608: loss: 1.3204678
6609: loss: 1.320468
6610: loss: 1.320468
6611: loss: 1.320468
6612: loss: 1.3204678
6613: loss: 1.3204677
6614: loss: 1.3204677
6615: loss: 1.3204678
6616: loss: 1.3204678
6617: loss: 1.3204677
6618: loss: 1.3204677
6619: loss: 1.3204677
6620: loss: 1.3204677
6621: loss: 1.3204678
6622: loss: 1.3204677
6623: loss: 1.3204676
6624: loss: 1.3204676
6625: loss: 1.3204676
6626: loss: 1.3204676
6627: loss: 1.3204676
6628: loss: 1.3204676
6629: loss: 1.3204676
6630: loss: 1.3204676
6631: loss: 1.3204676
6632: loss: 1.3204676
6633: loss: 1.3204676
6634: loss: 1.3204676
6635: loss: 1.3204676
6636: loss: 1.3204676
6637: loss: 1.3204676
6638: loss: 1.3204676
6639: loss: 1.3204675
6640: loss: 1.3204675
6641: loss: 1.3204675
6642: loss: 1.3204675
6643: loss: 1.3204676
6644: loss: 1.3204675
6645: loss: 1.3204674
6646: loss: 1.3204675
6647: loss: 1.32046

7001: loss: 1.320464
7002: loss: 1.3204639
7003: loss: 1.3204639
7004: loss: 1.3204639
7005: loss: 1.3204639
7006: loss: 1.320464
7007: loss: 1.320464
7008: loss: 1.3204639
7009: loss: 1.3204639
7010: loss: 1.3204639
7011: loss: 1.3204639
7012: loss: 1.3204639
7013: loss: 1.3204639
7014: loss: 1.3204638
7015: loss: 1.3204639
7016: loss: 1.3204639
7017: loss: 1.3204639
7018: loss: 1.3204639
7019: loss: 1.3204639
7020: loss: 1.3204639
7021: loss: 1.3204639
7022: loss: 1.3204639
7023: loss: 1.3204639
7024: loss: 1.3204638
7025: loss: 1.3204637
7026: loss: 1.3204638
7027: loss: 1.3204638
7028: loss: 1.3204638
7029: loss: 1.3204638
7030: loss: 1.3204637
7031: loss: 1.3204639
7032: loss: 1.3204637
7033: loss: 1.3204638
7034: loss: 1.3204638
7035: loss: 1.3204637
7036: loss: 1.3204638
7037: loss: 1.3204637
7038: loss: 1.3204637
7039: loss: 1.3204637
7040: loss: 1.3204637
7041: loss: 1.3204637
7042: loss: 1.3204637
7043: loss: 1.3204637
7044: loss: 1.3204637
7045: loss: 1.3204637
7046: loss: 1

7471: loss: 1.32046
7472: loss: 1.32046
7473: loss: 1.32046
7474: loss: 1.32046
7475: loss: 1.32046
7476: loss: 1.32046
7477: loss: 1.32046
7478: loss: 1.32046
7479: loss: 1.32046
7480: loss: 1.32046
7481: loss: 1.32046
7482: loss: 1.32046
7483: loss: 1.32046
7484: loss: 1.32046
7485: loss: 1.32046
7486: loss: 1.32046
7487: loss: 1.32046
7488: loss: 1.32046
7489: loss: 1.3204601
7490: loss: 1.32046
7491: loss: 1.32046
7492: loss: 1.32046
7493: loss: 1.32046
7494: loss: 1.32046
7495: loss: 1.32046
7496: loss: 1.32046
7497: loss: 1.3204598
7498: loss: 1.3204598
7499: loss: 1.3204598
7500: loss: 1.3204598
7501: loss: 1.3204598
7502: loss: 1.3204598
7503: loss: 1.3204597
7504: loss: 1.3204598
7505: loss: 1.3204597
7506: loss: 1.3204598
7507: loss: 1.3204597
7508: loss: 1.3204597
7509: loss: 1.3204597
7510: loss: 1.3204597
7511: loss: 1.3204597
7512: loss: 1.3204598
7513: loss: 1.3204598
7514: loss: 1.3204597
7515: loss: 1.3204597
7516: loss: 1.3204597
7517: loss: 1.3204597
7518: loss: 1.32

7948: loss: 1.3204565
7949: loss: 1.3204565
7950: loss: 1.3204565
7951: loss: 1.3204565
7952: loss: 1.3204565
7953: loss: 1.3204565
7954: loss: 1.3204565
7955: loss: 1.3204565
7956: loss: 1.3204563
7957: loss: 1.3204565
7958: loss: 1.3204565
7959: loss: 1.3204564
7960: loss: 1.3204565
7961: loss: 1.3204564
7962: loss: 1.3204565
7963: loss: 1.3204565
7964: loss: 1.3204564
7965: loss: 1.3204564
7966: loss: 1.3204564
7967: loss: 1.3204564
7968: loss: 1.3204565
7969: loss: 1.3204564
7970: loss: 1.3204565
7971: loss: 1.3204564
7972: loss: 1.3204565
7973: loss: 1.3204564
7974: loss: 1.3204563
7975: loss: 1.3204563
7976: loss: 1.3204563
7977: loss: 1.3204563
7978: loss: 1.3204563
7979: loss: 1.3204563
7980: loss: 1.3204563
7981: loss: 1.3204563
7982: loss: 1.3204563
7983: loss: 1.3204563
7984: loss: 1.3204563
7985: loss: 1.3204563
7986: loss: 1.3204563
7987: loss: 1.3204563
7988: loss: 1.3204563
7989: loss: 1.3204563
7990: loss: 1.3204563
7991: loss: 1.3204563
7992: loss: 1.3204563
7993: loss

8323: loss: 1.320455
8324: loss: 1.320455
8325: loss: 1.3204551
8326: loss: 1.3204552
8327: loss: 1.3204552
8328: loss: 1.3204552
8329: loss: 1.3204552
8330: loss: 1.3204552
8331: loss: 1.3204553
8332: loss: 1.3204553
8333: loss: 1.3204554
8334: loss: 1.3204554
8335: loss: 1.3204554
8336: loss: 1.3204556
8337: loss: 1.3204557
8338: loss: 1.3204557
8339: loss: 1.3204558
8340: loss: 1.3204557
8341: loss: 1.3204559
8342: loss: 1.3204559
8343: loss: 1.3204561
8344: loss: 1.3204563
8345: loss: 1.3204563
8346: loss: 1.3204564
8347: loss: 1.3204564
8348: loss: 1.3204565
8349: loss: 1.3204565
8350: loss: 1.3204566
8351: loss: 1.3204567
8352: loss: 1.320457
8353: loss: 1.320457
8354: loss: 1.3204571
8355: loss: 1.3204572
8356: loss: 1.3204575
8357: loss: 1.3204576
8358: loss: 1.3204577
8359: loss: 1.3204579
8360: loss: 1.3204579
8361: loss: 1.3204582
8362: loss: 1.3204584
8363: loss: 1.3204584
8364: loss: 1.3204588
8365: loss: 1.3204589
8366: loss: 1.320459
8367: loss: 1.3204592
8368: loss: 1.3

8702: loss: 1.3205326
8703: loss: 1.3205326
8704: loss: 1.3205297
8705: loss: 1.3205297
8706: loss: 1.3205268
8707: loss: 1.3205271
8708: loss: 1.3205242
8709: loss: 1.3205242
8710: loss: 1.3205218
8711: loss: 1.3205218
8712: loss: 1.3205194
8713: loss: 1.3205194
8714: loss: 1.3205171
8715: loss: 1.3205171
8716: loss: 1.3205148
8717: loss: 1.3205147
8718: loss: 1.3205127
8719: loss: 1.3205125
8720: loss: 1.3205106
8721: loss: 1.3205106
8722: loss: 1.3205087
8723: loss: 1.3205085
8724: loss: 1.3205067
8725: loss: 1.3205066
8726: loss: 1.3205049
8727: loss: 1.3205047
8728: loss: 1.3205032
8729: loss: 1.3205031
8730: loss: 1.3205014
8731: loss: 1.3205013
8732: loss: 1.3204999
8733: loss: 1.3204998
8734: loss: 1.3204985
8735: loss: 1.3204982
8736: loss: 1.3204967
8737: loss: 1.3204966
8738: loss: 1.3204952
8739: loss: 1.3204951
8740: loss: 1.3204939
8741: loss: 1.3204938
8742: loss: 1.3204927
8743: loss: 1.3204925
8744: loss: 1.3204913
8745: loss: 1.3204911
8746: loss: 1.3204902
8747: loss

9178: loss: 1.3204553
9179: loss: 1.3204552
9180: loss: 1.3204553
9181: loss: 1.3204553
9182: loss: 1.3204553
9183: loss: 1.3204554
9184: loss: 1.3204554
9185: loss: 1.3204554
9186: loss: 1.3204554
9187: loss: 1.3204556
9188: loss: 1.3204556
9189: loss: 1.3204556
9190: loss: 1.3204556
9191: loss: 1.3204557
9192: loss: 1.3204556
9193: loss: 1.3204556
9194: loss: 1.3204557
9195: loss: 1.3204558
9196: loss: 1.3204558
9197: loss: 1.3204557
9198: loss: 1.3204558
9199: loss: 1.3204558
9200: loss: 1.3204558
9201: loss: 1.3204558
9202: loss: 1.3204559
9203: loss: 1.3204559
9204: loss: 1.320456
9205: loss: 1.320456
9206: loss: 1.320456
9207: loss: 1.320456
9208: loss: 1.3204561
9209: loss: 1.3204561
9210: loss: 1.3204563
9211: loss: 1.3204563
9212: loss: 1.3204563
9213: loss: 1.3204563
9214: loss: 1.3204564
9215: loss: 1.3204564
9216: loss: 1.3204565
9217: loss: 1.3204565
9218: loss: 1.3204565
9219: loss: 1.3204565
9220: loss: 1.3204565
9221: loss: 1.3204566
9222: loss: 1.3204566
9223: loss: 1.

9661: loss: 1.3207706
9662: loss: 1.320756
9663: loss: 1.3207655
9664: loss: 1.3207511
9665: loss: 1.3207603
9666: loss: 1.3207464
9667: loss: 1.3207551
9668: loss: 1.3207413
9669: loss: 1.32075
9670: loss: 1.3207366
9671: loss: 1.3207448
9672: loss: 1.3207316
9673: loss: 1.3207397
9674: loss: 1.320727
9675: loss: 1.3207347
9676: loss: 1.3207221
9677: loss: 1.3207297
9678: loss: 1.3207175
9679: loss: 1.3207248
9680: loss: 1.3207127
9681: loss: 1.3207198
9682: loss: 1.3207083
9683: loss: 1.3207151
9684: loss: 1.3207036
9685: loss: 1.3207102
9686: loss: 1.320699
9687: loss: 1.3207055
9688: loss: 1.3206946
9689: loss: 1.3207006
9690: loss: 1.32069
9691: loss: 1.3206961
9692: loss: 1.3206857
9693: loss: 1.3206913
9694: loss: 1.3206812
9695: loss: 1.3206869
9696: loss: 1.320677
9697: loss: 1.3206824
9698: loss: 1.3206729
9699: loss: 1.320678
9700: loss: 1.3206685
9701: loss: 1.3206738
9702: loss: 1.3206644
9703: loss: 1.3206694
9704: loss: 1.3206605
9705: loss: 1.3206652
9706: loss: 1.32065

In [49]:
#最下面的詞是最相似的詞彙，跟king最像的是queen，跟queen最像的是king，跟royal最像的是he。
print(find_cloest_word(word_set,session,'king'))
print(find_cloest_word(word_set, session, 'queen'))
print(find_cloest_word(word_set, session, 'royal'))

king : royal : 0.42829706118976796
king : the : 0.432654347918907
king : queen : 0.7038536450624304
king : is : 0.4148768217568367
king : she : 0.62598928069681
king : he : 0.6815452681310055
queen
queen : royal : 0.3988639124457104
queen : the : 0.2962021234165022
queen : is : 0.6035794969455115
queen : king : 0.7038536450624304
queen : she : 0.4554731595544457
queen : he : 0.49504036235149984
king
royal : the : 0.4012367045487718
royal : queen : 0.3988639124457104
royal : is : 0.345852228542322
royal : king : 0.42829706118976796
royal : she : 0.6104492627948609
royal : he : 0.5472441758267554
she


In [53]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize



In [55]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     D:\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [56]:
data = ["I love machine learning. Its awesome.",
        "I love coding in python",
        "I love building chatbots",
        "they chat amagingly well"]

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [57]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [58]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)


# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

V1_infer [ 0.00112234 -0.01450536  0.01524183  0.00732648  0.01099953  0.00433188
 -0.04448784  0.00564415  0.0334186  -0.00218492 -0.01646851 -0.00274537
 -0.03524528  0.02969047  0.01931373  0.00570897  0.03834916  0.03724717
 -0.03057521  0.00620591]
[('0', 0.9924670457839966), ('2', 0.9906097650527954), ('3', 0.9779882431030273)]
[ 0.00299269 -0.30312914 -0.08241992 -0.08183824  0.27034333  0.15273513
 -0.4920029   0.17166834  0.21045505 -0.01212543  0.06406254  0.05592326
 -0.12100182  0.1873077   0.25842386  0.1769873   0.46296185  0.21042728
 -0.3307561  -0.02665861]
