forked from candlewill/document_rating
-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc2vec_baseline_CVAT.py
47 lines (44 loc) · 1.64 KB
/
doc2vec_baseline_CVAT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
__author__ = 'NLP-PC'
import gensim
import os
import time
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv
from word2vec_fn import build_doc_vector
# '''
# Load the embeddings registered under the name 'CVAT_docvecs'.
# NOTE(review): presumably a trained gensim Doc2Vec model, given the
# `docvecs` / `vocab` attributes used below -- confirm in load_data.
model = load_embeddings('CVAT_docvecs')

# Quick sanity output: one document vector looked up by integer key, one by
# string tag, then the number of entries in the model vocabulary.
for doc_key in (1, 'SENT_23'):
    print(model.docvecs[doc_key])
print(len(model.vocab.keys()))

# Read the corpus and its annotation marks from the project-configured paths.
corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))

# Build the feature vectors from the corpus and the model, then the two gold
# targets (valence and arousal) from the corpus and the marks.
vecs = build_doc_vector(corpus, model)
valence, arousal = gold_valence_arousal(corpus, mark)

# Run the project's cross-validation routine once per target, valence first.
for gold_scores in (valence, arousal):
    cv(vecs, gold_scores, multivariant=True)
# '''
# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')
#
# # print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# # print(model.doesnt_match("breakfast cereal dinner lunch".split()))
# # print(model.similarity('woman', 'man'))
# # print(model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'], topn=10))
# # print(model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))
#
# from load_data import load_pickle
# words = load_pickle(get_file_path('words_in_wordvec'))
# print(words)
################################################
# Train doc2vec
# from word2vec_fn import TaggedLineSentence, train_docvecs
# sentence = TaggedLineSentence(load_corpus(get_file_path('cn_corpus')))
# train_docvecs(sentence)
#################################################