# Gensim Doc2Vec

모든 문장의 vector를 구한 다음 가장 비슷한 vector를 가진 문장을 찾는다.

In [1]:
import boto3
import pickle
import os
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

from tqdm import tqdm_notebook

pickle 파일이 존재하지 않으면 S3에서 문장 데이터를 다운받아 pickle 파일로 저장한다.

In [2]:
# Set up a corpus: a list of tokenized sentences (one list of tokens each).
common_texts = []
pickle_file = 'corpus.pickle'
# If the pickle cache does not exist yet, build the corpus from the sentence
# files (downloading them from S3 when they are not already on disk).
if not os.path.exists(pickle_file):
    bucket = boto3.resource('s3').Bucket('learningdatajchswm9')
    download_cnt = 0
    failures = []  # (file index, error) for every file that could not be used

    # The local copies mirror the S3 keys, so the target directory must exist
    # before the first download.
    os.makedirs('sentences', exist_ok=True)

    for i in tqdm_notebook(range(12876)):
        # The path depends on the file index, so it has to be built per
        # iteration; it doubles as both the S3 key and the local path.
        local_file = 'sentences/HOO' + str(i) + 'content.txt'
        try:
            if not os.path.exists(local_file):
                bucket.download_file(local_file, local_file)
            with open(local_file, 'r', encoding='UTF-8') as f:
                sentences = f.read().splitlines()
            common_texts += [word_tokenize(sent) for sent in sentences]
            download_cnt += 1
        except Exception as e:
            # Best effort: a file that is missing or unreadable is skipped,
            # but the failure is recorded instead of being silently dropped.
            failures.append((i, e))

    print('%d files downloaded. %d sentences saved.' % (download_cnt, len(common_texts)))
    if failures:
        first_index, first_error = failures[0]
        print('%d files failed. First failure: file %d: %r'
              % (len(failures), first_index, first_error))

    with open(pickle_file, 'wb') as f:
        pickle.dump(common_texts, f, pickle.HIGHEST_PROTOCOL)
# If the pickle cache already exists, load the corpus from it.
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # corpus.pickle that was produced by this notebook.
    with open(pickle_file, 'rb') as f:
        common_texts = pickle.load(f)

In [3]:
# Display the number of tokenized sentences held in the corpus.
len(common_texts)

6903759

모델을 학습시키거나 학습된 모델을 가져온다.

In [4]:
# Path at which the trained model is saved to / loaded from.
model_file = get_tmpfile("my_doc2vec_model")
# Ask whether to train a new model ('y') or load the saved one (anything
# else). Surrounding whitespace is stripped so that an answer such as "y "
# is not silently treated as "no".
learning_on = input('Learning on? (y/n)...').strip().lower()

Learning on? (y/n)...y


In [5]:
%%time
# Load the previously saved model unless the user asked for training.
if learning_on != 'y':
    model = Doc2Vec.load(model_file)
else:
    # Tag each tokenized sentence with its position in common_texts so that
    # a document vector can be mapped back to its sentence.
    documents = [
        TaggedDocument(tokens, [idx])
        for idx, tokens in enumerate(common_texts)
    ]
    # Build and train the model in one step.
    model = Doc2Vec(
        documents,
        dm=1,
        vector_size=150,
        window=3,
        min_count=100,
        workers=4,
        epochs=10,
        dbow_words=1,
    )

    # Save the freshly trained model so later runs can load it instead.
    model.save(model_file)

# Textual summary of the model's hyper-parameters, read back from the model.
model_parameters = (
    'vector_size=%d,window=%d,min_count=%d,epochs=%d,dbow_words=1'
    % (
        model.vector_size,
        model.window,
        model.vocabulary.min_count,
        model.epochs,
    )
)

CPU times: user 10min 31s, sys: 5min 19s, total: 15min 50s
Wall time: 12min 39s


사용자 입력을 받아 입력 문장과 가장 유사한 문장을 보여준다.

In [6]:
# Query sentences to look up in the model. Several fixed sentences are used
# here instead of a single sentence typed in by the user.
input_sents = [
    'Software is becoming an increasingly large part in the automotive industry',
    'We present a novel approach to modeling stories using recurrent neural networks',
    'Software for the module is a part of operating system',
    'An example illustrating the situation where the number of starting positions is limited because of the forest',
    'TSP is difficult to solve in large problems of aerial mapping because of a high number nodes',
]

In [7]:
%%time
# For every query sentence: infer its vector, look up the five most similar
# corpus sentences, append them to a log file and count how many of the hits
# are token-for-token identical to the query.
same_cnt = 0

# Number of inferred vectors averaged per query; infer_vector is called
# repeatedly and the mean of the results is used for the lookup.
vector_average_n = 100

# One log file per hyper-parameter configuration. The directory is created
# up front so that opening the file cannot fail on a fresh checkout.
os.makedirs('logs', exist_ok=True)
log_path = 'logs/' + model_parameters + '.txt'

for input_sent in input_sents:
    input_sent = word_tokenize(input_sent)

    # Average several inferred vectors for the same sentence.
    vector = 0
    for _ in range(vector_average_n):
        vector += model.infer_vector(input_sent)
    vector = vector / vector_average_n

    # Search for the most similar sentences; each result is a
    # (sentence index, similarity) pair.
    similar_sentences = model.docvecs.most_similar(positive=[vector], topn=5)

    # Log the query and its hits. The encoding matches the one used to read
    # the corpus files, and a space separates each sentence from its score.
    with open(log_path, 'a', encoding='UTF-8') as f:
        f.write('input_sent:' + str(input_sent) + '\n')
        for sent_num, similarity in similar_sentences:
            f.write(str(common_texts[sent_num]) + ' ' + str(similarity) + '\n')
        f.write('\n')

    # Count how many of the top 5 hits are exactly the query sentence.
    for sent_num, similarity in similar_sentences:
        if common_texts[sent_num] == input_sent:
            same_cnt += 1
print(same_cnt)

1
CPU times: user 11.5 s, sys: 39.9 s, total: 51.4 s
Wall time: 1min 35s
