# Gensim Doc2Vec

모든 문장의 vector를 구한 다음 가장 비슷한 vector를 가진 문장을 찾는다.

1. 전체 문장을 소문자화 한다.
2. 알파벳으로만 된 단어 개수가 5개 이상, 단어 개수가 15개 이하인 문장만 사용한다.

In [4]:
import boto3
import pickle
import os
import re
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from tqdm import tqdm_notebook

# File names for the persisted Doc2Vec model and the cached tokenized corpus.
model_file = 'doc2vec.model'
pickle_file = 'corpus.pickle'

# 'y' trains a new model; any other answer loads the saved one. Surrounding
# whitespace is stripped so that an answer such as 'y ' is still recognised.
learning_on = input('Learning on? (y/n)...').strip().lower()

Learning on? (y/n)...n


pickle 파일이 존재하지 않거나 덮어쓰기를 선택하면 S3에서 문장 데이터를 다운받아 pickle 파일로 저장하고, 그렇지 않으면 기존 pickle 파일에서 문장을 불러온다.

In [None]:
# Pattern for a run of lowercase ASCII letters, intended to recognise
# alphabetic words. Note that `p.match(word)` only anchors at the start of the
# token, so it also accepts tokens such as 'abc123'.
p = re.compile('[a-z]+')
common_texts = []  # one list of tokens per sentence

# NOTE(review): the introduction of this notebook says that only sentences with
# at least 5 alphabetic words and at most 15 words are used, but no such filter
# is applied below -- confirm which behaviour is intended.

pickle_exists = os.path.exists(pickle_file)

# Only ask about overwriting when there is a cached corpus to overwrite.
overwrite = 'n'
if pickle_exists:
    overwrite = input('Overwrite? (y/n)...').strip().lower()

if overwrite == 'y' or not pickle_exists:
    bucket = boto3.resource('s3').Bucket('learningdatajchswm9')
    download_cnt = 0

    # The S3 keys double as local paths, so the target directory has to exist
    # before the first download is written into it.
    os.makedirs('sents', exist_ok=True)

    for i in tqdm_notebook(range(12876)):
        # Alternative input: 'sents/HOO' + str(i) + 'abstract.txt'
        local_file = 'sents/HOO' + str(i) + 'content.txt'

        try:
            # Files that are already on disk are not downloaded again.
            if not os.path.exists(local_file):
                bucket.download_file(local_file, local_file)

            # One sentence per line; each sentence is lower-cased and tokenized.
            with open(local_file, 'r', encoding='UTF-8') as f:
                for sent in f.read().splitlines():
                    common_texts.append(word_tokenize(sent.lower()))

            # Counts every file that was read successfully, including files
            # that were already present locally.
            download_cnt += 1

        except Exception as e:
            # Best effort: report the failing file index and keep going.
            print(i, e)

    with open(pickle_file, 'wb') as f:
        pickle.dump(common_texts, f, pickle.HIGHEST_PROTOCOL)
    print('%d files downloaded. %d sentences saved.' % (download_cnt, len(common_texts)))
else:
    # pickle.load can execute arbitrary code; only load a corpus file that was
    # written by this notebook.
    with open(pickle_file, 'rb') as f:
        common_texts = pickle.load(f)
        print(len(common_texts), 'sentences loaded.')

Overwrite? (y/n)...n


모델을 학습시키거나 학습된 모델을 가져온다.

In [None]:
%%time

# Either load the previously saved model or train a fresh one, depending on the
# answer given at the 'Learning on?' prompt.
if learning_on != 'y':
    model = Doc2Vec.load(model_file)
else:
    # Tag every sentence with its index in common_texts so that a document tag
    # can be mapped back to the sentence it came from.
    documents = [
        TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(common_texts)
    ]
    model = Doc2Vec(
        documents,
        vector_size=300,
        window=7,
        min_count=20,
        workers=8,
        epochs=25,
        dm=1,
        dm_mean=1,
        dbow_words=1,
    )
    model.save(model_file)

# Summary of the model's hyper-parameters; later cells use it as the log file name.
model_parameters = 'vector_size=%d,window=%d,min_count=%d,epochs=%d' % (
    model.vector_size,
    model.window,
    model.vocabulary.min_count,
    model.epochs,
)

In [None]:
# Drop state that is only needed while training, now that the model is only
# used for inference and similarity queries.
# NOTE(review): this method is part of the gensim 3.x API -- confirm that it
# exists in the installed gensim version.
model.delete_temporary_training_data()

In [3]:
%%time

input_sents = [
    'Software is becoming an increasingly large part in the automotive industry.',
    'Experimental results on real and synthetic datasets demonstrate the effectiveness of our model.',
    'This thesis introduces dynamic software updating.',
    'We also describe a variant that scales to high-dimensional domains.',
    'Cloud computing faces many problems.',
    'update',
    'We consider that we can face problems',
    'We faces many problems',
    'We are developing something good.',
]

average_n = 10     # number of inference runs averaged per input sentence
min_sent_len = 5   # sentences with fewer tokens are rejected
max_word_len = 20  # a sentence containing a token this long is rejected

# The hyper-parameter summary doubles as the log file name. The directory is
# created up front because opening the file fails when 'logs/' is missing.
log_path = 'logs/' + model_parameters + '.txt'
os.makedirs('logs', exist_ok=True)

for input_sent in input_sents:
    input_sent = word_tokenize(input_sent.lower())

    # Average several inference runs, presumably to smooth out run-to-run
    # variation of infer_vector.
    vector = sum(model.infer_vector(input_sent) for _ in range(average_n)) / average_n

    # Search for the most similar sentences: a list of (tag, similarity) pairs.
    similar_sentences = model.docvecs.most_similar(positive=[vector], topn=10)

    # Drop odd-looking sentences from the top-n results. The checks run on the
    # token list looked up through the tag; iterating the (tag, similarity)
    # pair itself would call len() on an int and raise TypeError.
    good_sents = []
    for sent_num, score in similar_sentences:
        words = common_texts[sent_num]
        # Too short to be a useful sentence.
        if len(words) < min_sent_len:
            continue
        # Over-long tokens are rejected outright.
        if any(len(word) >= max_word_len for word in words):
            continue
        # At least half of the tokens must consist of lowercase letters only;
        # fullmatch is used because match() would also accept e.g. 'abc123'.
        good_word_cnt = sum(1 for word in words if p.fullmatch(word))
        if good_word_cnt < 0.5 * len(words):
            continue
        good_sents.append((sent_num, score))

    # NOTE(review): good_sents is collected but the log below still writes
    # every entry of similar_sentences -- confirm whether the log should be
    # restricted to good_sents.

    # Log
    with open(log_path, 'a+', encoding='UTF-8') as f:
        f.write(''.join(word + ' ' for word in input_sent))
        f.write('<- input_sent\n')
        for sent_num, score in similar_sentences:
            f.write(''.join(word + ' ' for word in common_texts[sent_num]))
            f.write(str(score) + '\n')
        f.write('\n')

NameError: name 'model' is not defined

In [223]:
# Words whose vectors are closest to 'describe'. The query goes through
# `model.wv` because calling `most_similar` directly on the model is deprecated
# in gensim.
model.wv.most_similar(positive=['describe'])

  """Entry point for launching an IPython kernel.


[('explore', 0.7097293734550476),
 ('outline', 0.69687819480896),
 ('discuss', 0.6799103617668152),
 ('introduce', 0.6348254680633545),
 ('investigate', 0.6238253116607666),
 ('illustrate', 0.5804440975189209),
 ('implement', 0.5782642364501953),
 ('analyze', 0.5741323232650757),
 ('employ', 0.572858452796936),
 ('deﬁne', 0.5725528001785278)]

전체 문장을 돌면서 각 문장에 대해 가장 비슷한 상위 5개 문장과의 유사도로 점수를 매기고, 점수가 높은 순으로 문장을 정렬한다.

In [117]:
vector_average_n = 5  # number of inference runs averaged per sentence

# similarity[i] is the mean similarity between sentence i and its nearest
# neighbours, ignoring neighbours whose tokens are identical to sentence i.
similarity = [0.0] * len(common_texts)

for i, input_sent in enumerate(tqdm_notebook(common_texts)):
    # Infer the vector of this sentence, averaged over several runs.
    vector = sum(
        model.infer_vector(input_sent) for _ in range(vector_average_n)
    ) / vector_average_n

    # Search for the most similar sentences: a list of (tag, similarity) pairs.
    similar_sentences = model.docvecs.most_similar(positive=[vector], topn=5)

    # Keep only neighbours that are not a copy of the input sentence.
    other_scores = [
        simil_point
        for j, simil_point in similar_sentences
        if common_texts[j] != input_sent
    ]

    # Divide by the number of neighbours actually counted; the score stays 0.0
    # when every neighbour is identical to the input sentence.
    if other_scores:
        similarity[i] = sum(other_scores) / len(other_scores)

# (sentence index, score) pairs, highest score first.
sorted_by_similarity = sorted(enumerate(similarity), key=lambda tup: tup[1], reverse=True)

HBox(children=(IntProgress(value=0, max=31586), HTML(value='')))

In [119]:
vector_average_n = 5  # number of inference runs averaged per sentence

# The hyper-parameter summary doubles as the log file name. The directory is
# created up front because opening the file fails when 'logs/' is missing.
log_path = 'logs/' + model_parameters + '.txt'
os.makedirs('logs', exist_ok=True)

with open(log_path, 'a+', encoding='UTF-8') as f:
    # Log the sentences ranked 1000..1099 together with their nearest neighbours.
    for sent_index, _score in sorted_by_similarity[1000:1100]:
        input_sent = common_texts[sent_index]

        # Infer the vector of this sentence, averaged over several runs.
        vector = sum(
            model.infer_vector(input_sent) for _ in range(vector_average_n)
        ) / vector_average_n

        # Search for the most similar sentences: (tag, similarity) pairs.
        similar_sentences = model.docvecs.most_similar(positive=[vector], topn=5)

        # Log: the input sentence first, then one neighbour per line. The
        # similarity values are not written, and they are not bound to the name
        # `similarity`, so the score list built by the ranking cell stays intact.
        f.write(''.join(word + ' ' for word in input_sent))
        f.write('<- input_sent\n')
        for sent_num, _neighbour_score in similar_sentences:
            f.write(''.join(word + ' ' for word in common_texts[sent_num]))
            f.write('\n')
        f.write('\n')

print('done')

done


랜덤 문장을 선택해 그것과 가장 비슷한 문장을 뽑아주는 것

In [None]:
import random

num_sents = len(common_texts)  # total number of sentences
vector_average_n = 100         # number of inference runs averaged per sentence

# The hyper-parameter summary doubles as the log file name. The directory is
# created up front because opening the file fails when 'logs/' is missing.
log_path = 'logs/' + model_parameters + '.txt'
os.makedirs('logs', exist_ok=True)

with open(log_path, 'a+', encoding='UTF-8') as f:
    for _ in range(5):
        # Pick a random sentence from the corpus.
        randn = random.randrange(num_sents)
        input_sent = common_texts[randn]

        # Infer its vector, averaged over several runs. The generator has its
        # own loop variable, so the outer loop variable is no longer reused.
        vector = sum(
            model.infer_vector(input_sent) for _run in range(vector_average_n)
        ) / vector_average_n

        # Search for the most similar sentences: (tag, similarity) pairs.
        similar_sentences = model.docvecs.most_similar(positive=[vector], topn=5)

        # Log: the input sentence first, then one neighbour per line. The
        # similarity values are not written, and they are not bound to the name
        # `similarity`, so the score list built by the ranking cell stays intact.
        f.write(''.join(word + ' ' for word in input_sent))
        f.write('<- input_sent\n')
        for sent_num, _neighbour_score in similar_sentences:
            f.write(''.join(word + ' ' for word in common_texts[sent_num]))
            f.write('\n')
        f.write('\n')