# Gensim Doc2Vec

모든 문장의 vector를 구한 다음 가장 비슷한 vector를 가진 문장을 찾는다.

1. 전체 문장을 소문자화 한다.
2. 알파벳으로만 된 단어 개수가 5개 이상, 단어의 글자 수가 20 이하인 문장만 보여준다.

In [1]:
import boto3
import pickle
import os
import re
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from tqdm import tqdm_notebook



In [2]:
# Local directory that holds the sentence files fetched from S3.
# makedirs(exist_ok=True) creates it only when missing, without a separate
# exists() check that could race with another process.
os.makedirs('txtData', exist_ok=True)

model_file = 'doc2vec.model'                 # saved Doc2Vec model
pickle_file = 'corpus.pickle'                # cached raw sentences
common_texts_pickle = 'common_texts.pickle'  # cached tokenized sentences

# One or more ASCII letters. p.match() anchors only at the start of the
# string; use p.fullmatch() to require that a whole token is alphabetic.
p = re.compile('[a-zA-Z]+')

# 'y' trains a new model, any other answer loads the saved one.
# strip() so that answers such as 'y ' or ' Y' are still recognised.
learning_on = input('Learning on? (y/n)...').strip().lower()

# Ask about overwriting only when a cached corpus already exists.
overwrite = 'n'
if os.path.exists(pickle_file):
    overwrite = input('Overwrite? (y/n)...').strip().lower()

Learning on? (y/n)...y
Overwrite? (y/n)...n


pickle 파일이 존재하지 않거나 덮어쓰기(Overwrite)를 선택하면 S3에서 문장 데이터를 다운받아 pickle 파일로 저장하고, 그렇지 않으면 기존 pickle 파일을 불러온다.

In [3]:
original_sents = []  # raw sentences, kept in their original case for display
common_texts = []    # tokenized sentences, index-aligned with original_sents

# Both cache files are read in the load branch, so both must exist before the
# rebuild can be skipped.
caches_exist = os.path.exists(pickle_file) and os.path.exists(common_texts_pickle)

if overwrite == 'y' or not caches_exist:
    bucket = boto3.resource('s3').Bucket('learningdatajchswm9')
    download_cnt = 0  # files read successfully (freshly downloaded or already on disk)

    for i in tqdm_notebook(range(12876)):
        local_file = 'txtData/HOO' + str(i) + '.txt'

        try:
            # The S3 object key is the same string as the local relative path.
            if not os.path.exists(local_file):
                bucket.download_file(local_file, local_file)

            with open(local_file, 'r', encoding='UTF-8') as f:
                for sent in f.read().splitlines():
                    sent2 = sent.strip()
                    if not sent2:
                        continue
                    # Tokenize BEFORE appending to either list: if the
                    # tokenizer raises, neither list grows, so the indices of
                    # the two lists stay aligned.
                    # Tokens are lower-cased because queries are lower-cased
                    # before inference. A cache that still holds mixed-case
                    # tokens must be rebuilt (answer 'y' to the overwrite
                    # prompt) for this to take effect.
                    tokens = word_tokenize(sent2.lower())
                    original_sents.append(sent2)
                    common_texts.append(tokens)

            download_cnt += 1

        except Exception as e:
            # Best effort: report the failing file index and continue.
            print(i, e)

    with open(pickle_file, 'wb') as f:
        pickle.dump(original_sents, f, pickle.HIGHEST_PROTOCOL)
    with open(common_texts_pickle, 'wb') as f:
        pickle.dump(common_texts, f, pickle.HIGHEST_PROTOCOL)
    print('%d files downloaded. %d sentences saved.' % (download_cnt, len(original_sents)))
else:
    # NOTE: pickle.load can execute code embedded in the file. Only load
    # cache files that were produced locally by the branch above.
    with open(pickle_file, 'rb') as f:
        original_sents = pickle.load(f)
    with open(common_texts_pickle, 'rb') as f:
        common_texts = pickle.load(f)
    print(len(original_sents), 'sentences loaded.')

3722837 sentences loaded.


모델을 학습시키거나 학습된 모델을 가져온다.

In [None]:
%%time

if learning_on == 'y':
    # Tag every tokenized sentence with its position in common_texts; the tag
    # is later used as an index into original_sents.
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

    # Build the model WITHOUT a corpus. Passing documents to the constructor
    # already runs a full training pass, so a following train() call would
    # train the same model a second time.
    # epochs is set here so that model.epochs (used in the log file name below
    # and as the default number of infer_vector passes) matches the training.
    model = Doc2Vec(vector_size=300, window=5, min_count=50, workers=8,
                    dm=1, dm_mean=1, dbow_words=1, epochs=25)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs,
                start_alpha=0.025, end_alpha=0.001)

    model.save(model_file)

else:
    model = Doc2Vec.load(model_file)

# Hyper-parameter summary; used as the log file name when querying.
model_parameters = 'vector_size=%d,window=%d,min_count=%d,epochs=%d' % (model.vector_size, model.window, model.vocabulary.min_count, model.epochs)

In [None]:
# Drop training-only state from the model before querying it.
# NOTE(review): this method belongs to the gensim 3.x API; presumably the
# document vectors and inference ability used below are kept by default —
# confirm against the installed gensim version.
model.delete_temporary_training_data()

In [None]:
%%time

input_sents = [
    'Software is becoming an increasingly large part in the automotive industry.',
    'Experimental results on real and synthetic datasets demonstrate the effectiveness of our model.',
    'This thesis introduces dynamic software updating.',
    'We also describe a variant that scales to high-dimensional domains.',
    'Cloud computing faces many problems.',
    'update',
    'We consider that we can face problems',
    'We faces many problems',
    'We are developing something good.',
    'The revised open-ended questions were then posed to 249 students during an end-of-term final exam study session.',
    'This paper suggests the use of formal models',
    'In this paper, we prove that',
    'We showed that it is used to',
    'Why do this?',
    'In this paper, we focus on the problem of generating adversarial examples for Natural Language Inference (NLI) models in order to gain insights about the inner workings of such systems, and regularising them.',
]


def _is_good_sentence(tokens):
    """Return True when a tokenized sentence looks like clean English text.

    A sentence is rejected when it has fewer than 5 tokens, when any token is
    20 or more characters long, or when fewer than 67% of its tokens consist
    solely of ASCII letters (pattern ``p``).

    Args:
        tokens: list of word tokens of one sentence.
    """
    if len(tokens) < 5:
        return False
    if any(len(token) >= 20 for token in tokens):
        return False
    alphabetic_cnt = sum(1 for token in tokens if p.fullmatch(token))
    return alphabetic_cnt >= 0.67 * len(tokens)


average_n = 10  # number of inferred vectors averaged per query
log_path = 'logs/' + model_parameters + '.txt'
# open(..., 'a+') does not create missing parent directories.
os.makedirs('logs', exist_ok=True)

for input_sent in input_sents:
    input_token = word_tokenize(input_sent.lower())

    # Average several inferred vectors, presumably to smooth out the
    # run-to-run variation of infer_vector.
    vector = sum(model.infer_vector(input_token) for _ in range(average_n)) / average_n

    # Search for the most similar sentences
    similar_sentences = model.docvecs.most_similar(positive=[vector], topn=10)

    # Drop low-quality and duplicate sentences from the topn candidates.
    good_sents = []     # (sentence index, similarity) pairs that passed
    seen_texts = set()  # sentence texts already accepted for this query
    for sent_num, similarity in similar_sentences:
        sent = original_sents[sent_num]
        if sent in seen_texts:
            continue
        # Check words, not characters: the stored sentence is a raw string,
        # so it has to be tokenized first.
        if not _is_good_sentence(word_tokenize(sent)):
            continue
        seen_texts.add(sent)
        good_sents.append((sent_num, similarity))

    show_cnt = 0  # at most 3 sentences are logged per query
    # Log
    with open(log_path, 'a+', encoding='UTF-8') as f:
        f.write(str(input_sent) + ' <- input_sent\n')
        for sent_num, similarity in good_sents:
            # Stop at the first candidate below the threshold; this assumes
            # most_similar returns candidates in descending similarity.
            if similarity < 0.5:
                break
            f.write(str(original_sents[sent_num]) + ' ' + str(similarity) + '\n')
            show_cnt += 1
            if show_cnt >= 3:
                break
        f.write('\n')

In [None]:
# Words whose vectors are closest to 'describe'.
# Word-level similarity is queried through the word vectors (model.wv);
# calling most_similar() on the Doc2Vec model itself is only a deprecated
# alias in gensim 3.x.
model.wv.most_similar(positive=['describe'])

## To do
## Done
- 중복 문장 제거하기