In [9]:
from gensim.models import Word2Vec
import nltk
from os import listdir,path,rename,walk,makedirs
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from pandas import DataFrame
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# import numpy as np
#from gensim.test.utils import common_texts

In [10]:
def is_noun(tag):
    """Return True when ``tag`` is one of the Penn Treebank noun tags."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True when ``tag`` is one of the Penn Treebank verb tags."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True when ``tag`` is one of the Penn Treebank adverb tags."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True when ``tag`` is one of the Penn Treebank adjective tags."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Adjective, noun, adverb and verb tags map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively (checked in that order).
    Any other tag falls back to the literal ``'n'``.
    """
    if is_adjective(tag):
        return wn.ADJ
    if is_noun(tag):
        return wn.NOUN
    if is_adverb(tag):
        return wn.ADV
    if is_verb(tag):
        return wn.VERB
    # Default POS for tags outside the four categories above.
    return 'n'

def is_Part_of_speech(tag):
    """Classify a Penn Treebank tag into a coarse part-of-speech label.

    Returns ``'noun'``, ``'verb'``, ``'adverb'`` or ``'adjective'`` for tags
    in the corresponding group, and ``'X'`` for every other tag.
    """
    label_by_tags = (
        ('noun', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('adverb', ('RB', 'RBR', 'RBS')),
        ('adjective', ('JJ', 'JJR', 'JJS')),
    )
    for label, group in label_by_tags:
        if tag in group:
            return label
    return 'X'

In [11]:
# Shared WordNet lemmatizer instance, reused by the corpus-processing cells below.
lemmatizer = WordNetLemmatizer()

In [12]:
# Directory locations used by the notebook.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
# save_path: where the trained Word2Vec model file is written / loaded.
save_path = 'C:/Users/Tingchun.TC.Hung/Desktop/Record/知識圖譜/Word2vec/'
# stop_word_path: directory holding the English stop-word list.
stop_word_path = 'C:/Users/Tingchun.TC.Hung/Desktop/Record/知識圖譜/Word2vec/'
# target_path: root directory of the text corpus that is walked file by file.
target_path = 'C:/Users/Tingchun.TC.Hung/Desktop/Record/知識圖譜/語料庫/PubMed資料集/'

In [13]:
#載入停用詞
# Load the stop-word list: the first whitespace-separated field of every
# non-blank line in the stop-word file.
stop_word = []
with open(stop_word_path+'英文停用詞.txt','r',encoding='utf-8') as file :
    for line in file:
        fields = line.split()
        # Blank / whitespace-only lines have no fields; indexing [0] on them
        # would raise IndexError, so they are skipped.
        if not fields:
            continue
        # A UTF-8 byte-order mark is not whitespace, so it would otherwise stay
        # glued to the first word of the file and that word would never match.
        word = fields[0].lstrip('\ufeff')
        if word:
            stop_word.append(word)

In [14]:
def replace_word(txt):
    """Strip noise characters from a token and report whether it ended a sentence.

    Processing order:
      1. remove byte-order marks and the long dash separator string;
      2. remove every character of the punctuation set below (hyphen and
         period are deliberately not part of that set);
      3. if the remaining text ends with ``'.'``, remove *all* periods from it
         and flag the token as sentence-final.

    Returns:
        tuple: ``(cleaned_text, stop_sentence)`` where ``stop_sentence`` is
        True only when the cleaned text ended with a period.
    """
    cleaned = txt.replace('\ufeff','').replace('-------------------------------','')
    # Single-character deletions; equivalent to replacing each one in turn.
    cleaned = cleaned.translate(str.maketrans('', '', '!"#$&()*+,/:;<=>?@[\\]^_{|}·~‘’⦁'))

    stop_sentence = cleaned.endswith('.')
    if stop_sentence:
        cleaned = cleaned.replace('.',"")

    return cleaned,stop_sentence

In [15]:
import re
import string
import spacy


class HearstPatterns(object):
    """Extract hypernym/hyponym pairs from text using Hearst patterns.

    Text is parsed with spaCy, every noun chunk is rewritten as a single
    ``NP_<lemma>_<lemma>`` token inside the lemmatized sentence, and a list of
    regular expressions (the Hearst patterns) is then searched in the
    resulting string.
    """

    def __init__(self, extended=False):
        """Build the pattern list and load the spaCy pipeline.

        Args:
            extended: when True, the larger set of additional patterns is
                appended to the five base patterns.
        """

        # Adjective-like lemmas dropped from noun chunks before they are
        # turned into NP_ tokens (generally quantifiers of plurals).
        self.__adj_stopwords = [
            'able', 'available', 'brief', 'certain',
            'different', 'due', 'enough', 'especially', 'few', 'fifth',
            'former', 'his', 'howbeit', 'immediate', 'important', 'inc',
            'its', 'last', 'latter', 'least', 'less', 'likely', 'little',
            'many', 'ml', 'more', 'most', 'much', 'my', 'necessary',
            'new', 'next', 'non', 'old', 'other', 'our', 'ours', 'own',
            'particular', 'past', 'possible', 'present', 'proud', 'recent',
            'same', 'several', 'significant', 'similar', 'such', 'sup', 'sure'
        ]

        # now define the Hearst patterns
        # format is <hearst-pattern>, <general-term>
        # so, what this means is that if you apply the first pattern,
        # the first Noun Phrase (NP)
        # is the general one, and the rest are specific NPs
        self.__hearst_patterns = [
            (
                '(NP_\\w+ (, )?such as (NP_\\w+ ?(, )?(and |or )?)+)',
                'first'
            ),
            (
                '(such NP_\\w+ (, )?as (NP_\\w+ ?(, )?(and |or )?)+)',
                'first'
            ),
            (
                '((NP_\\w+ ?(, )?)+(and |or )?other NP_\\w+)',
                'last'
            ),
            (
                '(NP_\\w+ (, )?include (NP_\\w+ ?(, )?(and |or )?)+)',
                'first'
            ),
            (
                '(NP_\\w+ (, )?especially (NP_\\w+ ?(, )?(and |or )?)+)',
                'first'
            ),
        ]

        if extended:
            # NOTE(review): the "e.g." / "i.e." patterns below leave the dots
            # unescaped, so each dot matches any character. Left unchanged to
            # keep matching behaviour identical.
            self.__hearst_patterns.extend([
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?any other NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?some other NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?be a NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?like (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    'such (NP_\\w+ (, )?as (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?like other NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?one of the NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?one of these NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?one of those NP_\\w+)',
                    'last'
                ),
                (
                    'example of (NP_\\w+ (, )?be (NP_\\w+ ? '
                    '(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?be example of NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?for example (, )?'
                    '(NP_\\w+ ?(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?which be call NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?which be name NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?mainly (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?mostly (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?notably (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?particularly (NP_\\w+ ? '
                    '(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?principally (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?in particular (NP_\\w+ ? '
                    '(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?except (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?other than (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?e.g. (, )?(NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ \\( (e.g.|i.e.) (, )?(NP_\\w+ ? (, )?(and |or )?)+'
                    '(\\. )?\\))',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?i.e. (, )?(NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and|or)? a kind of NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and|or)? kind of NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and|or)? form of NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?which look like NP_\\w+)',
                    'last'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?which sound like NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?which be similar to (NP_\\w+ ? '
                    '(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?example of this be (NP_\\w+ ? '
                    '(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?type (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )? NP_\\w+ type)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?whether (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(compare (NP_\\w+ ?(, )?)+(and |or )?with NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?compare to (NP_\\w+ ? (, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '(NP_\\w+ (, )?among -PRON- (NP_\\w+ ? '
                    '(, )?(and |or )?)+)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and |or )?as NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )? (NP_\\w+ ? (, )?(and |or )?)+ '
                    'for instance)',
                    'first'
                ),
                (
                    '((NP_\\w+ ?(, )?)+(and|or)? sort of NP_\\w+)',
                    'last'
                ),
                (
                    '(NP_\\w+ (, )?which may include (NP_\\w+ '
                    '?(, )?(and |or )?)+)',
                    'first'
                )
            ])

        self.__spacy_nlp = spacy.load('en_core_web_sm')

    def chunk(self, rawtext):
        """Return one NP-tagged, lemmatized string per sentence of ``rawtext``.

        Each noun chunk found by spaCy is replaced in the sentence's lemma
        string by ``NP_`` followed by its remaining lemmas joined with ``_``.
        Chunks that are exactly "type", or "example" used in "for example" /
        "example of", are left untouched so the patterns can match them as
        literal words.

        Args:
            rawtext: text to parse.

        Returns:
            list[str]: the rewritten sentences, in document order.
        """
        doc = self.__spacy_nlp(rawtext)
        # Built once per call instead of once per token.
        skip_lemmas = set(self.__adj_stopwords) | {"i.e.", "e.g."}
        chunks = []
        for sentence in doc.sents:
            sentence_text = sentence.lemma_
            for chunk in sentence.noun_chunks:
                chunk_lemma_lower = chunk.lemma_.lower()
                if chunk_lemma_lower == "example":
                    # chunk.start is a document-level token index, so the
                    # neighbours are looked up on the doc (not the sentence)
                    # and only when they exist inside the same sentence.
                    # Indexing the sentence with it picked the wrong token
                    # in later sentences, raised IndexError near the
                    # sentence end, and wrapped around at position 0.
                    start = chunk.start
                    pre_token = ""
                    post_token = ""
                    if start > sentence.start:
                        pre_token = doc[start - 1].lemma_.lower()
                    if start + 1 < sentence.end:
                        post_token = doc[start + 1].lemma_.lower()
                    # Unlike the old `start > 0` guard, a sentence-initial
                    # "example of" is now skipped too, so the literal
                    # 'example of ...' pattern can match it.
                    if pre_token == "for" or post_token == "of":
                        continue
                if chunk_lemma_lower == "type":
                    continue
                chunk_arr = []
                replace_arr = []
                for token in chunk:
                    if token.lemma_ in skip_lemmas:
                        continue
                    chunk_arr.append(token.lemma_)
                    # Remove punctuation and stopword adjectives
                    # (generally quantifiers of plurals)
                    if token.lemma_.isalnum():
                        replace_arr.append(token.lemma_)
                    else:
                        replace_arr.append(''.join(
                            char for char in token.lemma_ if char.isalnum()
                        ))
                if len(chunk_arr) == 0:
                    # Every token was a stopword: fall back to the last lemma
                    # so there is still something to search for.
                    chunk_arr.append(chunk[-1].lemma_)
                chunk_lemma = ' '.join(chunk_arr)
                replacement_value = 'NP_' + '_'.join(replace_arr)
                if chunk_lemma:
                    sentence_text = re.sub(r'\b%s\b' % re.escape(chunk_lemma),
                                           r'%s' % replacement_value,
                                           sentence_text)
            chunks.append(sentence_text)
        return chunks

    def find_hyponyms(self, rawtext):
        """Main entry point: extract hypernym pairs from ``rawtext``.

        Every pattern is searched once per sentence (first match only). The
        NP tokens of a match are split into one general term and the
        remaining specific terms according to the pattern's 'first'/'last'
        marker.

        Args:
            rawtext: text to process.

        Returns:
            list[tuple[str, str]]: ``(specific-term, general-term)`` pairs,
            de-duplicated, in order of first appearance.
        """

        hyponyms = []
        seen_pairs = set()  # O(1) duplicate check; `hyponyms` keeps the order
        np_tagged_sentences = self.chunk(rawtext)

        for sentence in np_tagged_sentences:
            # two or more NPs next to each other should be merged
            # into a single NP, it's a chunk error

            for (hearst_pattern, parser) in self.__hearst_patterns:
                matches = re.search(hearst_pattern, sentence)
                if not matches:
                    continue
                match_str = matches.group(0)

                nps = [a for a in match_str.split() if a.startswith("NP_")]

                if parser == "first":
                    general = nps[0]
                    specifics = nps[1:]
                else:
                    general = nps[-1]
                    specifics = nps[:-1]

                general_term = self.clean_hyponym_term(general)
                for specific in specifics:
                    pair = (self.clean_hyponym_term(specific), general_term)
                    # reduce duplicates
                    if pair not in seen_pairs:
                        seen_pairs.add(pair)
                        hyponyms.append(pair)

        return hyponyms

    def clean_hyponym_term(self, term):
        """Turn an ``NP_a_b`` token back into the plain phrase ``'a b'``."""
        # good point to do the stemming or lemmatization
        return term.replace("NP_", "").replace("_", " ")

In [19]:
# Hearst-pattern extractor with the extended pattern set enabled.
h = HearstPatterns(extended = True)

In [16]:
def tokenize_English(each_sentence,Near_word_title,Near_word,near_range):
    """Accumulate word and leading n-gram counts for one sentence.

    The sentence is split on whitespace; only its last token is cleaned with
    ``replace_word`` (and dropped if nothing is left). For every token
    position the token itself is counted, followed by every phrase that
    starts at that token and spans 2 .. ``near_range + 1`` tokens (truncated
    at the end of the sentence).

    Args:
        each_sentence: raw sentence text.
        Near_word_title: list of head words seen so far; updated in place.
        Near_word: list parallel to ``Near_word_title``; each entry is
            ``[[head, phrase, ...], [count, count, ...]]``; updated in place.
        near_range: maximum number of extra tokens appended to a head word.

    Returns:
        tuple: the same ``(Near_word_title, Near_word)`` objects.
    """
    tokens = each_sentence.split()
    if not tokens:
        # Empty / whitespace-only sentence: nothing to count (indexing the
        # last token used to raise IndexError here).
        return Near_word_title,Near_word

    last_word = replace_word(tokens[-1])[0]
    tokens = tokens[:-1]
    if last_word != '':
        tokens.append(last_word)

    sentence_long = len(tokens)
    for start, head in enumerate(tokens):
        if head not in Near_word_title:
            # First time this word starts a phrase: open a new entry.
            Near_word_title.append(head)
            Near_word.append([[head],[0]])
            phrases, counts = Near_word[-1]
        else:
            phrases, counts = Near_word[Near_word_title.index(head)]
        counts[0] += 1

        # Longest phrase that fits both the window and the sentence.
        longest = min(near_range + 1, sentence_long - start)
        for length in range(2, longest + 1):
            phrase = ' '.join(tokens[start:start + length])
            if phrase in phrases:
                counts[phrases.index(phrase)] += 1
            else:
                phrases.append(phrase)
                counts.append(1)
    return Near_word_title,Near_word

In [17]:
# Relation phrases searched for directly after the target word.
# Order matters to the search loop, which stops at the first phrase that
# matches a sentence, so longer phrases come before their prefixes.
Attributes=[
'is known as',
'is composed of',
'is a kind of',
'is a',
'is',
'are known as',
'are composed of',
'are a kind of',
'are',
'called',
# The comma after 'belong' was missing, which silently merged it with the
# next literal into the single entry 'belongis belong'.
'belong',
'is belong',
'isolated from',
'isolates',
'was known as',
'was composed of',
'was a kind of',
'was a',
'was',
'were known as',
'were composed of',
'were a kind of',
'were',
'used for'
]

## Word2Vec

### 參考文獻

https://sfhsu29.medium.com/nlp-%E5%B0%88%E6%AC%84-1-2-%E5%A6%82%E4%BD%95%E8%A8%93%E7%B7%B4%E8%87%AA%E5%B7%B1%E7%9A%84-word2vec-5a0754c5cb09

In [7]:
# Build the Word2Vec training corpora from every file under target_path.
#   word2vec_trainning_texts_4type : nouns, verbs, adverbs and adjectives
#   word2vec_trainning_texts       : nouns and verbs
#   word2vec_trainning_texts_n / _v: nouns only / verbs only
# word_texts_n / word_texts_v hold the distinct nouns / verbs in order of
# first appearance.
word2vec_trainning_texts = []
word2vec_trainning_texts_n = []
word2vec_trainning_texts_v = []
word2vec_trainning_texts_4type = []

word_texts_n = []
word_texts_v = []

paper_num = 0

# Set-based lookups replace repeated linear scans of the lists.
stop_word_lookup = set(stop_word)
seen_nouns = set()
seen_verbs = set()

for root, dirs, files in walk(target_path):
    for each_paper in files :
        # Join with `root`, not target_path: walk() also yields files that sit
        # in sub-directories, which target_path+name cannot open.
        with open(path.join(root, each_paper),'r',encoding='utf-8') as file :
            text = file.read()

        for each_sentence in nltk.sent_tokenize(text) :
            pass_sentence = []
            pass_sentence_n = []
            pass_sentence_v = []
            pass_sentence_4type = []

            tagged = pos_tag(each_sentence.split())
            for each_word, pos in tagged:
                Part_of_speech = is_Part_of_speech(pos)
                if Part_of_speech == 'X':
                    continue

                # Lemmatize and clean once; the result is shared by all lists.
                Reduction = lemmatizer.lemmatize(each_word,pos=penn_to_wn(pos))
                new_word = replace_word(Reduction)[0]
                if new_word == '':
                    continue

                if new_word.isupper() and len(new_word)>1:
                    # All-caps tokens longer than one character keep their
                    # case and bypass the stop-word filter.
                    token = new_word
                elif new_word.lower() not in stop_word_lookup:
                    token = new_word.lower()
                else:
                    continue

                pass_sentence_4type.append(token)
                if Part_of_speech == 'noun':
                    pass_sentence.append(token)
                    pass_sentence_n.append(token)
                    if token not in seen_nouns:
                        seen_nouns.add(token)
                        word_texts_n.append(token)
                elif Part_of_speech == 'verb':
                    pass_sentence.append(token)
                    pass_sentence_v.append(token)
                    if token not in seen_verbs:
                        seen_verbs.add(token)
                        word_texts_v.append(token)

            # Only non-empty sentences are kept in each corpus.
            if pass_sentence_4type:
                word2vec_trainning_texts_4type.append(pass_sentence_4type)
            if pass_sentence:
                word2vec_trainning_texts.append(pass_sentence)
            if pass_sentence_n:
                word2vec_trainning_texts_n.append(pass_sentence_n)
            if pass_sentence_v:
                word2vec_trainning_texts_v.append(pass_sentence_v)
        paper_num += 1
print('執行完'+str(paper_num)+'篇PubMed論文摘要')

執行完243篇PubMed論文摘要


In [8]:
# Number of sentences kept in each training corpus
# (4 POS types / nouns+verbs / nouns only / verbs only).
print('包含4種type句子 : ',len(word2vec_trainning_texts_4type))
print('包含名詞動詞句子 : ',len(word2vec_trainning_texts))
print('名詞句子 : ',len(word2vec_trainning_texts_n))
print('動詞句子 : ',len(word2vec_trainning_texts_v))

包含4種type句子 :  2474
包含名詞動詞句子 :  2470
名詞句子 :  2466
動詞句子 :  1963


In [9]:
# Number of distinct nouns and distinct verbs collected from the corpus.
print('看到名詞(不重複)出現字數 : ',len(word_texts_n))
print('看到動詞(不重複)出現字數 : ',len(word_texts_v))

看到名詞(不重複)出現字數 :  3858
看到動詞(不重複)出現字數 :  935


In [10]:
# Train Word2Vec on the noun+verb sentences and save the model under save_path.
# min_count=1 keeps every token, including those seen only once.
# NOTE(review): no `seed` is passed and workers=4, so the vectors are
# presumably not reproducible between runs — confirm whether that matters.
model = Word2Vec(sentences=word2vec_trainning_texts, window=5, min_count=1, workers=4) #, vector_size=100
model.save(save_path+"word2vec.model")

### 提出訓練好的model

In [11]:
# Reload the saved model from disk; this rebinds `model`.
model = Word2Vec.load(save_path+"word2vec.model")

In [12]:
# Dimensionality of each word vector.
print('一個字用幾維vector表示 : '+str(model.vector_size))

一個字用幾維vector表示 : 100


In [13]:
# List the model vocabulary and show the first five entries.
# `wv.vocab` only exists in gensim < 4; gensim 4 exposes the same keys through
# `wv.key_to_index`, so prefer that when present and fall back otherwise.
vocab_source = getattr(model.wv, 'key_to_index', None)
if vocab_source is None:
    vocab_source = model.wv.vocab
vocab_list = list(vocab_source.keys())
print('word2vec 字數 : '+str(len(vocab_list)))
print('實例 : ')
for i in vocab_list[:5]:
    print(i)

word2vec 字數 : 4422
實例 : 
removal
infectivity
blood
product
prepare


In [14]:
# Ten words most similar to 'virus'; each item is a (word, similarity) pair.
ans = model.wv.most_similar(positive=['virus'], negative=[], topn=10)
# Alternative queries kept for reference: combine or subtract a second word.
# ans = model.wv.most_similar(positive=['virus','PCR'], negative=[], topn=10)
# ans = model.wv.most_similar(positive=['virus'], negative=['PCR'], topn=10)
for i in ans :
    print(i)

('RNA', 0.9881764650344849)
('gene', 0.987464964389801)
('infection', 0.9867336750030518)
('PCR', 0.9846532940864563)
('particle', 0.9832759499549866)
('protein', 0.9822303652763367)
('sequence', 0.9820447564125061)
('detection', 0.9819107055664062)
('assay', 0.9817799925804138)
('cell', 0.9816744923591614)


In [15]:
# Distance between two words as reported by model.wv.distance.
word_a = 'virus'
word_b = 'PCR'

two_word_distance = model.wv.distance(word_a,word_b)
print(word_a+' & '+word_b+'距離 : ')
print(two_word_distance)

virus & PCR距離 : 
0.01534658670425415


In [16]:
# Raw embedding vector of 'virus'.
vector = model.wv.get_vector('virus')
print(vector)

[-0.05425971 -0.03182517 -0.0096318  -0.01050142 -0.00216749 -0.06463587
 -0.01695463 -0.02169042  0.02870228 -0.01211303 -0.00588886  0.02702897
 -0.05834166 -0.05605504 -0.07637904  0.05305203  0.04512161  0.00370761
  0.03353662  0.00847748  0.00713624 -0.00921729 -0.00103337  0.03733119
 -0.00148212  0.0012035   0.01043321  0.04415888 -0.0044337  -0.01072861
  0.05931643  0.01644801 -0.03724993  0.02106954  0.03184151  0.05113804
  0.01292499  0.03384535  0.00741403 -0.01483723 -0.04139743  0.07878213
  0.03536803  0.00037794  0.04338795 -0.03010762  0.013257   -0.02036365
 -0.03146609  0.01029766  0.01629391 -0.00337227  0.033557    0.03696047
 -0.03500364  0.02248935  0.04040173 -0.00290783 -0.00404452  0.04770268
  0.04094021  0.04918053 -0.01345428  0.01729371  0.0282013   0.00074109
  0.03087345 -0.00090332  0.01267883 -0.09072983  0.0052141  -0.01190906
  0.00413738 -0.03697195 -0.02612349 -0.03975829  0.05680097  0.01543936
  0.03902845 -0.02407588  0.00382254  0.01514424 -0

In [17]:
# Nearest words to the vector itself; the source word comes back first with similarity 1.0.
model.wv.similar_by_vector(vector)

[('virus', 1.0),
 ('RNA', 0.9881764650344849),
 ('gene', 0.987464964389801),
 ('infection', 0.9867337346076965),
 ('PCR', 0.9846532940864563),
 ('particle', 0.9832760095596313),
 ('protein', 0.9822304248809814),
 ('sequence', 0.9820447564125061),
 ('detection', 0.9819107055664062),
 ('assay', 0.9817799925804138)]

### 挑選成NE字典

In [18]:
# Candidate named-entity words: the first 1000 distinct nouns collected.
word_texts_n_NE = word_texts_n[:1000]
# Minimum similarity for a neighbour to be kept in the dictionary.
threshold = 0.5

In [19]:
# For every candidate noun, keep the neighbours that are themselves candidate
# nouns and whose similarity reaches `threshold`.
# list_n_sort_dis[i] holds the (word, similarity) pairs for index_name[i].
list_n_sort_dis = []
index_name = []

# Set membership replaces a scan of the candidate list per neighbour.
ne_lookup = set(word_texts_n_NE)
# Ask for as many neighbours as there are distinct nouns plus verbs.
neighbour_count = len(word_texts_n)+len(word_texts_v)

for each_n in word_texts_n_NE :
    sort_word = model.wv.most_similar(positive=[each_n], negative=[], topn=neighbour_count)
    list_n_sort_dis_this = [
        each_distance for each_distance in sort_word
        if each_distance[1]>=threshold and each_distance[0] in ne_lookup
    ]
    list_n_sort_dis.append(list_n_sort_dis_this)
    index_name.append(each_n)

In [20]:
# One row per candidate noun; columns are its kept neighbours in rank order.
# Rows with fewer neighbours are padded with None (visible in the output below).
Dictionary_word2vec = DataFrame(list_n_sort_dis,index=index_name)

In [21]:
# Look up the neighbour list stored for "DNA".
Dictionary_word2vec.loc["DNA"]

0           (virus, 0.9779177308082581)
1             (RNA, 0.9754770398139954)
2        (sequence, 0.9753614664077759)
3       (infection, 0.9752068519592285)
4            (gene, 0.9737394452095032)
                     ...               
319       (monodon, 0.5008969306945801)
320    (difficulty, 0.5002762079238892)
321                                None
322                                None
323                                None
Name: DNA, Length: 324, dtype: object

### 用於找領域

In [22]:
# Distance between 'virus' and 'PCR' (same query as the earlier distance cell).
word_a = 'virus'
word_b = 'PCR'

two_word_distance = model.wv.distance(word_a,word_b)
print(word_a+' & '+word_b+'距離 : ')
print(two_word_distance)

virus & PCR距離 : 
0.01534658670425415


In [23]:
# Distance between 'virus' and 'acid', for comparison with the 'PCR' pair.
word_a = 'virus'
word_b = 'acid'

two_word_distance = model.wv.distance(word_a,word_b)
print(word_a+' & '+word_b+'距離 : ')
print(two_word_distance)

virus & acid距離 : 
0.045733869075775146


## HearstPatterns

https://github.com/mmichelsonIF/hearst_patterns_python/tree/master/hearstPatterns

In [26]:
# (Re)create the Hearst-pattern extractor with the extended pattern set.
h = HearstPatterns(extended = True)

In [27]:
# Demo sentence; find_hyponyms returns (specific term, general term) pairs.
a = "There are such benefits as postharvest losses reduction, food increase and soil fertility improvement."
h.find_hyponyms(a)

[('postharvest loss reduction', 'benefit'),
 ('food increase', 'benefit'),
 ('soil fertility improvement', 'benefit'),
 ('benefit', 'postharvest loss reduction')]

### 資料庫統計成字典

In [28]:
# Count how often each (specific, general) pair is extracted across the corpus.
# Dictionary_HearstPatterns_word[i] is a pair and
# Dictionary_HearstPatterns_time[i] is how many sentences produced it.
paper_num = 0

Dictionary_HearstPatterns_word = []
Dictionary_HearstPatterns_time = []
# pair -> index in the two parallel lists (avoids `in` + `.index` scans).
pair_position = {}

for root, dirs, files in walk(target_path):
    for each_paper in files :
        # Join with `root`, not target_path: walk() also yields files that sit
        # in sub-directories, which target_path+name cannot open.
        with open(path.join(root, each_paper),'r',encoding='utf-8') as file :
            text = file.read()

        for each_sentence in nltk.sent_tokenize(text) :
            Dictionary_HearstPatterns = h.find_hyponyms(each_sentence)
            for each_same in Dictionary_HearstPatterns :
                position = pair_position.get(each_same)
                if position is None:
                    pair_position[each_same] = len(Dictionary_HearstPatterns_word)
                    Dictionary_HearstPatterns_word.append(each_same)
                    Dictionary_HearstPatterns_time.append(1)
                else:
                    Dictionary_HearstPatterns_time[position] += 1
        paper_num += 1
print('執行完'+str(paper_num)+'篇PubMed論文摘要')

執行完243篇PubMed論文摘要


In [29]:
# Group the extracted pairs by their first element.
# D_H_set_w[i] is a first element; D_H_set[i] is
# [[second elements...], [matching counts...]] for it.
D_H_set = []
D_H_set_w = []
for First_w, (key_term, paired_term) in enumerate(Dictionary_HearstPatterns_word):
    occurrences = Dictionary_HearstPatterns_time[First_w]
    if key_term in D_H_set_w:
        position = D_H_set_w.index(key_term)
        D_H_set[position][0].append(paired_term)
        D_H_set[position][1].append(occurrences)
    else:
        D_H_set.append([[paired_term],[occurrences]])
        D_H_set_w.append(key_term)

In [30]:
# Two-column frame indexed by term: column 0 = paired terms, column 1 = their counts.
Dictionart_HearstPatterns = DataFrame(D_H_set,index=D_H_set_w)

In [31]:
# Preview the first ten rows.
Dictionart_HearstPatterns[:10]

Unnamed: 0,0,1
the cell growth promote factor,[protein],[2]
RIV,[a putative HIV vaccine],[1]
CS mutation,[the clinical isolate],[1]
a high sensitivity,[the EM analysis],[1]
some virus,[microbe],[1]
the Enhanced Green Fluorescent Protein,[a marker],[1]
chromosomal analysis,[further study],[1]
drug treatment complication,[noninfectious entity],[1]
radiation effect,[noninfectious entity],[1]
recurrent tumor,[noninfectious entity],[1]


In [32]:
# Look up the entry stored for "some virus".
Dictionart_HearstPatterns.loc["some virus"]

0    [microbe]
1          [1]
Name: some virus, dtype: object

In [33]:
# Print every entry whose index is upper-case and longer than one character.
for index_ in Dictionart_HearstPatterns.index :
    if(index_.isupper() and len(index_)>1):
        print(index_)
        print(Dictionart_HearstPatterns.loc[index_])
        print('\n')

RIV
0    [a putative HIV vaccine]
1                         [1]
Name: RIV, dtype: object


PCR
0    [systematic laboratory testing]
1                                [1]
Name: PCR, dtype: object


CSF
0    [sample, a false  positive result]
1                                [1, 1]
Name: CSF, dtype: object


HIV1
0    [sexually transmit disease]
1                            [1]
Name: HIV1, dtype: object


EHV2
0    [etiological agent]
1                    [1]
Name: EHV2, dtype: object


EHV5
0    [etiological agent]
1                    [1]
Name: EHV5, dtype: object


PCV2
0    [10 swine virus]
1                 [1]
Name: PCV2, dtype: object


PRRSV
0    [10 swine virus, pathogen transmission]
1                                     [1, 1]
Name: PRRSV, dtype: object


CHIKV
0    [positive result]
1                  [1]
Name: CHIKV, dtype: object


ELISA
0    [serological test]
1                   [1]
Name: ELISA, dtype: object


HIV1 RNA
0    [a model]
1          [1]
Name: HIV1 RNA, dtype: 

## 英文斷詞統計

In [35]:
# Demo of tokenize_English on one sentence, starting from empty accumulators.
Near_word_title = []
Near_word = []
# Each head word is extended by up to this many following words.
near_range = 2

EX_Sentence = 'I have a fat cat.'
tokenize_English(EX_Sentence,Near_word_title,Near_word,near_range)

(['I', 'have', 'a', 'fat', 'cat'],
 [[['I', 'I have', 'I have a'], [1, 1, 1]],
  [['have', 'have a', 'have a fat'], [1, 1, 1]],
  [['a', 'a fat', 'a fat cat'], [1, 1, 1]],
  [['fat', 'fat cat'], [1, 1]],
  [['cat'], [1]]])

## 屬性字典

![title](C:/Users/Tingchun.TC.Hung/Desktop/Record/知識圖譜/關係預定詞.png)

In [295]:
# Print every "<target> <relation phrase> ..." clause whose remainder contains
# at least one noun or adjective.
Search_Attributes_target = 'virus'

# Number of files read.
a = 0

for root, dirs, files in walk(target_path):
    for each_paper in files :
        # Join with `root`, not target_path: walk() also yields files that sit
        # in sub-directories, which target_path+name cannot open.
        with open(path.join(root, each_paper),'r',encoding='utf-8') as file :
            each_prepose_paper = file.read()

        a+=1
        text = each_prepose_paper
        sentences = nltk.sent_tokenize(text)
        for each_sentence in sentences :
            for Attributes_situation in Attributes :
                attribute_prefix = Search_Attributes_target+' '+Attributes_situation
                target_position = each_sentence.find(attribute_prefix+' ')
                if target_position == -1:
                    continue

                # Clause = from the match up to (not including) the next '.'.
                # When there is no '.', keep the whole tail: find() returns -1
                # and the old slice arithmetic produced an empty string, so
                # such sentences were silently dropped.
                sentence_tail = each_sentence[target_position:]
                period_offset = sentence_tail.find('.')
                if period_offset == -1:
                    Attributes_Sentence = sentence_tail
                else:
                    Attributes_Sentence = sentence_tail[:period_offset]
                Attributes_Sentence2 = Attributes_Sentence.split()

                # Tokens after the "<target> <phrase>" prefix are the ones to
                # inspect. The prefix length is counted directly; the previous
                # subtraction went wrong (even negative) whenever the sentence
                # contained an earlier '.'.
                Attributes_Sentence_target = len(attribute_prefix.split())

                tagged = pos_tag(Attributes_Sentence2)
                case = [
                    is_Part_of_speech(pos)
                    for each_word, pos in tagged[Attributes_Sentence_target:]
                ]
                if('adjective' in case or 'noun' in case):
                    print(Attributes_Sentence)
                    print()
                # Only the first matching relation phrase is used per sentence.
                break

virus is unimpaired

virus was removed by the sCV-N, leaving behind a relatively larger fraction of non-infectious virus in the supernatant which we designated as replication incompetent virions (RIV)

virus is non-infectious in transfected cell culture and in injected sheep

virus was exchanged with the prt region from the Belgian provirus

virus were obtained, JV persistently infected cells became morphologically undistinguishable from Vero cells and virus production dropped to undetectable levels

virus was purified by plaque assay on Sf9 cells

virus are common in infancy, causing mostly asymptomatic infections

virus was associated with fever and with symptoms of cold but not with diarrhoea and vomiting

virus was enhanced notably at that moment

virus was performed, and the resulting products were spliced into a fragment which was packaged into armored RNA for use as a noninfectious, quantifiable synthetic substitute

virus isolated from China (TMV-Cv)

virus was detected in lymp

## 看第一遍

In [20]:
# Sentence-level token lists collected as Word2Vec training input.
word2vec_trainning_texts = []        # nouns + verbs
word2vec_trainning_texts_n = []      # nouns only
word2vec_trainning_texts_v = []      # verbs only
word2vec_trainning_texts_4type = []  # nouns, verbs, adverbs and adjectives

# Distinct nouns / verbs in first-seen order.
word_texts_n = []
word_texts_v = []

paper_num = 0

raw_papers = []  # unmodified text of every paper
new_papers = []  # filtered / lemmatised text of every paper

# Parallel lists: a Hearst-pattern relation and how many times it was seen.
Dictionary_HearstPatterns_word = []
Dictionary_HearstPatterns_time = []

Dictionary_abbreviation = []  # all-uppercase noun/verb tokens, treated as candidate abbreviations

Near_word_title = []
Near_word = []
near_range = 2

# Upper-case tokens (Roman numerals and 'VV') that must not be treated as abbreviations.
not_abbreviation = ['I','II','III','IV','V','VI','VII','VIII','VV']

for root, dirs, files in walk(target_path):
    for each_paper in files :
        paper_now = ''
        # Join with `root` (not target_path) so files found in sub-directories open too.
        with open(path.join(root, each_paper),'r',encoding='utf-8') as file :
            text = file.read()
            raw_papers.append(text)
            sentences = nltk.sent_tokenize(text)
            for each_sentence in sentences :

                # Phrase-frequency statistics.
                Near_word_title,Near_word = tokenize_English(each_sentence,Near_word_title,Near_word,near_range)

                # Collect the relations matching Hearst patterns and count repeats.
                Dictionary_HearstPatterns = h.find_hyponyms(each_sentence)
                for each_same in Dictionary_HearstPatterns :
                    if each_same in Dictionary_HearstPatterns_word :
                        position = Dictionary_HearstPatterns_word.index(each_same)
                        Dictionary_HearstPatterns_time[position] += 1
                    else:
                        Dictionary_HearstPatterns_word.append(each_same)
                        Dictionary_HearstPatterns_time.append(1)

                pass_sentence = []
                pass_sentence_n = []
                pass_sentence_v = []
                pass_sentence_4type = []
                each_sentence_list = each_sentence.split()

                # POS tagging (1): Penn Treebank tags on whitespace-split tokens.
                tagged = pos_tag(each_sentence_list)
                for each_word, pos in tagged:

                    # POS tagging (2): collapse to noun / verb / adverb / adjective / 'X'.
                    Part_of_speech = is_Part_of_speech(pos)

                    # Lemmatisation (1): WordNet POS constant for the lemmatizer.
                    input_Attributes = penn_to_wn(pos)

                    # Build the filtered paper text and the 4-type Word2Vec corpus.
                    if(Part_of_speech != 'X'): # only the four kept word classes
                        # Lemmatisation (2): reduce the word to its lemma.
                        Reduction = lemmatizer.lemmatize(each_word,pos=input_Attributes)
                        new_word,end = replace_word(Reduction)
                        if(new_word != ''):
                            if(new_word.isupper() and len(new_word)>1):
                                # All-uppercase tokens keep their case and skip the stop-word filter.
                                pass_sentence_4type.append(new_word)
                                paper_now += new_word +' '
                            else:
                                if(new_word.lower() not in stop_word ):
                                    pass_sentence_4type.append(new_word.lower())
                                    paper_now += new_word.lower() +' '
                            if(end):
                                paper_now += '. '

                    # Build the noun, verb and noun+verb Word2Vec corpora.
                    if(Part_of_speech == 'noun' or Part_of_speech == 'verb'): # nouns / verbs only
                        Reduction = lemmatizer.lemmatize(each_word,pos=input_Attributes)
                        new_word,end = replace_word(Reduction)
                        if(new_word != ''):
                            # Candidate abbreviation: all uppercase, longer than one char, not excluded.
                            if(new_word.isupper() and len(new_word)>1 and new_word not in not_abbreviation):
                                pass_sentence.append(new_word)
                                if(new_word not in Dictionary_abbreviation) :
                                    Dictionary_abbreviation.append(new_word)
                                if(Part_of_speech == 'noun'):
                                    pass_sentence_n.append(new_word)
                                    if new_word not in word_texts_n:
                                        word_texts_n.append(new_word)
                                elif(Part_of_speech == 'verb'):
                                    pass_sentence_v.append(new_word)
                                    if new_word not in word_texts_v:
                                        word_texts_v.append(new_word)
                            else:
                                if(new_word.lower() not in stop_word ):
                                    pass_sentence.append(new_word.lower())
                                    if(Part_of_speech == 'noun'):
                                        pass_sentence_n.append(new_word.lower())
                                        if new_word.lower() not in word_texts_n:
                                            word_texts_n.append(new_word.lower())
                                    elif(Part_of_speech == 'verb'):
                                        pass_sentence_v.append(new_word.lower())
                                        if new_word.lower() not in word_texts_v:
                                            word_texts_v.append(new_word.lower())

                # Keep only non-empty sentences for Word2Vec.
                if(pass_sentence_4type!=[]):
                    word2vec_trainning_texts_4type.append(pass_sentence_4type)
                if(pass_sentence!=[]):
                    word2vec_trainning_texts.append(pass_sentence)
                if(pass_sentence_n!=[]):
                    word2vec_trainning_texts_n.append(pass_sentence_n)
                if(pass_sentence_v!=[]):
                    word2vec_trainning_texts_v.append(pass_sentence_v)

            # Store the filtered text of this paper.
            new_papers.append(paper_now)

            paper_num += 1
print('執行完'+str(paper_num)+'篇PubMed論文摘要')

執行完243篇PubMed論文摘要


## 斷詞字典

In [21]:
# For every start word, order its phrases by descending count (ties broken by
# descending phrase text) and store them as [phrases, counts].
Near_word_final = []
for entry in Near_word:
    phrases, counts = entry[0], entry[1]
    ranked = sorted(zip(counts, phrases), reverse=True)
    counts_desc, phrases_desc = (list(column) for column in zip(*ranked))
    Near_word_final.append([phrases_desc, counts_desc])

In [24]:
# Show up to five of the highest-count phrases starting with the chosen word and
# whether each one reaches the threshold.
ID = Near_word_title.index('propidium') #polymerase #deoxyribonucleic #propidium monoazide
threshold_tokenize = 0.5
for showwing in range(5):
    try:
        # Ratio against the first count of the unsorted entry
        # (presumably the count of the word on its own - TODO confirm).
        tokenize_yes_no_number = Near_word_final[ID][1][showwing]/Near_word[ID][1][0]
    except IndexError:
        # Fewer than five phrases exist for this word. Only the expected
        # IndexError is handled, so genuine errors are no longer hidden.
        print('...')
        continue
    if(tokenize_yes_no_number>=threshold_tokenize):
        print(Near_word_final[ID][0][showwing],Near_word_final[ID][1][showwing],' True ')
    else:
        print(Near_word_final[ID][0][showwing],Near_word_final[ID][1][showwing],' False ')

propidium monoazide (PMA) 5  True 
propidium monoazide 5  True 
propidium 5  True 
...
...


## Word2vec

#### 訓練模型

In [25]:
# Report how many sentences each training corpus contains.
for corpus_label, corpus_sentences in (
    ('包含4種type句子 : ', word2vec_trainning_texts_4type),
    ('包含名詞動詞句子 : ', word2vec_trainning_texts),
    ('名詞句子 : ', word2vec_trainning_texts_n),
    ('動詞句子 : ', word2vec_trainning_texts_v),
):
    print(corpus_label, len(corpus_sentences))

包含4種type句子 :  2474
包含名詞動詞句子 :  2470
名詞句子 :  2466
動詞句子 :  1963


In [26]:
# Report the number of distinct nouns and verbs seen in the corpus.
for class_label, distinct_words in (
    ('看到名詞(不重複)出現字數 : ', word_texts_n),
    ('看到動詞(不重複)出現字數 : ', word_texts_v),
):
    print(class_label, len(distinct_words))

看到名詞(不重複)出現字數 :  3856
看到動詞(不重複)出現字數 :  935


In [27]:
# Train on the noun+verb sentences and write the model next to the other outputs.
model = Word2Vec(
    sentences=word2vec_trainning_texts,
    window=5,
    min_count=1,
    workers=4,
)  # , vector_size=100
model.save(save_path + "word2vec.model")

#### 提出訓練好的model

In [28]:
# Load the model file written to save_path + "word2vec.model" by the training cell.
model = Word2Vec.load(save_path+"word2vec.model")

In [29]:
# Print the number of dimensions used for each word vector.
print('一個字用幾維vector表示 : '+str(model.vector_size))

一個字用幾維vector表示 : 100


In [30]:
# List the words known to the model.
# gensim 4 removed `wv.vocab` in favour of `wv.key_to_index`; use whichever the
# installed version provides so the cell runs on both.
if hasattr(model.wv, 'key_to_index'):
    vocab_list = list(model.wv.key_to_index)
else:
    vocab_list = list(model.wv.vocab.keys())
print('word2vec 字數 : '+str(len(vocab_list)))
print('實例 : ')
# Show the first five entries as examples.
for i in vocab_list[:5]:
    print(i)

word2vec 字數 : 4420
實例 : 
removal
infectivity
blood
product
prepare


In [31]:
# Ten nearest neighbours of 'virus' in the embedding space.
# Alternative queries kept for reference:
# ans = model.wv.most_similar(positive=['virus','PCR'], negative=[], topn=10)
# ans = model.wv.most_similar(positive=['virus'], negative=['PCR'], topn=10)
ans = model.wv.most_similar(positive=['virus'], negative=[], topn=10)
for neighbour in ans:
    print(neighbour)

('infection', 0.9886000752449036)
('RNA', 0.9872105121612549)
('sequence', 0.9859580397605896)
('DNA', 0.9849257469177246)
('gene', 0.9842233657836914)
('assay', 0.9836938381195068)
('PCR', 0.983464777469635)
('cell', 0.983388364315033)
('infect', 0.9817866086959839)
('detection', 0.9798007607460022)


In [32]:
# Distance between two words as reported by model.wv.distance.
word_a, word_b = 'virus', 'PCR'

two_word_distance = model.wv.distance(word_a, word_b)
print(word_a+' & '+word_b+'距離 : ', two_word_distance, sep='\n')

virus & PCR距離 : 
0.016535162925720215


In [33]:
# Fetch and print the embedding vector stored for 'virus'.
vector = model.wv.get_vector('virus')
print(vector)

[-0.05876313 -0.05356771  0.03455362  0.05839892 -0.10250296  0.00908248
 -0.04107442 -0.00453115  0.0524398  -0.04141151 -0.06854161 -0.00735937
 -0.00759399  0.05934818  0.00347914  0.04573302  0.00324655  0.01283675
 -0.02702908  0.02888013 -0.03884978 -0.01668879  0.01754783 -0.03461919
  0.00933287  0.01970799  0.00562799  0.0256945  -0.06604835  0.01200862
  0.00087849  0.00362029 -0.04118326  0.0092281  -0.02482081 -0.00998114
  0.02297064  0.03125003 -0.03126125 -0.00669038 -0.04350164  0.0069631
 -0.02895527 -0.06841522  0.01686334  0.01327391  0.04455603  0.00570064
  0.00470009  0.02595395  0.00952964 -0.0266767   0.04134976 -0.02948535
 -0.01852163 -0.00595095 -0.03237408  0.06278611  0.01658367  0.03790691
 -0.02345408  0.03374553 -0.02021487 -0.07476577  0.00677003  0.02588035
 -0.0045572  -0.02993117  0.04127441  0.00429658 -0.04622592 -0.00047428
  0.04528968 -0.02828533  0.04694491  0.00957428 -0.06808792 -0.00245372
  0.01973607  0.03016105  0.02067111  0.01927081  0.

In [34]:
# Query by raw vector instead of by word; `vector` is the embedding fetched for 'virus'.
model.wv.similar_by_vector(vector)

[('virus', 1.0),
 ('infection', 0.9886000752449036),
 ('RNA', 0.9872105121612549),
 ('sequence', 0.9859580397605896),
 ('DNA', 0.9849257469177246),
 ('gene', 0.9842233657836914),
 ('assay', 0.9836938381195068),
 ('PCR', 0.9834648370742798),
 ('cell', 0.9833883047103882),
 ('infect', 0.9817866683006287)]

#### 用於找領域

In [36]:
chrome_options=Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# `chrome_options=` is a deprecated keyword (it triggers a DeprecationWarning and is
# gone in newer Selenium releases); `options=` is the supported spelling.
driver = webdriver.Chrome(options=chrome_options)

def _marker_prefix_length(snippet_text, verbose=False):
    """Return how many leading characters of the result text to skip.

    The offset depends on which Google UI labels occur anywhere in the text:
    both labels -> 14, only '網路上的精選摘要' -> 9, only '全部顯示' -> 5, none -> 0.
    NOTE(review): the offsets look like the label lengths plus one newline
    each - confirm against the live page layout.
    """
    has_featured = snippet_text.find('網路上的精選摘要') != -1
    has_show_all = snippet_text.find('全部顯示') != -1
    if has_featured and has_show_all:
        if verbose:
            print('網路上的精選摘要&全部顯示')
        return 14
    if has_featured:
        if verbose:
            print('網路上的精選摘要')
        return 9
    if has_show_all:
        if verbose:
            print('全部顯示')
        return 5
    return 0


def _extract_full_name(snippet_text, word_a, verbose=False):
    """Pull the spelled-out name of `word_a` out of a search-result text.

    Returns '' when the text starts with the "translate this page" label.
    Otherwise, in order of preference:
      1. "<name> (<word_a>)"  -> the text before the parenthesis,
      2. "<word_a> (<name>)"  -> the text inside the parenthesis,
      3. neither form found   -> the text minus its last character.
    Only the first line of the extracted text is kept.
    """
    if snippet_text[:6] == '翻譯這個網頁':
        return ''

    prefix_len = _marker_prefix_length(snippet_text, verbose)
    name_before_abbr = snippet_text.find(' (' + word_a + ')')
    abbr_before_name = snippet_text.find(word_a + ' (')

    if name_before_abbr == -1 and abbr_before_name != -1:
        start_position = abbr_before_name + len(word_a) + 2
        end_position = snippet_text[start_position:].find(')')
        sentence_abbreviation = snippet_text[start_position:start_position + end_position]
        if verbose:
            print(sentence_abbreviation)
    else:
        # When neither form is present name_before_abbr is -1, so this slice
        # ends one character before the end of the text (original behaviour).
        sentence_abbreviation = snippet_text[prefix_len:name_before_abbr]

    newline_position = sentence_abbreviation.find('\n')
    if newline_position != -1:
        if verbose:
            print('有換行')
        sentence_abbreviation = sentence_abbreviation[:newline_position]
    if verbose:
        print('-----------')
        print(sentence_abbreviation)
    return sentence_abbreviation


def google_search(word_a,Domain,open_or_not = False):
    """Search Google for the full name of an abbreviation within a field.

    Uses the module-level Selenium `driver`.

    Args:
        word_a: abbreviation to look up.
        Domain: field name inserted into the query text.
        open_or_not: when True, print the raw result text and intermediate steps.

    Returns:
        The extracted full name, or '' when nothing usable was found or the
        page could not be fetched / parsed.
    """
    try:
        driver.get("https://www.google.com/search?q=what is the full name of "+word_a+" in "+Domain+" field")
        # find_element("xpath", ...) exists in both Selenium 3 and 4, whereas
        # find_element_by_xpath was removed in newer releases.
        Abstract_wiki = driver.find_element('xpath', '//*[@id="rso"]/div[1]/div[1]/div/div[1]/div').text
        if(open_or_not):
            print(Abstract_wiki)
            print('-----------')
        return _extract_full_name(Abstract_wiki, word_a, open_or_not)
    except Exception:
        # Best-effort lookup: any scraping failure yields '', but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return ''

  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
# Distance between two words as reported by model.wv.distance.
word_a, word_b = 'virus', 'acid'

two_word_distance = model.wv.distance(word_a, word_b)
print(word_a+' & '+word_b+'距離 : ', two_word_distance, sep='\n')

virus & acid距離 : 
0.04287189245223999


In [38]:
# Number of candidate abbreviations collected in Dictionary_abbreviation.
print('可能之縮寫字 : ',len(Dictionary_abbreviation))

可能之縮寫字 :  519


In [39]:
# Preview the first ten candidate abbreviations.
Dictionary_abbreviation[:10]

['FMDV', 'RNA', 'PCR', 'VP1', 'HERV-K', 'LTR', 'HCV', 'DNA', 'CV-N', 'HIV-1']

In [40]:
# Domain keywords; the cells below measure an abbreviation's embedding distance to
# each of them and use the closest ones as the search context.
want_domain = ['nucleic','PCR','virus'] #,'noninfectious'

In [44]:
word_a = 'FMDV'

# Distance from the abbreviation to every candidate domain word.
dis = []
for each_domain in want_domain:
    word_b = each_domain

    two_word_distance = model.wv.distance(word_a,word_b)
    print(word_a+' & '+word_b+'距離 : ')
    print(two_word_distance)
    dis.append(two_word_distance)
    print()

# Closest domain is the primary search context.
min_dis = min(dis)
Domain = want_domain[dis.index(min_dis)]

# Second choice: the smallest distance strictly greater than the minimum.
# Falls back to the primary domain when every distance ties, so Sec_Domain is
# always bound (previously it could be left undefined).
Sec_Domain = Domain
sec_min = float('inf')
for each_domain, each_dis in zip(want_domain, dis):
    if(each_dis!=min_dis and each_dis<sec_min):
        sec_min = each_dis
        Sec_Domain = each_domain

print('搜尋目標 : ', Domain)
print('第二搜尋目標 : ', Sec_Domain)

FMDV & nucleic距離 : 
0.313747763633728

FMDV & PCR距離 : 
0.12094348669052124

FMDV & virus距離 : 
0.10767906904220581

搜尋目標 :  virus
第二搜尋目標 :  PCR


In [45]:
# Look up word_a using the closest domain word as context; pass True as a third
# argument to print the intermediate text.
google_search(word_a,Domain) #,True

'Foot-and-mouth disease virus'

In [47]:
chrome_options=Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)

want_domain = ['nucleic','PCR','virus']
Dictionary_abbreviation_final = []  # [abbreviation, full name] pairs

# Characters of the Google UI labels; any of them appearing in a result marks it as unusable.
strange_word = '網路上的精選摘要全部顯示翻譯這個網頁'

try:
    for seach_target in Dictionary_abbreviation[:10] :
        word_a = seach_target

        dis = [model.wv.distance(word_a,each_domain) for each_domain in want_domain]

        # Closest domain word, then the next-closest with a strictly larger distance.
        min_dis = min(dis)
        Domain = want_domain[dis.index(min_dis)]
        # Reset per abbreviation so a value from a previous iteration is never
        # reused and the name is always bound.
        Sec_Domain = Domain
        sec_min = float('inf')
        for each_domain, each_dis in zip(want_domain, dis):
            if(each_dis!=min_dis and each_dis<sec_min):
                sec_min = each_dis
                Sec_Domain = each_domain

        Full_name = google_search(word_a,Domain)
        if(Full_name==''):
            Full_name = google_search(word_a,Sec_Domain)
        # Retry with the second domain when the answer still contains UI-label characters.
        if any(strange in Full_name for strange in strange_word):
            Full_name = google_search(word_a,Sec_Domain)
        # Last resort: search without a domain word.
        if(Full_name==''):
            Full_name = google_search(word_a,'')
        Dictionary_abbreviation_final.append([word_a,Full_name])
finally:
    # Always release the browser, even when a lookup raises.
    driver.quit()

  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
# Display the collected [abbreviation, full name] pairs.
Dictionary_abbreviation_final

[['FMDV', 'Foot-and-mouth disease virus'],
 ['RNA', 'ribonucleic acid'],
 ['PCR', 'Polymerase chain reaction'],
 ['VP1', 'Major capsid protein'],
 ['HERV-K', 'human endogenous retrovirus-K'],
 ['LTR', 'long terminal repeat'],
 ['HCV', 'Virology. The hepatitis C virus'],
 ['DNA', 'Polymerase chain reaction'],
 ['CV-N', 'curriculum vitae'],
 ['HIV-1', 'The human immunodeficiency virus type 1']]

#### 挑選成NE字典
#### (在意名詞之間關係排序)

In [49]:
# Restrict the named-entity candidates to the first 1000 distinct nouns.
word_texts_n_NE = word_texts_n[:1000]
# Minimum similarity score a neighbour needs to be kept in the next cell.
threshold = 0.5

In [50]:
# For every candidate noun, keep the neighbours that (a) score at least
# `threshold` and (b) are candidate nouns themselves.
# A set makes the membership test constant-time instead of scanning the
# candidate list once per neighbour.
NE_lookup = set(word_texts_n_NE)
topn_all = len(word_texts_n)+len(word_texts_v)

list_n_sort_dis = []
index_name = []
for each_n in word_texts_n_NE :
    sort_word = model.wv.most_similar(positive=[each_n], negative=[], topn=topn_all)
    list_n_sort_dis.append([
        each_distance for each_distance in sort_word
        if each_distance[1]>=threshold and each_distance[0] in NE_lookup
    ])
    index_name.append(each_n)

In [51]:
# One row per noun (row label = the noun), one column per neighbour rank.
Dictionary_word2vec = DataFrame(list_n_sort_dis,index=index_name)

In [52]:
# Show the (neighbour, score) entries stored for "DNA".
Dictionary_word2vec.loc["DNA"]

0          (virus, 0.9849257469177246)
1       (sequence, 0.9782947301864624)
2            (gene, 0.977288007736206)
3      (infection, 0.9770529866218567)
4            (RNA, 0.9758744239807129)
                    ...               
313    (objective, 0.5020682215690613)
314     (research, 0.5005376935005188)
315                               None
316                               None
317                               None
Name: DNA, Length: 318, dtype: object

## HearstPatterns

In [53]:
# Group the Hearst relations by their first term:
#   D_H_set_w[i] - first term
#   D_H_set[i]   - [[second terms...], [times each relation was seen...]]
D_H_set = []
D_H_set_w = []
for relation, seen_count in zip(Dictionary_HearstPatterns_word, Dictionary_HearstPatterns_time):
    first_term, second_term = relation[0], relation[1]
    if first_term in D_H_set_w:
        grouped = D_H_set[D_H_set_w.index(first_term)]
        grouped[0].append(second_term)
        grouped[1].append(seen_count)
    else:
        D_H_set.append([[second_term], [seen_count]])
        D_H_set_w.append(first_term)

In [54]:
# Column 0: related terms, column 1: their counts; rows are labelled by the first term.
Dictionart_HearstPatterns = DataFrame(D_H_set,index=D_H_set_w)

In [55]:
# Group the Hearst relations by their first term and wrap the result in a DataFrame
# (column 0: second terms, column 1: how often each relation was seen).
D_H_set = []
D_H_set_w = []
for relation_index, relation in enumerate(Dictionary_HearstPatterns_word):
    first_term = relation[0]
    second_term = relation[1]
    seen_count = Dictionary_HearstPatterns_time[relation_index]
    if first_term not in D_H_set_w:
        D_H_set_w.append(first_term)
        D_H_set.append([[second_term], [seen_count]])
        continue
    terms, counts = D_H_set[D_H_set_w.index(first_term)]
    terms.append(second_term)
    counts.append(seen_count)

Dictionart_HearstPatterns = DataFrame(D_H_set, index=D_H_set_w)

In [56]:
# Preview the first ten grouped relations.
Dictionart_HearstPatterns[:10]

Unnamed: 0,0,1
the cell growth promote factor,[protein],[2]
RIV,[a putative HIV vaccine],[1]
CS mutation,[the clinical isolate],[1]
a high sensitivity,[the EM analysis],[1]
some virus,[microbe],[1]
the Enhanced Green Fluorescent Protein,[a marker],[1]
chromosomal analysis,[further study],[1]
drug treatment complication,[noninfectious entity],[1]
radiation effect,[noninfectious entity],[1]
recurrent tumor,[noninfectious entity],[1]


In [75]:
# Look up one first term by its exact label (note the double space inside the key).
Dictionart_HearstPatterns.loc["the cell growth  promote factor"]

0    [protein]
1          [2]
Name: the cell growth  promote factor, dtype: object

In [76]:
# Look up the relations recorded for "RIV".
Dictionart_HearstPatterns.loc["RIV"]

0    [a putative HIV vaccine]
1                         [1]
Name: RIV, dtype: object

## 屬性字典

In [77]:
# Attribute phrases searched for right after the target word.
# Order matters: the search stops at the first phrase that matches, so longer
# phrases are listed before their prefixes.
Attributes=[
'is known as',
'is composed of',
'is a kind of',
'is a',
'is',
'are known as',
'are composed of',
'are a kind of',
'are',
'called',
'belong',  # comma was missing: 'belong' and 'is belong' were fused into 'belongis belong'
'is belong',
'isolated from',
'isolates',
]

In [78]:
Search_Attributes_target = 'virus' # virus DNA RNA

a = 0  # number of papers scanned

# Print every "<target> <attribute phrase> ..." statement whose continuation
# contains a noun or an adjective.
for each_prepose_paper in raw_papers :
    a+=1
    text = each_prepose_paper
    sentences = nltk.sent_tokenize(text)
    for each_sentence in sentences :
        for Attributes_situation in Attributes :
            Attributes_phrase = Search_Attributes_target+' '+Attributes_situation
            target_position = each_sentence.find(Attributes_phrase+' ')
            if(target_position!=-1):

                # Keep the text from the phrase up to the first period; when the
                # sentence has no period (e.g. it ends with '?') keep it to the end
                # instead of slicing with -1.
                period_offset = each_sentence[target_position:].find('.')
                if(period_offset==-1):
                    Attributes_Sentence = each_sentence[target_position:]
                else:
                    Attributes_Sentence = each_sentence[target_position:target_position+period_offset]
                Attributes_Sentence2 = Attributes_Sentence.split()
                # Index of the first word after the matched phrase.
                Attributes_Sentence_target = len(Attributes_phrase.split())

                tagged = pos_tag(Attributes_Sentence2)
                case = [is_Part_of_speech(pos) for each_word, pos in tagged[Attributes_Sentence_target:]]
                if('adjective' in case or 'noun' in case):
                    print(Attributes_Sentence)
                    print()
                # Only the first matching attribute phrase is reported per sentence.
                break

virus is unimpaired

virus is non-infectious in transfected cell culture and in injected sheep

virus are common in infancy, causing mostly asymptomatic infections

virus isolated from China (TMV-Cv)

virus isolates were identified by neutralization, immunofluorescence assay, or enzyme immunoassay

virus are important to curtail the spread of this virus

virus is the most important foodborne virus in Japan

virus are increasingly recognized worldwide as the most important cause of food borne gastro-intestinal illness

virus is probably the main causative agent of Fuchs uveitis, but other viruses may also be involved in the pathogenesis of this disease

virus is commonly found in environmental waters and is very resistant to water disinfection and environmental stressors, especially UV light inactivation

virus is regarded as the major risk factor in the development of cervical cancer

virus is mainly maintained through human-mosquito-human cycle

virus is a globally spread zoonotic arb

In [297]:
chrome_options=Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)

want_domain = ['nucleic','PCR','virus']
Dictionary_abbreviation_final = []  # [word, full name] pairs

# Characters of the Google UI labels; any of them appearing in a result marks it as unusable.
strange_word = '網路上的精選摘要全部顯示翻譯這個網頁'

try:
    for seach_target in [Search_Attributes_target] :
        word_a = seach_target

        dis = [model.wv.distance(word_a,each_domain) for each_domain in want_domain]

        # Closest domain word, then the next-closest with a strictly larger distance.
        min_dis = min(dis)
        Domain = want_domain[dis.index(min_dis)]
        # Always bound: falls back to the primary domain when no second choice exists.
        Sec_Domain = Domain
        sec_min = float('inf')
        for each_domain, each_dis in zip(want_domain, dis):
            if(each_dis!=min_dis and each_dis<sec_min):
                sec_min = each_dis
                Sec_Domain = each_domain

        Full_name = google_search(word_a,Domain)
        if(Full_name==''):
            Full_name = google_search(word_a,Sec_Domain)
        # Retry with the second domain when the answer still contains UI-label characters.
        if any(strange in Full_name for strange in strange_word):
            Full_name = google_search(word_a,Sec_Domain)
        # Last resort: search without a domain word.
        if(Full_name==''):
            Full_name = google_search(word_a,'')
        Dictionary_abbreviation_final.append([word_a,Full_name])
finally:
    # Always release the browser, even when a lookup raises.
    driver.quit()

  This is separate from the ipykernel package so we can avoid doing imports until


### 經過 POS Tagging(4-type) 、 詞形還原 、 lower 篩選過後的論文

##### 進入第二遍之前還需要看過斷詞字典、縮寫字典
##### 統計詞頻前還需確認是否不在stop word

In [47]:
# Show the filtered text produced for the first paper.
new_papers[0]

'removal virus infectivity blood biopharmaceutical product prepare blood issue considerable importance . irrespective choose vital biological activity product impaired . blood unfractionated plasma serum problem challenging . inactivation genome key kill virus vaccines . imines inactivated foot-and-mouth virus vaccine evidence survival virus infectivity . immunogenicity virus unimpaired . viruses belong recognised inactivate imines . biological property proteins cell growth-promoting factor calf serum impaired condition ensure inactivation infectious poliovirus foot-and-mouth virus FMDV . virus inactivate imines degree provide remove infectivity protein unstable temperatures . mechanism FMDV inactivate studied . find RNA extract virus inactivation degree degrade hidden break non-infectious . amplify PCR primer correspond gene cod portion viral RNA polymerase cod VP1 structural proteins alteration base base occur . '