In [9]:
import spacy
import pandas as pd
nlp = spacy.load('en')

In [4]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')
doc

Apple is looking at buying U.K. startup for $1 billion

In [48]:
doc1 = nlp("Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.")
doc1

Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

## Token & attributes

In [23]:
lemmas = [] #원형
poss = [] #품사
tags = [] #태크
deps = [] #의존성
shapes = [] #모양
alphas = [] #알파벳 여부
stopwords = [] 
entities = []

for token in doc:
    lemmas.append(token.lemma_)
    poss.append(token.pos_)
    tags.append(token.tag_)
    deps.append(token.dep_)
    shapes.append(token.shape_)
    alphas.append(token.is_alpha)
    stopwords.append(token.is_stop)
    entities.append(token.ent_type_)

In [24]:
compare_df = pd.DataFrame({
                'origin':doc,
                'lemma':lemmas,
                'pos':poss,
                'tag':tags,
                'deps':deps,
                'shape':shapes,
                'is_alphabet':alphas,
                'is_stopword':stopwords,
                'entity':entities,
            }, columns=['origin', 'lemma', 'pos', 'tag', 'deps', 'entity','is_alphabet', 'is_stopword'])
compare_df

Unnamed: 0,origin,lemma,pos,tag,deps,entity,is_alphabet,is_stopword
0,Apple,apple,PROPN,NNP,nsubj,ORG,True,False
1,is,be,VERB,VBZ,aux,,True,True
2,looking,look,VERB,VBG,ROOT,,True,False
3,at,at,ADP,IN,prep,,True,True
4,buying,buy,VERB,VBG,pcomp,,True,False
5,U.K.,u.k.,PROPN,NNP,compound,GPE,False,False
6,startup,startup,NOUN,NN,dobj,,True,False
7,for,for,ADP,IN,prep,,True,True
8,$,$,SYM,$,quantmod,MONEY,False,False
9,1,1,NUM,CD,compound,MONEY,False,False


In [14]:
spacy.explain('ADP')

'adposition'

In [15]:
# 고유명사
spacy.explain('PROPN')

'proper noun'

In [16]:
spacy.explain('NOUN')

'noun'

In [17]:
spacy.explain('NNP')

'noun, proper singular'

In [19]:
spacy.explain('nsubj')

'nominal subject'

## display

In [29]:
#dependency
from spacy import displacy
displacy.render(doc, style='dep', jupyter=True, options={'distance':80})

In [30]:
#entity recognition
displacy.render(doc, style='ent', jupyter=True, options={'distance':90})

In [33]:
# nouns
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Apple Apple nsubj looking
U.K. startup startup dobj buying


#### nsubj = 주격명사구
#### dobj = 직접 목적어

In [50]:
# tree내 각 항목 확인
for token in doc:
    if len([c for c in token.children])>0:
        print(token.text, token.dep_, [c for c in token.children])

looking ROOT [Apple, is, at]
at prep [buying]
buying pcomp [startup, for]
startup dobj [U.K.]
for prep [billion]
billion pobj [$, 1]


In [38]:
# 의존관계를 하나의 스트링으로 표시하는게 가능한가
for i in range(len(doc)):
    token = doc[i]
    if len([c for c in token.children])>0:
        lefts = token.n_lefts
        right = token.n_rights
        print(token.text, ':', doc[i-lefts:i+right+1])

looking : Apple is looking at
at : at buying
buying : buying U.K. startup
startup : U.K. startup
for : for $
billion : $1 billion


In [51]:
for i in range(len(doc1)):
    token = doc1[i]
    if len([c for c in token.children])>0:
        lefts = token.n_lefts
        right = token.n_rights
        print(token.text, ':', doc1[i-lefts:i+right+1])

warfarin : warfarin (Coumadin,
Jantoven : (Coumadin, Jantoven)
used : used to
prevent : to prevent blood
clots : blood clots
works : prevent blood clots, usually works well and isn't bothersome
is : isn't bothersome,
bleeding : serious internal bleeding
happen : bleeding can happen in the
in : in the
situation : the wrong situation


In [43]:
# head단어들만 찾는다면
root = [token for token in doc if token.head ==token][0]
root

looking

In [54]:
root1 = [token for token in doc1 if token.head ==token][0]
root1

works

In [55]:
subject = list(root.rights)[0]
subject

at

In [58]:
subject1_r = list(root1.rights)
subject1_l = list(root1.lefts)
print('right: ', subject1_r)
print('left: ', subject1_l)

right:  [well, and, is, but, happen]
left:  [Typically, ,, warfarin, ,, usually]


In [60]:
#tree 조상찾기
for desc in subject.subtree:
    print(desc)
    assert subject is desc or subject.is_ancestor(desc)
    print([anc.text for anc in desc.ancestors])

at
['looking']
buying
['at', 'looking']
U.K.
['startup', 'buying', 'at', 'looking']
startup
['buying', 'at', 'looking']
for
['buying', 'at', 'looking']
$
['billion', 'for', 'buying', 'at', 'looking']
1
['billion', 'for', 'buying', 'at', 'looking']
billion
['for', 'buying', 'at', 'looking']


## Named Entity Recognition

In [62]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [61]:
# 인식되는 엔티티 추가 필요
for ent in doc1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Coumadin 21 29 ORG
Jantoven 31 39 GPE


In [67]:
#BIO - B = entity시작, O = entity밖, I = entity내부
for token in doc:
    print(token.text, token.ent_iob_, token.ent_type_)

Apple B ORG
is O 
looking O 
at O 
buying O 
U.K. B GPE
startup O 
for O 
$ B MONEY
1 I MONEY
billion I MONEY


## add additional entity type

In [68]:
doc1

Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

In [69]:
#위의 약물이름, 증상을 예제로 추가한다면
from spacy.tokens import Span

In [None]:
DRUG = doc1.vocab.