In [None]:
# 언어 모델 다운로드
!python -m spacy download en_core_web_md

In [None]:
import spacy
import en_core_web_sm

# Load the small English pipeline and run it on a short sentence.
nlp = en_core_web_sm.load()
doc = nlp("I went there")

# For every token, print the token object, its type, its text and the type of
# that text on one line.
for token in doc:
    fields = (token, type(token), token.text, type(token.text))
    print(*fields)

In [1]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("I own a pretty cat.")

# Build the list of token texts once, then print it together with its type.
token_texts = [tok.text for tok in doc]
print(token_texts, type(token_texts))

['I', 'own', 'a', 'pretty', 'cat', '.'] <class 'list'>


In [None]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("It's been a crazy week!!!")

# Print the token texts of a sentence containing a contraction and repeated
# punctuation, followed by the type of the collected list.
token_texts = [tok.text for tok in doc]
print(token_texts, type(token_texts))

In [None]:
# Custom tokenizer rules
import spacy
from spacy.symbols import ORTH  # ORTH stands for orthography (spelling)

import en_core_web_sm

nlp = en_core_web_sm.load()

# Tokenization before any special case has been registered.
doc = nlp("lemme that")
token_texts = [tok.text for tok in doc]
print(token_texts, type(token_texts))

# Register one split rule per casing, in the same order as they are defined.
special_case1 = [{ORTH: "lem"}, {ORTH: "me"}]
special_case2 = [{ORTH: "Lem"}, {ORTH: "me"}]
for surface_form, pieces in (("lemme", special_case1), ("Lemme", special_case2)):
    nlp.tokenizer.add_special_case(surface_form, pieces)

# Same word again, now followed by punctuation.
doc = nlp("lemme that!!!")
token_texts = [tok.text for tok in doc]
print(token_texts, type(token_texts))

# Both casings, surrounded by other words and punctuation.
doc = nlp("Let's try again! Lemme that!, lemme")
token_texts = [tok.text for tok in doc]
print(token_texts, type(token_texts))

In [None]:
# Custom tokenizer - a special case may also include punctuation characters
import spacy
from spacy.symbols import ORTH  # ORTH stands for orthography (spelling)

# This cell does not load a pipeline itself; it uses the `nlp` object that an
# earlier cell created.
# The rule maps the string, punctuation included, to a single piece.
rule_text = "...lemme...?"
special_case = [{ORTH: rule_text}]
nlp.tokenizer.add_special_case(rule_text, special_case)

doc = nlp("I have a dream. ...lemme...?")
token_texts = [tok.text for tok in doc]
print(token_texts, type(token_texts))

In [None]:
# Debugging the tokenizer
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

text = "Let's go! Lemme"
doc = nlp(text)
print([tok.text for tok in doc])

# `explain` returns one entry per produced piece; print the second element of
# each entry first, then a tab, then the first element.
detail_tokens = nlp.tokenizer.explain(text)
for detail_token in detail_tokens:
    first_field = detail_token[0]
    second_field = detail_token[1]
    print(second_field, "\t", first_field)

In [None]:
# Sentence segmentation is a somewhat more complex task than tokenization.
# This cell runs the loaded pipeline on a passage and prints every sentence
# span yielded by `doc.sents`.
# NOTE(review): the passage is Korean while the pipeline loaded here is
# `en_core_web_sm`; the sentence boundaries may be unreliable - confirm intent.
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
text = "부산 해운대해수욕장에서 중학생 3명이 물놀이를 하던 중 1명이 실종되고 1명이 숨지는 사고가 발생했다. 25일 경찰과 소방당국에 따르면 이날 오전 3시 41분께 부산 해운대해수욕장에서 중학생 3명이 물놀이 하던 중 실종됐다는 신고가 접수됐다."
doc = nlp(text)
for sentence in doc.sents:
    print(sentence)

In [None]:
# lemma: the base form of a token, i.e. the form under which it is listed in a
# dictionary.
#   eating -> eat / eats -> eat / ate -> eat
# lemmatization: the process of mapping a token to its lemma.

import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
text = "I went there for working and worked for 3 years."
doc = nlp(text)

# One line per token: the text, a tab padded with single spaces, the lemma.
for token in doc:
    print(f"{token.text} \t {token.lemma_}")

In [None]:
import spacy
from spacy.symbols import ORTH, LEMMA
import en_core_web_sm

nlp = en_core_web_sm.load()

# Give the token "Angeltown" the lemma "Los Angeles".
if nlp.has_pipe("attribute_ruler"):
    # spaCy v3 pipelines: tokenizer special cases may only set ORTH and NORM,
    # so passing LEMMA to `add_special_case` raises an error there. The
    # attribute ruler is the supported way to override a token attribute.
    ruler = nlp.get_pipe("attribute_ruler")
    ruler.add(patterns=[[{"ORTH": "Angeltown"}]], attrs={"LEMMA": "Los Angeles"})
else:
    # Pipelines without an attribute ruler (spaCy v2): keep the original
    # approach of setting the lemma through a tokenizer special case.
    special_case = [{ORTH: "Angeltown", LEMMA: "Los Angeles"}]
    nlp.tokenizer.add_special_case("Angeltown", special_case)

# Print every token next to its lemma to check the override.
doc = nlp("I am flying to Angeltown")
for token in doc:
    print(token.text, token.lemma_)


In [None]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("I know that you have been to Korea.")

# List the individual tokens first.
for token in doc:
    print(token)

# Then print several slices of the document: a bounded range, an open-ended
# range, a range with a negative end index, and a tail.
for piece in (doc[2:4], doc[4:], doc[3:-1], doc[6:]):
    print(piece)


In [None]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("I know that you have been to Korea.")

# Take a slice of the document and iterate over the tokens inside it.
start, end = 2, 4
span = doc[start:end]
for token in span:
    print(token)
    

In [None]:
import spacy
import en_core_web_sm
# Load the small English pipeline and parse a short greeting.
# `nlp` is reused by later cells that call it without reloading the model.
nlp = en_core_web_sm.load()
doc = nlp("Hello, hi!")

In [None]:
# `lower_` of the first token of `doc` (defined in an earlier cell); being the
# last expression of the cell, its value is displayed as the cell output.
doc[0].lower_

In [None]:
# Parse four differently-cased spellings of "hello" and print each token's text.
# Uses the `nlp` pipeline loaded in an earlier cell.
doc = nlp("HELLO, Hello, hello, hEllo")
for token in doc:
    print(token.text)

In [None]:
# For the first two tokens of `doc` (parsed in an earlier cell), print
# `is_upper` and then `is_lower`.
for position in (0, 1):
    current = doc[position]
    print(current.is_upper)
    print(current.is_lower)

In [None]:
# is_alpha: non-alphabetic characters include digits, symbols and whitespace,
# which is what the last token ("Cat123") is meant to illustrate.
doc = nlp("Cat and Cat123")
for position in range(3):
    print(doc[position].is_alpha)

In [None]:
# is_ascii: check tokens of a sentence that mixes English and Korean text.
doc = nlp("English and 한글!")
for position in (0, 2, 3):
    print(doc[position].is_ascii)

In [None]:
# is_digit: compare a word, a word with trailing digits, and a pure number.
doc = nlp("Cat Cat123 123")
for position in range(3):
    print(doc[position].is_digit)

In [None]:
# is_punct: inspect the tokens at positions 1, 4 and 5 of the sentence.
doc = nlp("Hey, You and me!")
for position in (1, 4, 5):
    print(doc[position].is_punct)

In [2]:
# is_left_punct: print each of the first two tokens followed by its flag.
doc = nlp("([ He said yes. ])")
for position in (0, 1):
    opening = doc[position]
    print(opening)
    print(opening.is_left_punct)



(
True
[
True


In [3]:
# is_space: a whitespace token is reported as True (the original note says
# this holds regardless of length; this cell only tries a single space).
doc = nlp(" ")
blank = doc[0]
print(blank)
print(len(blank))
print(blank.is_space)


 
1
True


In [4]:
# is_bracket: checks for bracket characters - {} [] ()
doc = nlp("( You said [1] and {2} is not applicable.)")

# First and last token of the sentence.
outer_first, outer_last = doc[0], doc[-1]
print(outer_first.is_bracket, outer_last.is_bracket)

# Tokens at positions 3 and 5.
inner_first, inner_last = doc[3], doc[5]
print(inner_first.is_bracket, inner_last.is_bracket)

True True
True True


In [5]:
# is_quote: checks whether a token is a quotation character
doc = nlp("( You said '1\"' is not applicable.)")
quote_candidate = doc[3]
print(quote_candidate, quote_candidate.is_quote)

' True


In [6]:
# is_currency: checks whether a token is a currency symbol
doc = nlp("I paid $12 for the t-shirt")
currency_candidate = doc[2]
print(currency_candidate, currency_candidate.is_currency)

$ True


In [8]:
# like_num: checks whether a string denotes a number; both digits and number
# words are recognized.
doc = nlp("I emailed you at leat thousand times")
number_candidate = doc[-2]
print(number_candidate, number_candidate.like_num)


thousand True


In [9]:
# like_email: checks whether a string looks like an e-mail address
doc = nlp("My email is kaikim98@naver.com and you can visit me at http://www.naver.com any time you want")
email_candidate = doc[3]
print(email_candidate, email_candidate.like_email)

kaikim98@naver.com True


In [11]:
# like_url: checks whether a string looks like a URL
doc = nlp("My email is kaikim98@naver.com and you can visit me at http://www.naver.com any time you want")
url_candidate = doc[-5]
print(url_candidate, url_candidate.like_url)

http://www.naver.com True


In [14]:
# shape_: reports the shape of each word (X: uppercase, x: lowercase, d: digit).
# It yields a string describing the token's orthographic features, which can
# be used as a string feature for a machine-learning algorithm.
doc = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc:
    print(f"{token.text} {token.lemma_} {token.shape_}")

Girl girl Xxxx
called call xxxx
Kathy Kathy Xxxxx
has have xxx
a a x
nickname nickname xxxx
Cat123 Cat123 Xxxddd
. . .


In [16]:
# is_oov: OOV (Out Of Vocabulary) - checks whether a given token is inside the
# vocabulary or not.
# NOTE(review): the recorded output is True for every token, including common
# words such as "I" and "at"; presumably the loaded small model has no word
# vectors to look tokens up in - confirm, and consider a model with vectors.
doc = nlp("I visited Jenny at Korean Resort")
for token in doc:
    print(token, token.is_oov)

I True
visited True
Jenny True
at True
Korean True
Resort True


In [19]:
# is_stop: checks whether a token is a stop word
doc = nlp("I just want to inform you that I was with the principle")
for token in doc:
    print(f"{token!s} {token.is_stop}")

I True
just True
want False
to True
inform False
you True
that True
I True
was True
with True
the True
principle False
