In [4]:
#!conda install -y -c pytorch pytorch torchvision

In [2]:
#!pip install stanza

In [1]:
import stanza

In [3]:
#stanza.download('ko', package='kaist')

In [5]:
# Build the Korean pipeline with default settings; the captured log below shows
# tokenize, pos, lemma and depparse being loaded from the 'kaist' package.
nlp = stanza.Pipeline('ko')

2021-03-28 02:11:54 INFO: Loading these models for language: ko (Korean):
| Processor | Package |
-----------------------
| tokenize  | kaist   |
| pos       | kaist   |
| lemma     | kaist   |
| depparse  | kaist   |

2021-03-28 02:11:55 INFO: Use device: gpu
2021-03-28 02:11:55 INFO: Loading: tokenize
2021-03-28 02:11:57 INFO: Loading: pos
2021-03-28 02:11:57 INFO: Loading: lemma
2021-03-28 02:11:58 INFO: Loading: depparse
2021-03-28 02:11:58 INFO: Done loading processors!


In [6]:
# Korean sample input for the walkthrough (two '.'-terminated sentences).
text = '오늘은 자연어 처리를 배우기 좋은 날이다. 자연어 처리는 재미있다.'

In [7]:
# Run the pipeline on the sample text; the result exposes `.sentences`, and each
# sentence exposes `.words` (both are used in the cells that follow).
doc = nlp(text)

In [8]:
# Display the parsed document: one list per sentence, one dict per word.
doc

[
  [
    {
      "id": 1,
      "text": "오늘은",
      "lemma": "오늘+은",
      "upos": "NOUN",
      "xpos": "ncn+jxt",
      "head": 6,
      "deprel": "dislocated",
      "misc": "start_char=0|end_char=3"
    },
    {
      "id": 2,
      "text": "자연어",
      "lemma": "자연어",
      "upos": "NOUN",
      "xpos": "ncn",
      "head": 3,
      "deprel": "compound",
      "misc": "start_char=4|end_char=7"
    },
    {
      "id": 3,
      "text": "처리를",
      "lemma": "처리+를",
      "upos": "NOUN",
      "xpos": "ncpa+jco",
      "head": 4,
      "deprel": "obj",
      "misc": "start_char=8|end_char=11"
    },
    {
      "id": 4,
      "text": "배우기",
      "lemma": "배우+기",
      "upos": "VERB",
      "xpos": "pvg+etn",
      "head": 5,
      "deprel": "nmod",
      "misc": "start_char=12|end_char=15"
    },
    {
      "id": 5,
      "text": "좋은",
      "lemma": "좋+ㄴ",
      "upos": "ADJ",
      "xpos": "paa+etm",
      "head": 6,
      "deprel": "amod",
      "misc": "start_char=16|end_cha

In [9]:
# Take the first sentence of the parsed document.
sentence = doc.sentences[0]

In [10]:
# Take the first word of that sentence.
word = sentence.words[0]

In [11]:
# The lemma is a single string whose pieces are joined with '+'.
word.lemma

'오늘+은'

In [12]:
# The xpos tag string is '+'-joined as well, one tag per lemma piece for this word.
word.xpos

'ncn+jxt'

## 단어와 품사 태그 짝짓기

In [13]:
# Split the '+'-joined lemma into its individual pieces.
lemma = word.lemma.split('+')

In [14]:
# Split the '+'-joined tag string the same way so the two lists line up by position.
xpos = word.xpos.split('+')

In [15]:
# Pair each lemma piece with the tag at the same position.
# Note: zip() stops at the shorter list, so a length mismatch is silently truncated.
list(zip(lemma, xpos))

[('오늘', 'ncn'), ('은', 'jxt')]

In [16]:
# Walk every word of every sentence and print each lemma piece next to its tag.
for sentence in doc.sentences:
    for word in sentence.words:
        # Lemma and xpos are both '+'-joined; split them so they align by position.
        lemma, xpos = word.lemma.split('+'), word.xpos.split('+')
        for tok, pos in zip(lemma, xpos):
            print(f'{tok} {pos}')

오늘 ncn
은 jxt
자연어 ncn
처리 ncpa
를 jco
배우 pvg
기 etn
좋 paa
ㄴ etm
날 ncn
이 jp
다 ef
. sf
자연어 ncn
처리 ncpa
는 jxt
재미있 ncps
다 xsm
. sf


## 명사 추출

In [17]:
import stanza

In [18]:
# Rebinds `nlp` to a freshly loaded Korean pipeline (same call as the earlier cell).
# NOTE(review): looks redundant when the earlier pipeline cell has already run —
# confirm whether this section is meant to be runnable on its own.
nlp = stanza.Pipeline('ko')

2021-03-28 02:12:16 INFO: Loading these models for language: ko (Korean):
| Processor | Package |
-----------------------
| tokenize  | kaist   |
| pos       | kaist   |
| lemma     | kaist   |
| depparse  | kaist   |

2021-03-28 02:12:16 INFO: Use device: gpu
2021-03-28 02:12:16 INFO: Loading: tokenize
2021-03-28 02:12:16 INFO: Loading: pos
2021-03-28 02:12:17 INFO: Loading: lemma
2021-03-28 02:12:17 INFO: Loading: depparse
2021-03-28 02:12:17 INFO: Done loading processors!


In [19]:
# Short sample sentence for the noun-extraction section (rebinds `text`).
text = '오늘 커피를 마셨다.'

In [20]:
# Parse the new sample sentence (rebinds `doc` from the earlier section).
doc = nlp(text)

In [21]:
# Display the parsed document so each word's lemma and xpos can be inspected.
doc

[
  [
    {
      "id": 1,
      "text": "오늘",
      "lemma": "오늘",
      "upos": "NOUN",
      "xpos": "ncn",
      "head": 3,
      "deprel": "advmod",
      "misc": "start_char=0|end_char=2"
    },
    {
      "id": 2,
      "text": "커피를",
      "lemma": "커피+를",
      "upos": "NOUN",
      "xpos": "ncn+jco",
      "head": 3,
      "deprel": "obj",
      "misc": "start_char=3|end_char=6"
    },
    {
      "id": 3,
      "text": "마셨다",
      "lemma": "마시+었+다",
      "upos": "VERB",
      "xpos": "pvg+ep+ef",
      "head": 0,
      "deprel": "root",
      "misc": "start_char=7|end_char=10"
    },
    {
      "id": 4,
      "text": ".",
      "lemma": ".",
      "upos": "PUNCT",
      "xpos": "sf",
      "head": 3,
      "deprel": "punct",
      "misc": "start_char=10|end_char=11"
    }
  ]
]

In [22]:
# Print only the lemma pieces whose tag starts with 'n' (the noun filter used here).
for sentence in doc.sentences:
    for word in sentence.words:
        # Lemma and xpos are both '+'-joined; split them so they align by position.
        lemma, xpos = word.lemma.split('+'), word.xpos.split('+')
        for tok, pos in zip(lemma, xpos):
            if not pos.startswith('n'):
                continue
            print(tok)

오늘
커피


In [23]:
def extract_noun(text, pipeline=None):
    """Lazily yield the noun-tagged lemma pieces found in ``text``.

    Every word's ``lemma`` and ``xpos`` are '+'-joined strings; they are split
    and paired by position, and a piece is yielded when its tag starts with
    ``'n'``.

    Args:
        text: Input string to analyse.
        pipeline: Optional callable that takes ``text`` and returns an object
            exposing ``.sentences``, each with ``.words`` whose items carry
            ``lemma`` and ``xpos``. When omitted, the notebook-level ``nlp``
            pipeline is used, so existing one-argument calls are unaffected.

    Yields:
        str: Each lemma piece whose tag starts with ``'n'``, in document order.
    """
    # NOTE(review): the 'n' prefix is taken on trust as the noun marker of the
    # loaded tagset — confirm it does not also admit pronoun/numeral tags.
    if pipeline is None:
        pipeline = nlp  # fall back to the pipeline loaded in the notebook

    doc = pipeline(text)
    for sentence in doc.sentences:
        for word in sentence.words:
            # A word without a lemma or tag cannot be classified; skip it
            # instead of failing on None.split().
            if word.lemma is None or word.xpos is None:
                continue
            lemma = word.lemma.split('+')
            xpos = word.xpos.split('+')

            # zip() pairs pieces by position and stops at the shorter list.
            for tok, pos in zip(lemma, xpos):
                if pos.startswith('n'):
                    yield tok

In [24]:
def extract_noun(text, pipeline=None):
    """Return the noun-tagged lemma pieces found in ``text`` as a list.

    Every word's ``lemma`` and ``xpos`` are '+'-joined strings; they are split
    and paired by position, and a piece is kept when its tag starts with
    ``'n'``.

    Args:
        text: Input string to analyse.
        pipeline: Optional callable that takes ``text`` and returns an object
            exposing ``.sentences``, each with ``.words`` whose items carry
            ``lemma`` and ``xpos``. When omitted, the notebook-level ``nlp``
            pipeline is used, so existing one-argument calls are unaffected.

    Returns:
        list[str]: Lemma pieces whose tag starts with ``'n'``, in document
        order (empty when nothing matches).
    """
    # NOTE(review): the 'n' prefix is taken on trust as the noun marker of the
    # loaded tagset — confirm it does not also admit pronoun/numeral tags.
    if pipeline is None:
        pipeline = nlp  # fall back to the pipeline loaded in the notebook

    doc = pipeline(text)
    nouns = []
    for sentence in doc.sentences:
        for word in sentence.words:
            # A word without a lemma or tag cannot be classified; skip it
            # instead of failing on None.split().
            if word.lemma is None or word.xpos is None:
                continue
            pieces = zip(word.lemma.split('+'), word.xpos.split('+'))
            # zip() pairs pieces by position and stops at the shorter list.
            nouns.extend(tok for tok, pos in pieces if pos.startswith('n'))
    return nouns

In [25]:
# Wrapping in list() makes this call work with either extract_noun definition
# above (the generator version or the list-returning one).
list(extract_noun('편의점에서 알바를 한다.'))

['편의점', '알바']