<a href="https://colab.research.google.com/github/jhkr1/Practical-Al-Natural-Language-Processing/blob/main/Basic_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 텍스트 전처리(Text PreProcessing)

## 1) 토큰화(Tokenizing)

In [None]:
# 1. NLTK 라이브러리를 3.8.1 버전으로 강제 재설치합니다.
!pip install --force-reinstall nltk==3.8.1

# 2. 변경된 버전을 적용하기 위해 런타임을 강제로 재시작합니다.
import os
os.kill(os.getpid(), 9)

Collecting nltk==3.8.1
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk==3.8.1)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk==3.8.1)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk==3.8.1)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from nltk==3.8.1)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp311-cp311-manylinux_

In [1]:
import nltk
from nltk.tokenize import word_tokenize

# 다운그레이드된 NLTK를 위해 punkt 데이터를 다운로드합니다.
nltk.download('punkt')

text = "This is the very last attempt with a stable NLTK version 3.8.1."
tokens = word_tokenize(text)

print("--- 특정 버전(3.8.1) 설치 후 최종 테스트 ---")
print("성공! 토큰화 결과:")
print(tokens)

# 실제로 버전이 3.8.1로 변경되었는지 확인합니다.
import nltk
print("\n설치된 NLTK 버전:", nltk.__version__)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


--- 특정 버전(3.8.1) 설치 후 최종 테스트 ---
성공! 토큰화 결과:
['This', 'is', 'the', 'very', 'last', 'attempt', 'with', 'a', 'stable', 'NLTK', 'version', '3.8.1', '.']

설치된 NLTK 버전: 3.8.1


In [2]:
import nltk
from nltk.tokenize import WordPunctTokenizer
text = "Barak Obama likes fried chicken very much"
wordpuncttoken = WordPunctTokenizer().tokenize(text)
print(wordpuncttoken)

['Barak', 'Obama', 'likes', 'fried', 'chicken', 'very', 'much']


In [3]:
import nltk
from nltk.tokenize import TreebankWordTokenizer

text = "Barak Obama likes fried chicken very much"
TreebankWordToken = TreebankWordTokenizer().tokenize(text)
print(TreebankWordToken)

['Barak', 'Obama', 'likes', 'fried', 'chicken', 'very', 'much']


## 2) 영문 품사 부착(PoS Tagging)

In [4]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
taggedToken = pos_tag(wordpuncttoken)
print(taggedToken)

[('Barak', 'NNP'), ('Obama', 'NNP'), ('likes', 'VBZ'), ('fried', 'VBN'), ('chicken', 'JJ'), ('very', 'RB'), ('much', 'JJ')]


## 3) 개채명 인식(NER, Named Entity Recognition)

In [6]:
nltk.download('words')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [7]:
from nltk import ne_chunk
neToken = ne_chunk(taggedToken)
print(neToken)

(S
  (PERSON Barak/NNP)
  (ORGANIZATION Obama/NNP)
  likes/VBZ
  fried/VBN
  chicken/JJ
  very/RB
  much/JJ)


## 4) 원형 복원

### 4-1) 어간추출(Stemming)


In [8]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print('running ->', ps.stem('running'))
print('beautiful ->', ps.stem('beautiful'))
print('believes ->', ps.stem('believes'))
print('using ->', ps.stem('using'))

running -> run
beautiful -> beauti
believes -> believ
using -> use


### 4-2) 표제어 추출(Lemmatization)

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

print("running -> " + wl.lemmatize("running"))
print("beautiful -> " + wl.lemmatize("beautiful"))
print("believes -> " + wl.lemmatize("believes"))
print("using -> " + wl.lemmatize("using"))
print("conversation -> " + wl.lemmatize("conversation"))
print("organization -> " + wl.lemmatize("organization"))
print("studies -> " + wl.lemmatize("studies"))

running -> running
beautiful -> beautiful
believes -> belief
using -> using
conversation -> conversation
organization -> organization
studies -> study


## 5) 불용어 처리(Stopword)

In [11]:
stopPos = ['IN', 'CC', 'UH', 'TO', 'MD', 'DT', 'VBZ','VBP']

In [12]:
from collections import Counter
Counter(taggedToken).most_common()

[(('Barak', 'NNP'), 1),
 (('Obama', 'NNP'), 1),
 (('likes', 'VBZ'), 1),
 (('fried', 'VBN'), 1),
 (('chicken', 'JJ'), 1),
 (('very', 'RB'), 1),
 (('much', 'JJ'), 1)]

In [13]:
stopWord = [',', 'be', 'able', 'very']

word = []
for tag in taggedToken:
  if tag[1] not in stopPos:
    if tag[0] not in stopWord:
      word.append(tag[0])
print(word)

['Barak', 'Obama', 'fried', 'chicken', 'much']


## 6) 영문 텍스트 전처리 종합

In [21]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('maxent_ne_chunker')
nltk.download('wordnet')

from nltk.tokenize import TreebankWordTokenizer
text = "Barak Obama loves fried chicken of KFC"
sumtoken = TreebankWordTokenizer().tokenize(text)
print(sumtoken)

from nltk import pos_tag
sumTaggedToken = pos_tag(sumtoken)
print(sumTaggedToken)

from nltk import ne_chunk
sumNeToken = ne_chunk(sumTaggedToken)
print(sumNeToken)

from nltk.stem import PorterStemmer
ps = PorterStemmer()
print("loves -> " + ps.stem("loves"))
print("fried -> " + ps.stem("fried"))

from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
print("loves -> " + wl.lemmatize("loves"))
print("fried -> " + wl.lemmatize("fried"))


# 불용어 처리
sumStopPos = ['IN']
sumStopWord = ['fried']

word = []
for tag in sumTaggedToken:
  if tag[1] not in sumStopPos:
    if tag[0] not in sumStopWord:
      word.append(tag[0])
print(word)

['Barak', 'Obama', 'loves', 'fried', 'chicken', 'of', 'KFC']
[('Barak', 'NNP'), ('Obama', 'NNP'), ('loves', 'VBZ'), ('fried', 'VBN'), ('chicken', 'NN'), ('of', 'IN'), ('KFC', 'NNP')]
(S
  (PERSON Barak/NNP)
  (ORGANIZATION Obama/NNP)
  loves/VBZ
  fried/VBN
  chicken/NN
  of/IN
  (ORGANIZATION KFC/NNP))
loves -> love
fried -> fri
loves -> love
fried -> fried
['Barak', 'Obama', 'loves', 'chicken', 'KFC']


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2. 한글 전처리 실습

영문은 공백으로 토큰화가 가능하지만, 한글의 경우 품사를 고려하여 토큰화 해야한다.

## 1) 한글 토큰화 및 형태소 분석

In [22]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.1/494.1 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.2 konlpy-0.6.0


In [27]:
# 코모란(Komoran) 토큰화
from konlpy.tag import Komoran
komoran= Komoran()
kor_text = '인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못한다면 컴퓨터는 지능적인 것으로 간주가 가능하다.'
komoran_tokens = komoran.morphs(kor_text)
print(komoran_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', 'ㄴ다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으로', '간주', '가', '가능', '하', '다', '.']


In [23]:
# 한나눔 토큰화
from konlpy.tag import Hannanum
hannanum = Hannanum()
hannanum_tokens = hannanum.morphs(kor_text)
print(hannanum_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하고', '있', '다는', '것', '을', '깨닫', '지', '못하', 'ㄴ다면', '컴퓨터', '는', '지능적', '이', 'ㄴ', '것', '으로', '간주', '가', '가능', '하', '다', '.']


In [24]:
# Okt 토큰화
from konlpy.tag import Okt
okt = Okt()
okt_tokens = okt.morphs(kor_text)
print(okt_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하고', '있다는', '것', '을', '깨닫지', '못', '한다면', '컴퓨터', '는', '지능', '적', '인', '것', '으로', '간주', '가', '가능하다', '.']


In [25]:
from konlpy.tag import Kkma
kkma = Kkma()
kkma_tokens = kkma.morphs(kor_text)
print(kkma_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', 'ㄴ다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으로', '간주', '가', '가능', '하', '다', '.']


## 2) 한글 품사 부착(PoS Tagging)

In [28]:
# 코모란(Komoran) 품사 태깅
komoranTag = []
for token in komoran_tokens:
    komoranTag += komoran.pos(token)
print(komoranTag)

[('인간', 'NNG'), ('이', 'MM'), ('컴퓨터', 'NNG'), ('오', 'VV'), ('아', 'EC'), ('대화', 'NNG'), ('하', 'NNG'), ('고', 'MM'), ('있', 'VV'), ('달', 'VV'), ('는', 'ETM'), ('것', 'NNB'), ('을', 'NNG'), ('깨닫', 'VV'), ('지', 'NNB'), ('못', 'MAG'), ('하', 'MAG'), ('ㄴ다면', 'EC'), ('컴퓨터', 'NNG'), ('늘', 'VV'), ('ㄴ', 'ETM'), ('지능', 'NNP'), ('적', 'NNB'), ('이', 'MM'), ('ㄴ', 'JX'), ('것', 'NNB'), ('으로', 'JKB'), ('간주', 'NNG'), ('가', 'VV'), ('아', 'EC'), ('가능', 'XR'), ('하', 'NNG'), ('다', 'MAG'), ('.', 'SF')]


In [29]:
hannanumTag = []
for token in hannanum_tokens:
    hannanumTag += hannanum.pos(token)
print(hannanumTag)

[('인간', 'N'), ('이', 'M'), ('컴퓨터', 'N'), ('와', 'I'), ('대화', 'N'), ('하', 'P'), ('고', 'E'), ('있', 'N'), ('다', 'M'), ('는', 'J'), ('것', 'N'), ('을', 'N'), ('깨닫', 'N'), ('지', 'N'), ('못하', 'P'), ('어', 'E'), ('ㄴ다', 'N'), ('이', 'J'), ('면', 'E'), ('컴퓨터', 'N'), ('늘', 'P'), ('ㄴ', 'E'), ('지능적', 'N'), ('이', 'M'), ('ㄴ', 'N'), ('것', 'N'), ('으', 'N'), ('로', 'J'), ('간주', 'N'), ('가', 'J'), ('가능', 'N'), ('하', 'I'), ('다', 'M'), ('.', 'S')]


In [30]:
# Okt 품사 태깅
oktTag = []
for token in okt_tokens:
    oktTag += okt.pos(token)
print(oktTag)

[('인간', 'Noun'), ('이', 'Noun'), ('컴퓨터', 'Noun'), ('와', 'Verb'), ('대화', 'Noun'), ('하고', 'Verb'), ('있다는', 'Adjective'), ('것', 'Noun'), ('을', 'Josa'), ('깨닫지', 'Verb'), ('못', 'Noun'), ('한다면', 'Verb'), ('컴퓨터', 'Noun'), ('는', 'Verb'), ('지능', 'Noun'), ('적', 'Noun'), ('인', 'Noun'), ('것', 'Noun'), ('으로', 'Josa'), ('간주', 'Noun'), ('가', 'Verb'), ('가능하다', 'Adjective'), ('.', 'Punctuation')]


In [31]:
# Kkma 품사 태깅
kkmaTag = []
for token in kkma_tokens:
    kkmaTag += kkma.pos(token)
print(kkmaTag)

[('인간', 'NNG'), ('이', 'NNG'), ('컴퓨터', 'NNG'), ('오', 'VA'), ('아', 'ECS'), ('대화', 'NNG'), ('하', 'NNG'), ('고', 'NNG'), ('있', 'VA'), ('달', 'VV'), ('는', 'ETD'), ('것', 'NNB'), ('을', 'NNG'), ('깨닫', 'VV'), ('지', 'NNG'), ('못하', 'VX'), ('ㄴ다면', 'UN'), ('컴퓨터', 'NNG'), ('늘', 'VA'), ('ㄴ', 'ETD'), ('지능', 'NNG'), ('적', 'NNG'), ('이', 'NNG'), ('ㄴ', 'NNG'), ('것', 'NNB'), ('으', 'UN'), ('로', 'JKM'), ('간주', 'NNG'), ('가', 'NNG'), ('가능', 'NNG'), ('하', 'NNG'), ('다', 'NNG'), ('.', 'SF')]


## 3) 불용어(Stopword) 처리

In [35]:
# 불용어 처리
stopPos = ['Suffix', 'Punctuation', 'Josa','Foreign','Alpha','Number']

# 최빈어 조회. 최빈어를 조회하여 불용어 제거 대상을 선정
from collections import Counter
Counter(oktTag).most_common()

stopWord = ['의','이','로','두고','들','를','은','과','수','했다','것','있는','한다','하는','그','있다','할','이런','되기','해야','있게','여기']

word = []
for tag in oktTag:
  if tag[1] not in stopPos:
    if tag[0] not in stopWord:
      word.append(tag[0])
print(word)

['인간', '컴퓨터', '와', '대화', '하고', '있다는', '깨닫지', '못', '한다면', '컴퓨터', '는', '지능', '적', '인', '간주', '가', '가능하다']
