In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [2]:
DATA_PATH =  os.getcwd() + '/data/review/'
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device,DATA_PATH

('cuda', 'c:\\study\\04_NLP/data/review/')

In [3]:
train = pd.read_csv(f"{DATA_PATH}review_train.csv")
test = pd.read_csv(f"{DATA_PATH}review_test.csv")

train.shape, test.shape

((2000, 3), (1000, 2))

- 긍정 1, 부정 0

In [4]:
train.head()

Unnamed: 0,id,review,target
0,train_0,이런 최고의 영화를 이제서야 보다니,1
1,train_1,안봤지만 유승준나와서 비추.,0
2,train_2,시대를 못 따라간 연출과 촌스러운 영상미.,0
3,train_3,원소전 굿,1
4,train_4,ㅋㅋㅋㅋ 개봉영화평점단사람이1명 ㅋㅋㅋㅋ,1


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2000 non-null   object
 1   review  2000 non-null   object
 2   target  2000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [6]:
train["target"].mean()

0.519

# kiwi 라이브러리

```shell
!pip install kiwipiepy
```

In [7]:
!pip install kiwipiepy

Collecting kiwipiepy
  Downloading kiwipiepy-0.18.0-cp312-cp312-win_amd64.whl.metadata (1.1 kB)
Collecting kiwipiepy-model<0.19,>=0.18 (from kiwipiepy)
  Downloading kiwipiepy_model-0.18.0.tar.gz (34.7 MB)
     ---------------------------------------- 0.0/34.7 MB ? eta -:--:--
     -- ------------------------------------- 2.1/34.7 MB 11.8 MB/s eta 0:00:03
     ----- ---------------------------------- 4.7/34.7 MB 11.9 MB/s eta 0:00:03
     -------- ------------------------------- 7.1/34.7 MB 11.8 MB/s eta 0:00:03
     ---------- ----------------------------- 9.4/34.7 MB 11.7 MB/s eta 0:00:03
     ------------- ------------------------- 11.8/34.7 MB 11.7 MB/s eta 0:00:02
     --------------- ----------------------- 14.2/34.7 MB 11.7 MB/s eta 0:00:02
     ------------------ -------------------- 16.5/34.7 MB 11.7 MB/s eta 0:00:02
     --------------------- ----------------- 18.9/34.7 MB 11.7 MB/s eta 0:00:02
     ----------------------- --------------- 21.2/34.7 MB 11.7 MB/s eta 0:00:02
  



In [7]:
from kiwipiepy import Kiwi

kiwi = Kiwi()

In [8]:
text = train["review"][0]
text

'이런 최고의 영화를 이제서야 보다니'

In [9]:
result = kiwi.analyze(text)
result,result[0][0][0]

([([Token(form='이런', tag='MM', start=0, len=2),
    Token(form='최고', tag='NNG', start=3, len=2),
    Token(form='의', tag='JKG', start=5, len=1),
    Token(form='영화', tag='NNG', start=7, len=2),
    Token(form='를', tag='JKO', start=9, len=1),
    Token(form='이제서야', tag='MAG', start=11, len=4),
    Token(form='보', tag='VV', start=16, len=1),
    Token(form='다니', tag='EF', start=17, len=2)],
   -63.57200622558594)],
 Token(form='이런', tag='MM', start=0, len=2))

In [10]:
result = kiwi.analyze(text, top_n=2) # 분석 확률이 높은 몇개를 반환 할 것인지
result

[([Token(form='이런', tag='MM', start=0, len=2),
   Token(form='최고', tag='NNG', start=3, len=2),
   Token(form='의', tag='JKG', start=5, len=1),
   Token(form='영화', tag='NNG', start=7, len=2),
   Token(form='를', tag='JKO', start=9, len=1),
   Token(form='이제서야', tag='MAG', start=11, len=4),
   Token(form='보', tag='VV', start=16, len=1),
   Token(form='다니', tag='EF', start=17, len=2)],
  -63.57200622558594),
 ([Token(form='이런', tag='MM', start=0, len=2),
   Token(form='최고', tag='NNG', start=3, len=2),
   Token(form='의', tag='JKG', start=5, len=1),
   Token(form='영화', tag='NNG', start=7, len=2),
   Token(form='를', tag='JKO', start=9, len=1),
   Token(form='이제서야', tag='MAG', start=11, len=4),
   Token(form='보', tag='VV', start=16, len=1),
   Token(form='다니', tag='EF', start=17, len=2)],
  -63.57200622558594)]

In [11]:
tmp = result[0][0]
tmp

[Token(form='이런', tag='MM', start=0, len=2),
 Token(form='최고', tag='NNG', start=3, len=2),
 Token(form='의', tag='JKG', start=5, len=1),
 Token(form='영화', tag='NNG', start=7, len=2),
 Token(form='를', tag='JKO', start=9, len=1),
 Token(form='이제서야', tag='MAG', start=11, len=4),
 Token(form='보', tag='VV', start=16, len=1),
 Token(form='다니', tag='EF', start=17, len=2)]

In [12]:
tmp[0].form

'이런'

- 가장 높은 형태소 분석 결과만 반환하기

In [13]:
result = kiwi.tokenize(text)
result

[Token(form='이런', tag='MM', start=0, len=2),
 Token(form='최고', tag='NNG', start=3, len=2),
 Token(form='의', tag='JKG', start=5, len=1),
 Token(form='영화', tag='NNG', start=7, len=2),
 Token(form='를', tag='JKO', start=9, len=1),
 Token(form='이제서야', tag='MAG', start=11, len=4),
 Token(form='보', tag='VV', start=16, len=1),
 Token(form='다니', tag='EF', start=17, len=2)]

In [14]:
for token in result:
    print(token.form , token.tag) # 분리된 단어, 품사정보

이런 MM
최고 NNG
의 JKG
영화 NNG
를 JKO
이제서야 MAG
보 VV
다니 EF


- iterable 객체를 전달할 경우 map 객체로 반환

In [15]:
result = kiwi.tokenize(train["review"][:2])
result # 이터러블한 객체임

<map at 0x1833004b9d0>

In [16]:
for lst in result: # 꺼낼 때 마다 형태소 분석을 수행
    print(lst)

[Token(form='이런', tag='MM', start=0, len=2), Token(form='최고', tag='NNG', start=3, len=2), Token(form='의', tag='JKG', start=5, len=1), Token(form='영화', tag='NNG', start=7, len=2), Token(form='를', tag='JKO', start=9, len=1), Token(form='이제서야', tag='MAG', start=11, len=4), Token(form='보', tag='VV', start=16, len=1), Token(form='다니', tag='EF', start=17, len=2)]
[Token(form='안', tag='MAG', start=0, len=1), Token(form='보', tag='VV', start=1, len=1), Token(form='었', tag='EP', start=1, len=1), Token(form='지만', tag='EC', start=2, len=2), Token(form='유승준', tag='NNP', start=5, len=3), Token(form='나오', tag='VV', start=8, len=2), Token(form='어서', tag='EC', start=9, len=2), Token(form='비추', tag='VV', start=12, len=2), Token(form='.', tag='SF', start=14, len=1)]


In [17]:
data_sample=[]
num=0
result = kiwi.tokenize(train["review"][:2])

for lst in result:
    sub=[]
    num+=1
    print(f'{num}번째 문장')
    for token in lst:
        sub.append(token.form)

    data_sample.append(sub)

1번째 문장
2번째 문장


In [18]:
data_sample

[['이런', '최고', '의', '영화', '를', '이제서야', '보', '다니'],
 ['안', '보', '었', '지만', '유승준', '나오', '어서', '비추', '.']]

## 불용어

In [19]:
from kiwipiepy.utils import Stopwords
stopwords = Stopwords()
stopwords.stopwords # 불용어 리스트

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),
 ('원', 'NNB'),


- 불용어 추가하기

In [20]:
stopwords.add("재연")
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),
 ('원', 'NNB'),


In [21]:
stopwords.add(["민수","길동","재연"]) # 여려개 추가 가능

In [22]:
stopwords.add( ("철수", "NNG") ) # 품사를 지정해서 추가 가능, 단 튜플로 전달해야함.

- 불용어 제거하기

In [23]:
stopwords.remove(["재연", "민수", "길동", ("철수", "NNG") ])

```
train["review"] 의 문장들을 kiwi 를 이용하여 토큰화해서 새로운 리스트에 담아주세요.
품사정보가 N, V , J , M 으로 시작하는 토큰만 담아주세요.
```

In [24]:
result = kiwi.tokenize(train["review"], stopwords=stopwords) 
# stopwords=stopwords 는 불용어 제거할 경우
train_list = []

for lst in tqdm(result, total=train.shape[0]):
    tmp = [ token.form for token in lst if token.tag[0] in "NVJM" ]
    train_list.append(tmp)

  0%|          | 0/2000 [00:00<?, ?it/s]

- 토큰이 하나도 없는 샘플이 존재하는지 확인해보기

In [25]:
min( len(lst) for lst in train_list  ) 
# 각 행에 최소 길이가 존재

0

In [26]:
word_count = [ len(lst) for lst in train_list ]
mask = np.array(word_count) == 0
mask.sum()

36

In [27]:
train.loc[mask] # 토큰화에 안걸린 결과 값들

Unnamed: 0,id,review,target
31,train_31,대박....,1
102,train_102,What a great drama!!!,1
103,train_103,Space Jason!!!!,0
225,train_225,the roles play very real touching,1
307,train_307,참신하지는 않다,0
342,train_342,...,0
524,train_524,harry potter go!,1
581,train_581,ㅋㅋ,1
640,train_640,.,0
670,train_670,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,0


# spacy 라이브러리

- 한국어 형태소 분석 모델 다운로드

In [86]:
!python -m spacy download ko_core_news_sm

Collecting ko-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ko_core_news_sm-3.7.0/ko_core_news_sm-3.7.0-py3-none-any.whl (14.7 MB)
     ---------------------------------------- 0.0/14.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/14.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/14.7 MB ? eta -:--:--
     - -------------------------------------- 0.5/14.7 MB 1.9 MB/s eta 0:00:08
     ---- ----------------------------------- 1.8/14.7 MB 4.6 MB/s eta 0:00:03
     -------- ------------------------------- 3.1/14.7 MB 5.8 MB/s eta 0:00:03
     ---------------- ----------------------- 6.0/14.7 MB 7.5 MB/s eta 0:00:02
     ------------------- -------------------- 7.3/14.7 MB 8.1 MB/s eta 0:00:01
     ----------------------------- ---------- 10.7/14.7 MB 8.9 MB/s eta 0:00:01
     ----------------------------------- ---- 13.1/14.7 MB 9.3 MB/s eta 0:00:01
     ---------------------------------------- 14.7



In [28]:
import spacy
nlp = spacy.load('ko_core_news_sm')
nlp

<spacy.lang.ko.Korean at 0x183407d5760>

In [29]:
text

'이런 최고의 영화를 이제서야 보다니'

In [30]:
doc = nlp(text)
doc

이런 최고의 영화를 이제서야 보다니

In [31]:
lst = [  [token.text, token.lemma_, token.tag_, token.is_alpha, token.is_stop ]  for token in doc  ]
pd.DataFrame(lst, columns= ["원래단어", "형태소", "품사", "한글여부", "불용어여부"])

Unnamed: 0,원래단어,형태소,품사,한글여부,불용어여부
0,이런,이런,mmd,True,False
1,최고의,최고+의,ncn+jcm,True,False
2,영화를,영화+를,ncn+jco,True,False
3,이제서야,이제서야,ncpa+xsv+ecs,True,False
4,보다니,보+다니,pvg+ecs,True,False


In [32]:
train_list = []
for text in tqdm( train["review"] ):
    doc = nlp(text)
    token_list = []
    
    for token in doc:
        tmp = token.lemma_.split("+")
        token_list.extend(tmp)

    train_list.append(token_list)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [33]:
token_list

['보다가', '감', '동', '...', '진짜', '울', '뻔']

# konlpy 라이브러리
- C++, 자바 등 다른 언어로 개발된 오픈소스 형태소 분석기를 파이썬 환경에서 사용할 수 있게 해주는 라이브러리

In [93]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
   ---------------------------------------- 0.0/19.4 MB ? eta -:--:--
   --- ------------------------------------ 1.8/19.4 MB 10.1 MB/s eta 0:00:02
   -------- ------------------------------- 4.2/19.4 MB 11.0 MB/s eta 0:00:02
   -------------- ------------------------- 6.8/19.4 MB 11.3 MB/s eta 0:00:02
   ------------------ --------------------- 9.2/19.4 MB 11.4 MB/s eta 0:00:01
   ----------------------- ---------------- 11.5/19.4 MB 11.5 MB/s eta 0:00:01
   ---------------------------- ----------- 13.9/19.4 MB 11.5 MB/s eta 0:00:01
   --------------------------------- ------ 16.3/19.4 MB 11.5 MB/s eta 0:00:01
   -------------------------------------- - 18.6/19.4 MB 11.5 MB/s eta 0:00:01
   ---------------------------------------- 19.4/19.



In [34]:
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

In [35]:
text = train["review"][0]
text

'이런 최고의 영화를 이제서야 보다니'

## Okt

In [36]:
tokenizer = Okt()

In [37]:
tokenizer.morphs(text)

['이런', '최고', '의', '영화', '를', '이제', '서야', '보다니']

In [38]:
tokenizer.pos(text)

[('이런', 'Adjective'),
 ('최고', 'Noun'),
 ('의', 'Josa'),
 ('영화', 'Noun'),
 ('를', 'Josa'),
 ('이제', 'Noun'),
 ('서야', 'Verb'),
 ('보다니', 'Verb')]

## Hannanum

In [39]:
tokenizer = Hannanum()
tokenizer.morphs(text)

['이런', '최고', '의', '영화', '를', '이제', '서야', '보', '이', '다니']

In [40]:
tokenizer.pos(text)

[('이런', 'M'),
 ('최고', 'N'),
 ('의', 'J'),
 ('영화', 'N'),
 ('를', 'J'),
 ('이제', 'M'),
 ('서야', 'J'),
 ('보', 'N'),
 ('이', 'J'),
 ('다니', 'E')]

## Komoran

In [41]:
tokenizer = Komoran()
tokenizer.morphs(text)

['이런', '최고', '의', '영화', '를', '이제서야', '보', '다니']

In [42]:
tokenizer.pos(text)

[('이런', 'MM'),
 ('최고', 'NNG'),
 ('의', 'JKG'),
 ('영화', 'NNG'),
 ('를', 'JKO'),
 ('이제서야', 'MAG'),
 ('보', 'VV'),
 ('다니', 'EC')]

## Kkma

In [43]:
tokenizer = Kkma()
tokenizer.morphs(text)

['이런', '최고', '의', '영화', '를', '이제서야', '보', '다니']

In [44]:
tokenizer.pos(text)

[('이런', 'MDT'),
 ('최고', 'NNG'),
 ('의', 'JKG'),
 ('영화', 'NNG'),
 ('를', 'JKO'),
 ('이제서야', 'MAG'),
 ('보', 'VV'),
 ('다니', 'EFN')]

# mecab 라이브러리

In [105]:
!pip install python-mecab-ko

Collecting python-mecab-ko
  Downloading python_mecab_ko-1.3.7-cp312-cp312-win_amd64.whl.metadata (3.5 kB)
Collecting python-mecab-ko-dic (from python-mecab-ko)
  Downloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl.metadata (1.4 kB)
Downloading python_mecab_ko-1.3.7-cp312-cp312-win_amd64.whl (653 kB)
   ---------------------------------------- 0.0/653.5 kB ? eta -:--:--
   --------------------------------------- 653.5/653.5 kB 12.6 MB/s eta 0:00:00
Downloading python_mecab_ko_dic-2.1.1.post2-py3-none-any.whl (34.5 MB)
   ---------------------------------------- 0.0/34.5 MB ? eta -:--:--
   -- ------------------------------------- 2.4/34.5 MB 12.2 MB/s eta 0:00:03
   ----- ---------------------------------- 4.7/34.5 MB 11.9 MB/s eta 0:00:03
   -------- ------------------------------- 7.1/34.5 MB 11.8 MB/s eta 0:00:03
   ---------- ----------------------------- 9.4/34.5 MB 11.7 MB/s eta 0:00:03
   ------------- -------------------------- 11.8/34.5 MB 11.5 MB/s eta 0:00:02
   ---



In [45]:
from mecab import MeCab

In [46]:
tokenizer = MeCab()

In [47]:
tokenizer.morphs(text)

['이런', '최고', '의', '영화', '를', '이제서야', '보', '다니']

In [48]:
tokenizer.pos(text)

[('이런', 'MM'),
 ('최고', 'NNG'),
 ('의', 'JKG'),
 ('영화', 'NNG'),
 ('를', 'JKO'),
 ('이제서야', 'MAG'),
 ('보', 'VV'),
 ('다니', 'EC')]

- 속도 비교 해보기

In [49]:
tokenizer = MeCab() # 클래스 객체 변경해보면서 속도 비교해보세요
train_list = []
for text in tqdm( train["review"] ):
    lst = tokenizer.pos(text)
    lst = [ t for t, p in lst ] # 한 문장의 단어들 리스트
    train_list.append(lst) # 문장들의 묶음 리스트

  0%|          | 0/2000 [00:00<?, ?it/s]

In [50]:
train_list

[['이런', '최고', '의', '영화', '를', '이제서야', '보', '다니'],
 ['안', '봤', '지만', '유승준', '나와서', '비추', '.'],
 ['시대', '를', '못', '따라간', '연출', '과', '촌스러운', '영상미', '.'],
 ['원소전', '굿'],
 ['ㅋㅋ', 'ㅋㅋ', '개봉', '영화', '평점', '단', '사람', '이', '1', '명', 'ㅋㅋ', 'ㅋㅋ'],
 ['실화', '라니', '너무', '가슴', '아프', '다', '.', '..'],
 ['뭐', '야', '이거', 'ㅡㅡ폴', '워커', '믿', '고', '볼', '영화', '는', '분질', '뿐', '인가'],
 ['갑자기',
  '생각나',
  '서',
  '오늘',
  '다시',
  '봤',
  '는데',
  '.',
  '..',
  '역시',
  '.',
  '..',
  '말',
  '이',
  '필요',
  '없',
  '네요'],
 ['주', '님', '사랑', '합니다', '.', '행복', '합니다', '.'],
 ['하드보일드',
  '액션',
  '.',
  '..',
  '그건',
  '봐',
  '줄',
  '만',
  '한데',
  '.',
  '..',
  '스토리',
  '가',
  '약해',
  '.',
  '..',
  '그래서',
  '설득력',
  '도',
  '떨어지',
  '고'],
 ['믿',
  '을',
  '수',
  '없',
  '어',
  '이렇게',
  '평점',
  '이',
  '높',
  '다니',
  'ㅋㅋㅋ',
  'ㅋㅋㅋ',
  'ㅋㅋㅋ',
  'ㅋㅋㅋ',
  'ㅋㅋㅋ',
  'ㅋㅋㅋ',
  'ㅋㅋㅋ'],
 ['스토리', '가', '좀', '아쉽', '긴', '하지만', '나름', '설레', '면서', '봄'],
 ['춘추',
  '전국',
  '시대',
  '에',
  '공자',
  '를',
  '비롯',
  '한',
  '그',
  '제자',
  '들',
  '이',
  

# kiwi 사용해서 토큰화 데이터셋 만들기

In [51]:
kiwi = Kiwi()

In [52]:
result = kiwi.tokenize( train["review"] ) # 샘플별로 형태소 분석을 할 수 있는 map 객체 생성
train_list = [] # 토큰화된 샘플들을 담을 리스트
for tokens in tqdm(result, total=train.shape[0] ):
    lst = [ t.form for t in tokens]
    train_list.append(lst)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [53]:
min( len(lst) for lst in train_list ) # 키위토큰나이저에 필터링 되지 않은 것

1

In [54]:
word_count = [ len(lst) for lst in train_list ]
mask = np.array(word_count) == 1
train.loc[mask]

Unnamed: 0,id,review,target
289,train_289,할렐루야,1
342,train_342,...,0
386,train_386,굿,1
400,train_400,군,1
470,train_470,별로,0
522,train_522,최악,0
581,train_581,ㅋㅋ,1
640,train_640,.,0
670,train_670,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,0
764,train_764,곧,0


In [55]:
result = kiwi.tokenize( test["review"] ) # 샘플별로 형태소 분석을 할 수 있는 map 객체 생성
test_list = [] # 토큰화된 샘플들을 담을 리스트
for tokens in tqdm(result, total=test.shape[0] ):
    lst = [ t.form for t in tokens]
    test_list.append(lst)

  0%|          | 0/1000 [00:00<?, ?it/s]

# 어휘집 만들기

In [56]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(train_list, specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])



In [57]:
len(vocab)

4853

# 토큰별로 단어번호 부여하기

In [58]:
train_list = [ vocab(lst) for lst in train_list ]
test_list = [ vocab(lst) for lst in test_list ]

# 패딩

In [59]:
token_count = [ len(lst) for lst in train_list  ]

np.mean(token_count), np.min(token_count) , np.max(token_count)

(19.654, 1, 94)

In [60]:
max_len = np.max(token_count).astype(int)
max_len

94

In [61]:
train_data = [ [0] * (max_len-len(lst)) + lst  if len(lst) < max_len else lst[:max_len]  for lst in train_list ]
test_data = [ [0] * (max_len-len(lst)) + lst  if len(lst) < max_len else lst[:max_len]  for lst in test_list ]

train_data = np.array(train_data)
test_data = np.array(test_data)

train_data.shape, test_data.shape # batch, seq

((2000, 94), (1000, 94))

- 정답 데이터

In [62]:
target = train["target"].to_numpy().reshape(-1, 1)
target.shape

(2000, 1)

```
평가지표는 정확도
```

# 데이터셋 클래스

In [63]:
train_data.dtype

dtype('int32')

In [64]:
target.dtype

dtype('int64')

In [65]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, x, y=None):
        self.x = x  # 토큰화되어 단어번호가 부여된 데이터셋
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        item = {}
        item["x"] = torch.tensor(self.x[idx]) # 기존 int64 형태의 타입을 유지하면서 텐서로 변경하기 위해

        if self.y is not None:
            item["y"] = torch.Tensor(self.y[idx]) # float32 형태로 전달해야하기 때문에 Tensor 클래스 사용하여 텐서 변환
        return item

In [66]:
class dataset(torch.utils.data.Dataset):
    def __init__(self,x,y=None):
        self.x=x
        if y is not None:
            if y.ndim != 2:
                self.y=y.reshape(-1,1)
            else: self.y=y
    def __len__(self):
        return len(self.x)
    def __getitem__(self,idx):
        return {"x":torch.tensor(self.x[idx]),"y":torch.Tensor(self.y[idx])} if self.y is not None else {"x":torch.tensor(self.x[idx])}

In [67]:
dt = dataset(train_data, target)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,   83,   68,   12,    7,   27, 1189,   10,  210],
         [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0, 

In [68]:
dt = ReviewDataset(train_data, target)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,   83,   68,   12,    7,   27, 1189,   10,  210],
         [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0, 

# 모델 클래스

In [69]:
class Net(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.emb_layer = torch.nn.Embedding(vocab_size, embedding_dim) # 단어 사전 크기와 임베딩 벡터 크기 전달

        self.conv1d_block = torch.nn.Sequential(
            torch.nn.Conv1d(embedding_dim, embedding_dim * 2, 3), # 입력 피처크기, 출력 피처크기, 커널 사이즈
            torch.nn.ReLU(), # 비선형성
            torch.nn.MaxPool1d(2), # seq 차원 크기가 절반 줄이기
            torch.nn.Conv1d(embedding_dim*2, embedding_dim * 4, 3), # 입력 피처크기, 출력 피처크기, 커널 사이즈
            torch.nn.ReLU(), # 비선형성
            torch.nn.MaxPool1d(2), # seq 차원 크기가 절반 줄이기
        )
        self.gl_pool = torch.nn.AdaptiveMaxPool1d(1)
        self.output_layer = torch.nn.Linear(embedding_dim * 4, 1)

    def forward(self, x):
        x = self.emb_layer(x) # batch, seq , features
        x = x.permute(0, 2, 1) # batch, features, seq
        x = self.conv1d_block(x) # batch, features, seq , conv1d_block 층임
        x = self.gl_pool(x) # batch, features, 1
        x = x.flatten(1) # batch, features x 1
        return self.output_layer(x)

In [70]:
model = Net(len(vocab), 128)
model(batch["x"])

tensor([[-0.1784],
        [-0.1814]], grad_fn=<AddmmBackward0>)

# 학습 loop 함수

In [71]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train() # 학습 모드
    
    for batch in dataloader:

        pred = model( batch["x"].to(device) )
        loss = loss_fn( pred, batch["y"].to(device) )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

# 검증 및 테스트 loop 함수

In [72]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    epoch_loss = 0
    pred_list = []
    act_func = torch.nn.Sigmoid()
    model.eval() # 평가 모드

    for batch in dataloader:
        pred = model( batch["x"].to(device) )

        if batch.get("y") is not None:
            loss = loss_fn( pred, batch["y"].to(device) )
            epoch_loss += loss.item()

        pred = act_func(pred) # logit 값을 확률로 변환
        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    
    return epoch_loss, pred

# 하이퍼파라미터 정의

In [134]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
n_splits = 5
cv = KFold(n_splits, shuffle=True, random_state=SEED)

batch_size = 32 # 배치 사이즈
loss_fn = torch.nn.BCEWithLogitsLoss() # 손실 객체
epochs = 100 # 최대 가능한 에폭수

vocab_size = len(vocab) # 단어 사전 크기
embedding_dim = 128 # 임베딩 벡터 크기

# 학습

In [109]:
DATA_PATH

'c:\\study\\04_NLP/data/review/'

In [138]:
is_holdout = False
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []

for i, (tri, vai) in enumerate( cv.split(train_data) ):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(train_data[tri], target[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(train_data[vai], target[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(vocab_size, embedding_dim).to(device)
    optimizer = torch.optim.Adam( model.parameters() )

    best_score = 0 # 현재 최고 점수
    patience = 0 # 조기 종료 조건을 주기 위한 변수
    
    for epoch in range(epochs):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        pred = (pred > 0.5).astype(int) # 이진분류 문제에서 클래스 번호 결정
        score = accuracy_score(target[vai], pred)

        #print(train_loss, valid_loss, score)
        if score > best_score:
            best_score = score # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"{DATA_PATH}\weight\model_{i}.pth") # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 정확도: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break

  torch.save(model.state_dict(), f"{DATA_PATH}\weight\model_{i}.pth") # 최고 점수 모델 가중치 저장


0번째 폴드 최고 정확도: 0.7475
1번째 폴드 최고 정확도: 0.76
2번째 폴드 최고 정확도: 0.755
3번째 폴드 최고 정확도: 0.7325
4번째 폴드 최고 정확도: 0.7425


In [139]:
np.mean(best_score_list)

0.7475

# 추론

In [140]:
test_dt = ReviewDataset(test_data)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=batch_size, shuffle=False)

In [141]:
device

'cuda'

In [143]:
pred_list = []
for i in tqdm(range(n_splits)):
    model = Net(vocab_size, embedding_dim).to(device)
    state_dict = torch.load(f"{DATA_PATH}\weight\model_{i}.pth")
    model.load_state_dict(state_dict)

    _, pred = test_loop(test_dl, model, loss_fn, device)

    pred_list.append(pred)
    if is_holdout:
        break

  state_dict = torch.load(f"{DATA_PATH}\weight\model_{i}.pth")


  0%|          | 0/5 [00:00<?, ?it/s]

In [144]:
pred = np.mean(pred_list, axis=0)
(pred > 0.5).astype(int)

array([[0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
    