In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = "../data/"
SEED = 42
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 네이버 영화 리뷰 데이터
- 학습데이터
    - https://drive.google.com/file/d/1B1TjJQPR2POtZmUxUC7yjo6SqcL200D3/view?usp=sharing
- 테스트데이터
    - https://drive.google.com/file/d/1EsqnKZ-UWELNq46UPJ8psoOEc4nZNKDK/view?usp=sharing
- 긍정1, 부정0

In [2]:
train = pd.read_csv(f"{DATA_PATH}review_train.csv")
test = pd.read_csv(f"{DATA_PATH}review_test.csv")

train.shape, test.shape

((2000, 3), (1000, 2))

# kiwi를 이용한 형태소 분석
- Kiwipiepy는 한국어 형태소 분석기인 Kiwi(Korean Intelligent Word Identifier)의 Python 라이브러리
- 품사 정보
    - https://github.com/bab2min/Kiwi#%ED%92%88%EC%82%AC-%ED%83%9C%EA%B7%B8
- 설치
    ```bash
    pip install kiwipiepy
    ```

In [3]:
from kiwipiepy import Kiwi
kiwi = Kiwi()

In [4]:
text = train["review"][0]
text

'이런 최고의 영화를 이제서야 보다니'

## analyze 메서드
- 문서 전달 시 여러 개의 형태소 분석 결과 반환

In [5]:
kiwi.analyze(text, top_n=2)

[([Token(form='이런', tag='MM', start=0, len=2),
   Token(form='최고', tag='NNG', start=3, len=2),
   Token(form='의', tag='JKG', start=5, len=1),
   Token(form='영화', tag='NNG', start=7, len=2),
   Token(form='를', tag='JKO', start=9, len=1),
   Token(form='이제서야', tag='MAG', start=11, len=4),
   Token(form='보', tag='VV', start=16, len=1),
   Token(form='다니', tag='EF', start=17, len=2)],
  -63.7940559387207),
 ([Token(form='이런', tag='MM', start=0, len=2),
   Token(form='최고', tag='NNG', start=3, len=2),
   Token(form='의', tag='JKG', start=5, len=1),
   Token(form='영화', tag='NNG', start=7, len=2),
   Token(form='를', tag='JKO', start=9, len=1),
   Token(form='이제', tag='NNG', start=11, len=2),
   Token(form='서', tag='JKB', start=13, len=1),
   Token(form='야', tag='JX', start=14, len=1),
   Token(form='보', tag='VV', start=16, len=1),
   Token(form='다니', tag='EC', start=17, len=2)],
  -64.6888656616211)]

In [6]:
result = kiwi.analyze(text)
result[0][0]

[Token(form='이런', tag='MM', start=0, len=2),
 Token(form='최고', tag='NNG', start=3, len=2),
 Token(form='의', tag='JKG', start=5, len=1),
 Token(form='영화', tag='NNG', start=7, len=2),
 Token(form='를', tag='JKO', start=9, len=1),
 Token(form='이제서야', tag='MAG', start=11, len=4),
 Token(form='보', tag='VV', start=16, len=1),
 Token(form='다니', tag='EF', start=17, len=2)]

## tokenize 메서드
- 하나의 형태소 분석 결과 반환

In [7]:
result = kiwi.tokenize(text)
result

[Token(form='이런', tag='MM', start=0, len=2),
 Token(form='최고', tag='NNG', start=3, len=2),
 Token(form='의', tag='JKG', start=5, len=1),
 Token(form='영화', tag='NNG', start=7, len=2),
 Token(form='를', tag='JKO', start=9, len=1),
 Token(form='이제서야', tag='MAG', start=11, len=4),
 Token(form='보', tag='VV', start=16, len=1),
 Token(form='다니', tag='EF', start=17, len=2)]

In [8]:
result[0].form # 토큰 문자열

'이런'

In [9]:
result[0].tag # 품사

'MM'

In [10]:
result[0][0], result[0][1]

('이런', 'MM')

- iterable 한 객체를 전달할 경우 map 객체 반환

In [11]:
result = kiwi.tokenize(train["review"].iloc[:2])

for tokens in result:
    print(tokens)

[Token(form='이런', tag='MM', start=0, len=2), Token(form='최고', tag='NNG', start=3, len=2), Token(form='의', tag='JKG', start=5, len=1), Token(form='영화', tag='NNG', start=7, len=2), Token(form='를', tag='JKO', start=9, len=1), Token(form='이제서야', tag='MAG', start=11, len=4), Token(form='보', tag='VV', start=16, len=1), Token(form='다니', tag='EF', start=17, len=2)]
[Token(form='안', tag='MAG', start=0, len=1), Token(form='보', tag='VV', start=1, len=1), Token(form='었', tag='EP', start=1, len=1), Token(form='지만', tag='EC', start=2, len=2), Token(form='유승준', tag='NNP', start=5, len=3), Token(form='나오', tag='VV', start=8, len=2), Token(form='어서', tag='EC', start=9, len=2), Token(form='비추', tag='VV', start=12, len=2), Token(form='.', tag='SF', start=14, len=1)]


## 불용어

In [12]:
from kiwipiepy.utils import Stopwords

stopwords = Stopwords()
stopwords.stopwords # set 자료형

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),
 ('원', 'NNB'),


- 불용어 추가

In [13]:
stopwords.add("관수")
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('관수', 'NNP'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),

In [14]:
stopwords.add(["길동", "민수"])
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('관수', 'NNP'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('길동', 'NNP'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('민수', 'NNP'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'

In [15]:
stopwords.add(("철수", "NNP")) # 품사 지정해서 추가할 경우 튜플로 전달
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('관수', 'NNP'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('길동', 'NNP'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('민수', 'NNP'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'

- 불용어 삭제

In [16]:
stopwords.remove(["관수", "철수", "길동", "민수"])
stopwords.stopwords

{('ᆫ', 'ETM'),
 ('ᆫ', 'JX'),
 ('ᆫ다', 'EF'),
 ('ᆯ', 'ETM'),
 ('가', 'JKS'),
 ('같', 'VA'),
 ('것', 'NNB'),
 ('게', 'EC'),
 ('겠', 'EP'),
 ('고', 'EC'),
 ('고', 'JKQ'),
 ('과', 'JC'),
 ('과', 'JKB'),
 ('그', 'MM'),
 ('그', 'NP'),
 ('기', 'ETN'),
 ('까지', 'JX'),
 ('나', 'NP'),
 ('년', 'NNB'),
 ('는', 'ETM'),
 ('는', 'JX'),
 ('다', 'EC'),
 ('다', 'EF'),
 ('다고', 'EC'),
 ('다는', 'ETM'),
 ('대하', 'VV'),
 ('더', 'MAG'),
 ('던', 'ETM'),
 ('도', 'JX'),
 ('되', 'VV'),
 ('되', 'XSV'),
 ('들', 'XSN'),
 ('등', 'NNB'),
 ('따르', 'VV'),
 ('때', 'NNG'),
 ('때문', 'NNB'),
 ('라', 'EC'),
 ('라는', 'ETM'),
 ('로', 'JKB'),
 ('를', 'JKO'),
 ('만', 'JX'),
 ('만', 'NR'),
 ('말', 'NNG'),
 ('며', 'EC'),
 ('면', 'EC'),
 ('면서', 'EC'),
 ('명', 'NNB'),
 ('받', 'VV'),
 ('보', 'VV'),
 ('부터', 'JX'),
 ('사람', 'NNG'),
 ('성', 'XSN'),
 ('수', 'NNB'),
 ('아니', 'VCN'),
 ('않', 'VX'),
 ('어', 'EC'),
 ('어', 'EF'),
 ('어서', 'EC'),
 ('어야', 'EC'),
 ('없', 'VA'),
 ('었', 'EP'),
 ('에', 'JKB'),
 ('에게', 'JKB'),
 ('에서', 'JKB'),
 ('와', 'JC'),
 ('와', 'JKB'),
 ('우리', 'NP'),
 ('원', 'NNB'),


## 결합

In [17]:
text = train["review"][1]
text

'안봤지만 유승준나와서 비추.'

In [18]:
result = kiwi.tokenize(text)
result

[Token(form='안', tag='MAG', start=0, len=1),
 Token(form='보', tag='VV', start=1, len=1),
 Token(form='었', tag='EP', start=1, len=1),
 Token(form='지만', tag='EC', start=2, len=2),
 Token(form='유승준', tag='NNP', start=5, len=3),
 Token(form='나오', tag='VV', start=8, len=2),
 Token(form='어서', tag='EC', start=9, len=2),
 Token(form='비추', tag='VV', start=12, len=2),
 Token(form='.', tag='SF', start=14, len=1)]

In [19]:
tokens = [(t.form, t.tag) for t in result]
tokens

[('안', 'MAG'),
 ('보', 'VV'),
 ('었', 'EP'),
 ('지만', 'EC'),
 ('유승준', 'NNP'),
 ('나오', 'VV'),
 ('어서', 'EC'),
 ('비추', 'VV'),
 ('.', 'SF')]

In [20]:
kiwi.join(tokens)

'안 봤지만 유승준 나와서 비추.'

## 토큰화 해보기
- 불용어 제거와 함께 품사가 N, V로 시작하는 단어들만 토큰화

In [21]:
stopwords = Stopwords()
result = kiwi.tokenize(train["review"], stopwords=stopwords)
train_list = []

for tokens in result:
    token = [t.form for t in tokens if t.tag[0] in "NV"]
    train_list.append(token)

In [22]:
min(len(tokens) for tokens in train_list)

0

In [23]:
cnt = np.array([len(tokens) for tokens in train_list])
mask = cnt == 0
mask.sum()

49

In [24]:
train.loc[mask]

Unnamed: 0,id,review,target
31,train_31,대박....,1
102,train_102,What a great drama!!!,1
103,train_103,Space Jason!!!!,0
225,train_225,the roles play very real touching,1
307,train_307,참신하지는 않다,0
342,train_342,...,0
470,train_470,별로,0
524,train_524,harry potter go!,1
546,train_546,글쎄~ 별로던데~,0
581,train_581,ㅋㅋ,1


# spacy를 이용한 형태소 분석

In [25]:
# !python -m spacy download ko_core_news_sm

In [26]:
import spacy

In [30]:
nlp = spacy.load("ko_core_news_sm")

In [31]:
text

'안봤지만 유승준나와서 비추.'

In [32]:
doc = nlp(text)
doc

안봤지만 유승준나와서 비추.

In [33]:
doc[0].text, doc[0].lemma_, doc[0].tag_

('안봤지만', '안봤+지만', 'ncn+jxt')

## 토큰화 해보기

In [104]:
# train_list = []
# for text in tqdm(train["review"]):
#     doc = nlp(text)
#     tmp_list = []

#     for tokens in doc:
#         tmp = tokens.lemma_.split("+")
#         tmp_list.extend(tmp)

#     train_list.append(tmp_list)

# konlpy를 이용한 형태소 분석
- C++, 자바 등 다른 언어로 개발된 오픈소스 형태소 분석 라이브러리를 파이썬에서도 쉽게 사용할 수 있게 해주는 라이브러리
- 사용방법
    - 클래스 객체 생성
    - morphs 메서드와 pos 메서드 사용
    - morphs 메서드
        - 토큰화
    - pos 메서드
        - 품사 태깅이 추가된 토큰화
- 설치
```bash
pip install konlpy
```

In [35]:
from konlpy.tag import Okt, Komoran, Hannanum, Kkma

In [36]:
text

'안봤지만 유승준나와서 비추.'

## Okt

In [37]:
tokenizer = Okt()
tokenizer.morphs(text)

['안', '봤지만', '유승준', '나와서', '비추', '.']

In [118]:
tokenizer.pos(text)

[('보다가', 'Verb'),
 ('감동', 'Noun'),
 ('...', 'Punctuation'),
 ('진짜', 'Noun'),
 ('울', 'Modifier'),
 ('뻔', 'Noun')]

## Komoran

In [116]:
tokenizer = Komoran()
tokenizer.morphs(text)

['보', '다가', '감동', '...', '진짜', '울', '뻔']

In [115]:
tokenizer.pos(text)

[('보', 'VV'),
 ('다가', 'EC'),
 ('감동', 'NNG'),
 ('...', 'SE'),
 ('진짜', 'NNG'),
 ('울', 'NNP'),
 ('뻔', 'NNB')]

## Hannanum

In [119]:
tokenizer = Hannanum()
tokenizer.morphs(text)

['보다가감동', '...', '진짜울뻔']

In [120]:
tokenizer.pos(text)

[('보다가감동', 'N'), ('...', 'S'), ('진짜울뻔', 'N')]

## Kkma

In [121]:
tokenizer = Kkma()
tokenizer.morphs(text)

['보', '다가', '감동', '...', '진짜', '울', 'ㄹ', '뻔']

In [122]:
tokenizer.pos(text)

[('보', 'VV'),
 ('다가', 'ECD'),
 ('감동', 'NNG'),
 ('...', 'SE'),
 ('진짜', 'MAG'),
 ('울', 'VV'),
 ('ㄹ', 'ETD'),
 ('뻔', 'NNB')]

# Mecab 형태소 분석기

In [39]:
from mecab import MeCab

In [126]:
tokenizer = MeCab()
tokenizer.morphs(text)

['보', '다가', '감동', '.', '..', '진짜', '울', '뻔']

In [127]:
tokenizer.pos(text)

[('보', 'VV'),
 ('다가', 'EC'),
 ('감동', 'NNG'),
 ('.', 'SF'),
 ('..', 'SY'),
 ('진짜', 'MAG'),
 ('울', 'VV+ETM'),
 ('뻔', 'NNB')]

# 속도 비교

In [128]:
tokenizer = Okt() # 16초
train_list = []
for text in tqdm(train["review"]):
    tokens = tokenizer.pos(text)
    tokens = [t for t, p in tokens]
    train_list.append(tokens)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [129]:
tokenizer = Komoran() # 7초
train_list = []
for text in tqdm(train["review"]):
    tokens = tokenizer.pos(text)
    tokens = [t for t, p in tokens]
    train_list.append(tokens)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [130]:
tokenizer = Hannanum() # 24초
train_list = []
for text in tqdm(train["review"]):
    tokens = tokenizer.pos(text)
    tokens = [t for t, p in tokens]
    train_list.append(tokens)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [131]:
tokenizer = Kkma() # 1분27초
train_list = []
for text in tqdm(train["review"]):
    tokens = tokenizer.pos(text)
    tokens = [t for t, p in tokens]
    train_list.append(tokens)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [132]:
tokenizer = MeCab() # 1초 미만
train_list = []
for text in tqdm(train["review"]):
    tokens = tokenizer.pos(text)
    tokens = [t for t, p in tokens]
    train_list.append(tokens)

  0%|          | 0/2000 [00:00<?, ?it/s]

# Kiwi로 학습 데이터와 테스트 데이터 만들기

```
kiwi를 이용하여 학습 데이터와 테스트 데이터를 토큰화 후 각각 train_list와 test_list에 담기
```

In [143]:
kiwi = Kiwi()

In [145]:
# train_tokens = kiwi.tokenize(train["review"])
# train_list = []

# for tokens in train_tokens:
#     token = [t.form for t in tokens]
#     train_list.append(token)

In [152]:
train_tokens = kiwi.tokenize(train["review"])
train_list = [[t.form for t in tokens] for tokens in train_tokens]

In [153]:
test_tokens = kiwi.tokenize(test["review"])
test_list = [[t.form for t in tokens] for tokens in test_tokens]

In [177]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(max_features=500)
train_data = vec.fit_transform([" ".join(t) for t in train_list]).toarray()

In [178]:
test_data = vec.transform([" ".join(t) for t in test_list]).toarray()

In [179]:
train_data.shape, test_data.shape

((2000, 500), (1000, 500))

In [180]:
(train_data.sum(axis=1) == 0).sum()

135

In [181]:
target = train["target"].to_numpy().reshape(-1, 1)
target.shape

(2000, 1)

# 데이터셋 클래스

In [182]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, i):
        item = {}
        item["x"] = torch.Tensor(self.x[i])

        if self.y is not None:
            item["y"] = torch.Tensor(self.y[i])

        return item

In [183]:
dataset = ReviewDataset(train_data, target)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)
batch = next(iter(dataloader))
batch

{'x': tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0

# 모델 클래스

In [184]:
class ResidualBlock(torch.nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.fx = torch.nn.Sequential(
            torch.nn.Linear(in_features, in_features),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),
            torch.nn.Linear(in_features, in_features)
        )
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        fx = self.fx(x)
        hx = fx + x
        return self.relu(hx)

In [187]:
class Net(torch.nn.Module):
    def __init__(self, in_features, n_layers=8):
        super().__init__()

        self.init_layer = torch.nn.Sequential(
            torch.nn.Linear(in_features, in_features // 2),
            torch.nn.BatchNorm1d(in_features // 2),
            torch.nn.LeakyReLU()
        )

        res_list = [ResidualBlock(in_features // 2) for _ in range(n_layers)]
        self.seq = torch.nn.Sequential(*res_list)
        self.output_layer = torch.nn.Linear(in_features // 2, 1)

    def forward(self, x):
        x = self.init_layer(x)
        x = self.seq(x)
        return self.output_layer(x)

In [188]:
Net(train_data.shape[1])(batch["x"])

tensor([[0.2313],
        [0.5680]], grad_fn=<AddmmBackward0>)

# 학습 루프

In [189]:
def train_loop(dataloader, model, loss_function, optimizer, device):
    epoch_loss = 0
    model.train()

    for batch in dataloader:
        pred = model(batch["x"].to(device))
        loss = loss_function(pred, batch["y"].to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

# 테스트 루프

In [190]:
@torch.no_grad()
def test_loop(dataloader, model, loss_function, device):
    epoch_loss = 0
    model.eval()

    act = torch.nn.Sigmoid()
    pred_list = []
    for batch in dataloader:
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None:
            loss = loss_function(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        pred = act(pred)
        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    epoch_loss /= len(dataloader)

    return epoch_loss, pred

# 학습

In [193]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

n_splits = 5
batch_size = 32
epochs = 100
loss_function = torch.nn.BCEWithLogitsLoss()
cv = KFold(n_splits, shuffle=True, random_state=SEED)

In [None]:
is_holdout = False
reset_seeds(SEED)
score_list = []

for i, (tri, vai) in enumerate(cv.split(train_data)):
    # 학습 데이터
    train_dataset = ReviewDataset(train_data[tri], target[tri])
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # 검증 데이터
    valid_dataset = ReviewDataset(train_data[vai], target[vai])
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # 모델 객체 및 옵티마이저 생성
    model = Net(train_data.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    patience = 0 # 조기 종료 조건을 주기 위한 변수
    best_score = 0 # 현재 최고 점수 / mse, mae 등은 np.inf로 초기화
    for _ in tqdm(range(epochs)):
        train_loss = train_loop(train_dataloader, model, loss_function, optimizer, device)
        valid_loss, pred = test_loop(valid_dataloader, model, loss_function, device)
        pred = (pred > 0.5).astype(int)
        score = accuracy_score(target[vai], pred)
        print(train_loss, valid_loss, score)

        patience += 1
        if score > best_score:
            best_score = score
            patience = 0
            torch.save(model.state_dict(), f"../output/model{i}.pt")

        if patience == 5:
            break

    score_list.append(best_score)
    print(f"ACC 최고점수: {best_score}")

    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

0.6239883267879486 0.6003756981629592 0.6875
0.3689043334126472 0.5612044403186212 0.6975
0.25145461812615394 0.7075439095497131 0.7325
0.19466836571693422 0.9870089567624606 0.74
0.16309428118169308 0.967693376999635 0.7275
0.1537451395764947 1.2984054455390344 0.7375
0.14861789118498564 1.572257330784431 0.7275
0.1647557428199798 0.9915695763551272 0.725
0.14850941363722087 0.9529484510421753 0.7175
ACC 최고점수: 0.74


  0%|          | 0/100 [00:00<?, ?it/s]

0.6573500633239746 0.5771184792885413 0.74
0.41165226578712466 0.5348975956439972 0.7225
0.2681350499391556 0.6423366299042335 0.7425
0.1912845045886934 0.739616368825619 0.7375
0.18615186650305987 0.8642864227294922 0.71
0.16113962173461915 0.8885924907831045 0.7225
0.1757215578854084 0.675834433390544 0.695
0.16178344830870628 0.9730217754840851 0.6925
ACC 최고점수: 0.7425


  0%|          | 0/100 [00:00<?, ?it/s]

0.6347084254026413 0.5681947653110211 0.6925
0.3749066907167435 0.5037340063315171 0.7275
0.24978277653455735 0.6312504915090708 0.73
0.20687112510204314 0.8109894389143357 0.7525
0.19402924731373786 0.7719763723703531 0.7375
0.15858736258000136 0.9617924988269806 0.7425
0.14524240812286734 0.8858572336343619 0.7425
0.15219196252524853 0.8544722703786997 0.7575
0.12443026639521122 1.3097223845811992 0.725
0.11131450425833464 1.4488552121015696 0.7475
0.14589900450780988 0.9527226182130667 0.74
0.13654109623283148 1.3716921256138728 0.7275
0.14625778675079346 1.4477969408035278 0.7425
ACC 최고점수: 0.7575


  0%|          | 0/100 [00:00<?, ?it/s]

0.6361208021640777 0.5755125146645766 0.74
0.370450259745121 0.5210863007948949 0.7375
0.259934675693512 0.7073244658800272 0.72
0.2050063768029213 0.6604009981338794 0.7225
0.1592521343752742 0.7389627167811761 0.7375
0.1688633156940341 0.6619661312836868 0.7375
ACC 최고점수: 0.74


  0%|          | 0/100 [00:00<?, ?it/s]

0.6251080852746963 0.6045285830130944 0.7
0.36707259267568587 0.5388868130170382 0.7475
0.2720933347940445 0.6892650035711435 0.755
0.19936337649822236 0.9583192765712738 0.74
0.1767845781892538 0.8008638253578773 0.7325
0.14043993022292853 0.9181772883121784 0.7275
0.14664909522980452 0.9981749241168683 0.7425
0.13841013818979264 1.1914041454975421 0.7325
ACC 최고점수: 0.755


# 테스트 데이터 예측

In [195]:
test_dataset = ReviewDataset(test_data)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
pred_list = []

for i in range(n_splits):
    model = Net(test_data.shape[1]).to(device)
    state_dict = torch.load(f"../output/model{i}.pt", weights_only=True)
    model.load_state_dict(state_dict)

    _, pred = test_loop(test_dataloader, model, None, device)
    pred_list.append(pred)

In [197]:
pred = np.mean(pred_list, axis=0)
pred = (pred > 0.5).astype(int)
pred.shape

(1000, 1)