- 한글 영화 리뷰 데이터셋 -> 긍/부정 분류기, 감정 분석

In [2]:
import numpy as np
import pandas as pd

In [4]:
# Load the tab-separated train / test review tables.
text_train = pd.read_csv("./data/ratings_train.txt", sep="\t")
text_test = pd.read_csv("./data/ratings_test.txt", sep="\t")

In [5]:
text_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [6]:
text_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [9]:
# The 'document' column contains missing values.
# Display the training rows whose review text is null.
text_train.loc[text_train['document'].isna()]

Unnamed: 0,id,document,label
25857,2172111,,1
55737,6369843,,1
110014,1034280,,0
126782,5942978,,0
140721,1034283,,0


In [10]:
# The 'document' column contains missing values.
# Display the test rows whose review text is null.
text_test.loc[text_test['document'].isna()]

Unnamed: 0,id,document,label
5746,402110,,1
7899,5026896,,0
27097,511097,,1


In [12]:
# Remove rows with missing values, in place (a row is dropped if any column is NaN).
text_train.dropna(how="any", inplace=True)
text_test.dropna(how="any", inplace=True)

In [13]:
text_test.info(), text_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49997 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49997 non-null  int64 
 1   document  49997 non-null  object
 2   label     49997 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 149995 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


(None, None)

- 토큰화 다뤄보기
- 형태소 분석시 : konlpy
- 토큰화: TF-IDF

#### TF-IDF
- 개별 문서에서 자주 등장하는 단어는 가중치를 높게 주되(TF), 모든 문서에 자주 등장하는 단어에는 패널티를 주는(IDF) 방식으로 가중치를 연산
  - 단어의 중요도 라고 하는 수치값으로 벡터화

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF 객체
from konlpy.tag import Okt

In [16]:
# Use a small sample corpus to get a feel for how TF-IDF works.
# Toy data created as a preview before vectorizing the real dataset.
sample_text = ["나는 오늘 점심에 삼계탕을 먹었다 왜", 
               "너 저녁에 운동갈꺼야?", 
               "나는 오늘 저녁에 운동하러 갈꺼야", 
               "좋겠다. 나는 저녁에 야간 자율 학습 하러 갈건데..."]

In [17]:
# Tfidf 객체 초기화
sample_tfidf = TfidfVectorizer()

In [18]:
sample_tfidf.fit(sample_text)

In [19]:
sample_tfidf.vocabulary_

{'나는': 2,
 '오늘': 6,
 '점심에': 11,
 '삼계탕을': 4,
 '먹었다': 3,
 '저녁에': 10,
 '운동갈꺼야': 7,
 '운동하러': 8,
 '갈꺼야': 1,
 '좋겠다': 12,
 '야간': 5,
 '자율': 9,
 '학습': 14,
 '하러': 13,
 '갈건데': 0}

In [20]:
okt = Okt()

In [21]:
okt.morphs(sample_text[0])

['나', '는', '오늘', '점심', '에', '삼계탕', '을', '먹었다', '왜']

In [23]:
okt.nouns(sample_text[0])  # 명사만 추출

['나', '오늘', '점심', '삼계탕', '왜']

In [None]:
# BOW, TFIDF 는 기본적으로 공백·구두점을 경계로 토큰화하며, 기본 token_pattern 때문에 2글자 이상인 토큰만 남음! (위 vocabulary_ 에 '왜', '너' 가 없는 이유)
# 토큰의 단위를 형태소 분석기를 응용해서 설정

In [24]:
# Custom tokenizer for the vectorizer (user-defined function)
def my_tokenizer(text):
    """Return the nouns that the module-level ``okt`` analyzer extracts from *text*."""
    nouns = okt.nouns(text)
    return nouns

In [28]:
tfidf_okt = TfidfVectorizer(tokenizer=my_tokenizer, token_pattern=None)

In [29]:
tfidf_okt.fit(sample_text)

In [42]:
sample_text

['나는 오늘 점심에 삼계탕을 먹었다 왜',
 '너 저녁에 운동갈꺼야?',
 '나는 오늘 저녁에 운동하러 갈꺼야',
 '좋겠다. 나는 저녁에 야간 자율 학습 하러 갈건데...']

In [40]:
# Build a one-row frame holding the vocabulary terms, ordered by their feature
# index (the columns end up labelled with those indices).
df = pd.DataFrame( [tfidf_okt.vocabulary_.keys(), tfidf_okt.vocabulary_.values()]).T
df = df.sort_values(by = 1).set_index(1).T

# Stack the term row on top of the dense TF-IDF matrix of sample_text so each
# weight column can be read against its term.
pd.concat([df, pd.DataFrame(tfidf_okt.transform(sample_text).toarray())])
# A higher value means the term is rarer across the documents, i.e. more important.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,나,너,삼계탕,야간,오늘,왜,운동,자율,저녁,점심,학습
0,0.317993,0.0,0.498197,0.0,0.392784,0.498197,0.0,0.0,0.0,0.498197,0.0
1,0.0,0.702035,0.0,0.0,0.0,0.0,0.553492,0.0,0.4481,0.0,0.0
2,0.444931,0.0,0.0,0.0,0.549578,0.0,0.549578,0.0,0.444931,0.0,0.0
3,0.326798,0.0,0.0,0.511992,0.0,0.0,0.0,0.511992,0.326798,0.0,0.511992


- 실제 데이터에 적용

In [43]:
text_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149995 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


In [45]:
# Split each table into features (review text) and targets (label).
X_train, y_train = text_train['document'], text_train['label']
X_test, y_test = text_test['document'], text_test['label']

In [46]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((149995,), (49997,), (149995,), (49997,))

- tfidf + okt 활용해서 토큰화
- LogisticRegression 모델 사용해서 모델링 분석 수행
- pipeline 기능 연결

In [47]:
from sklearn.pipeline import make_pipeline #기능 연결 도구
from sklearn.linear_model import LogisticRegression

In [50]:
# Chain TF-IDF vectorization (tokens produced by my_tokenizer) with a
# logistic-regression classifier so both steps are fitted together.
pipe_model = make_pipeline(
    TfidfVectorizer(
        tokenizer=my_tokenizer,  # user-defined tokenizer
        token_pattern=None,      # drop the default token pattern; the tokenizer is supplied instead
        ngram_range=(1, 2),      # build the vocabulary from 1-grams and 2-grams
    ),
    LogisticRegression(max_iter=10000),  # maximum number of solver iterations
)

In [56]:
# The full dataset is too large, so keep only the first 10,000 rows of each split.
X_train, y_train = X_train[:10000], y_train[:10000]
X_test, y_test = X_test[:10000], y_test[:10000]

In [55]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10000,), (10000,))

In [None]:
pipe_model.fit(X_train, y_train)