<a href="https://colab.research.google.com/github/jihun0423/NLP/blob/main/word_encoding_BOW_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 010. Vectorization of Statement (문장의 vector 화)

- BOW (Bag of Words)
- TF-IDF (Term Frequency - Inverse Document Frequency)  
- Word Embedding - Keras word API 사용

In [1]:
import sklearn
sklearn.__version__

'1.2.2'

In [105]:
import pandas as pd

sentences = ['I love my dog.',
             'I love my cat.',
             'I love my dog and love my cat',
             'You love my dog!',
             'Do you think my dog is amazing?',
             'I do not love my dog',
             'I hate my dog',
             'I do not hate my dog']

In [106]:
labels = [1,1,1,1,1,0,0,0]

## 1. Bag of Word (BOW)

- CountVectorizer
    - min_df : vocabulary 에 포함할 최소 발생 빈도
    - ngram_range : (1, 1) - unigram only, (1, 2) - unigram + bigram
    - max_features : top max_features 만으로 vocabulary 구성
    - token_pattern = (?u)\\b\\w\\w+\\b : unocode 영수자 2 글자 이상만 포함

In [107]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(min_df=1,ngram_range = (1,1), token_pattern=r'\b\w+\b')
count_vectorizer

## Text vs token Matrix 생성

In [108]:
features = count_vectorizer.fit_transform(sentences)
features

<8x13 sparse matrix of type '<class 'numpy.int64'>'
	with 41 stored elements in Compressed Sparse Row format>

In [109]:
print(f"document 수: {features.shape[0]}")
print(f"단어수: {features.shape[1]}")

document 수: 8
단어수: 13


In [110]:
features.shape

(8, 13)

In [111]:
vectorized_sentences = features.toarray()
vectorized_sentences

array([[0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1],
       [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1],
       [0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0]])

### features 의 단어 list

In [112]:
feature_names = count_vectorizer.get_feature_names_out()
feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'hate', 'i', 'is', 'love',
       'my', 'not', 'think', 'you'], dtype=object)

In [113]:
df = pd.DataFrame(vectorized_sentences, columns=feature_names)
df.index.name = 'sentence'
df

Unnamed: 0_level_0,amazing,and,cat,do,dog,hate,i,is,love,my,not,think,you
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,0,0,0,1,0,1,0,1,1,0,0,0
1,0,0,1,0,0,0,1,0,1,1,0,0,0
2,0,1,1,0,1,0,1,0,2,2,0,0,0
3,0,0,0,0,1,0,0,0,1,1,0,0,1
4,1,0,0,1,1,0,0,1,0,1,0,1,1
5,0,0,0,1,1,0,1,0,1,1,1,0,0
6,0,0,0,0,1,1,1,0,0,1,0,0,0
7,0,0,0,1,1,1,1,0,0,1,1,0,0


## 2. TF-IDF

- TF-IDF(Term Frequency - Inverse Document Frequency)

In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b')
tfidf_vectorizer

In [121]:
tfidf_sentences = tfidf_vectorizer.fit_transform(sentences)
tfidf_sentences

<8x13 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>

In [122]:
print(tfidf_sentences) # (문장번호, 단어 인덱스)     tfidf값

  (0, 4)	0.4645123584875426
  (0, 9)	0.41556576157444736
  (0, 8)	0.5840631780172822
  (0, 6)	0.5200034333576489
  (1, 2)	0.7016732402691847
  (1, 9)	0.33435105858190867
  (1, 8)	0.4699187466959015
  (1, 6)	0.4183783037145368
  (2, 1)	0.49694315373153203
  (2, 2)	0.41647714664614083
  (2, 4)	0.22182805836664693
  (2, 9)	0.39690718375661393
  (2, 8)	0.5578391979273964
  (2, 6)	0.24832784286149595
  (3, 12)	0.7144196161297492
  (3, 4)	0.3805210383842703
  (3, 9)	0.34042477497506585
  (3, 8)	0.47845514316302845
  (4, 0)	0.46705921344010853
  (4, 7)	0.46705921344010853
  (4, 11)	0.46705921344010853
  (4, 3)	0.3377737618932283
  (4, 12)	0.39143207239638167
  (4, 4)	0.20848831034634807
  (4, 9)	0.18651947980577446
  (5, 10)	0.5717164733890293
  (5, 3)	0.49334440780669986
  (5, 4)	0.3045131197459675
  (5, 9)	0.2724259628498939
  (5, 8)	0.38288518532829685
  (5, 6)	0.3408905379523903
  (6, 5)	0.7320017965151856
  (6, 4)	0.38988582818885303
  (6, 9)	0.3488027781347609
  (6, 6)	0.436461948904490

## Text vs tf-idf Matrix 생성

In [123]:
tfidf_vect_sentences = tfidf_sentences.toarray()
tfidf_vect_sentences

array([[0.        , 0.        , 0.        , 0.        , 0.46451236,
        0.        , 0.52000343, 0.        , 0.58406318, 0.41556576,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.70167324, 0.        , 0.        ,
        0.        , 0.4183783 , 0.        , 0.46991875, 0.33435106,
        0.        , 0.        , 0.        ],
       [0.        , 0.49694315, 0.41647715, 0.        , 0.22182806,
        0.        , 0.24832784, 0.        , 0.5578392 , 0.39690718,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.38052104,
        0.        , 0.        , 0.        , 0.47845514, 0.34042477,
        0.        , 0.        , 0.71441962],
       [0.46705921, 0.        , 0.        , 0.33777376, 0.20848831,
        0.        , 0.        , 0.46705921, 0.        , 0.18651948,
        0.        , 0.46705921, 0.39143207],
       [0.        , 0.        , 0.        , 0.49334441, 0.30451312,
        0.        , 0.34089

In [124]:
tfidf_feature_names =count_vectorizer.get_feature_names_out()
tfidf_feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'hate', 'i', 'is', 'love',
       'my', 'not', 'think', 'you'], dtype=object)

In [125]:
df = pd.DataFrame(tfidf_vect_sentences, columns=tfidf_feature_names)
df

Unnamed: 0,amazing,and,cat,do,dog,hate,i,is,love,my,not,think,you
0,0.0,0.0,0.0,0.0,0.464512,0.0,0.520003,0.0,0.584063,0.415566,0.0,0.0,0.0
1,0.0,0.0,0.701673,0.0,0.0,0.0,0.418378,0.0,0.469919,0.334351,0.0,0.0,0.0
2,0.0,0.496943,0.416477,0.0,0.221828,0.0,0.248328,0.0,0.557839,0.396907,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.380521,0.0,0.0,0.0,0.478455,0.340425,0.0,0.0,0.71442
4,0.467059,0.0,0.0,0.337774,0.208488,0.0,0.0,0.467059,0.0,0.186519,0.0,0.467059,0.391432
5,0.0,0.0,0.0,0.493344,0.304513,0.0,0.340891,0.0,0.382885,0.272426,0.571716,0.0,0.0
6,0.0,0.0,0.0,0.0,0.389886,0.732002,0.436462,0.0,0.0,0.348803,0.0,0.0,0.0
7,0.0,0.0,0.0,0.454111,0.280296,0.52625,0.313781,0.0,0.0,0.250761,0.52625,0.0,0.0


## 재미로 구현해본 머신러닝 감정 분류 모델 (긍정은 1, 부정은 0)

In [None]:
test = ['I hate my cat',
        'I do not hate my cat',
        'I love my cat',
        'I think I love my cat',
        'I think I hate my cat']

In [None]:
tfidf_test = tfidf_vectorizer.transform(test)

In [None]:
tfidf_vect_test = tfidf_test.toarray()
tfidf_vect_test

array([[0.        , 0.        , 0.62226163, 0.        , 0.        ,
        0.62226163, 0.3710285 , 0.        , 0.        , 0.296511  ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.48072432, 0.41482565, 0.        ,
        0.48072432, 0.28663574, 0.        , 0.        , 0.22906771,
        0.48072432, 0.        , 0.        ],
       [0.        , 0.        , 0.70167324, 0.        , 0.        ,
        0.        , 0.4183783 , 0.        , 0.46991875, 0.33435106,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.47028725, 0.        , 0.        ,
        0.        , 0.5608251 , 0.        , 0.31495685, 0.22409439,
        0.        , 0.56114971, 0.        ],
       [0.        , 0.        , 0.44398893, 0.        , 0.        ,
        0.44398893, 0.52946393, 0.        , 0.        , 0.2115631 ,
        0.        , 0.52977039, 0.        ]])

In [None]:
test_df = pd.DataFrame(tfidf_vect_test, columns=tfidf_feature_names)
test_df

Unnamed: 0,amazing,and,cat,do,dog,hate,i,is,love,my,not,think,you
0,0.0,0.0,0.622262,0.0,0.0,0.622262,0.371028,0.0,0.0,0.296511,0.0,0.0,0.0
1,0.0,0.0,0.480724,0.414826,0.0,0.480724,0.286636,0.0,0.0,0.229068,0.480724,0.0,0.0
2,0.0,0.0,0.701673,0.0,0.0,0.0,0.418378,0.0,0.469919,0.334351,0.0,0.0,0.0
3,0.0,0.0,0.470287,0.0,0.0,0.0,0.560825,0.0,0.314957,0.224094,0.0,0.56115,0.0
4,0.0,0.0,0.443989,0.0,0.0,0.443989,0.529464,0.0,0.0,0.211563,0.0,0.52977,0.0


In [130]:
from sklearn.svm import SVC
model = SVC()
model.fit(df, labels)

In [137]:
model.predict(test_df)

array([1, 0, 1, 1, 1])

# 3. keras word encoding

- keras  API 이용

In [46]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

## Tokenize

In [47]:
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')    # 문장으로 부터 상위 100 개 단어로 vocabulary 작성

## Word Index Vocabulary 작성

In [51]:
sentences

['I love my dog.',
 'I love my cat.',
 'I love my dog and love my cat',
 'You love my dog!',
 'Do you think my dog is amazing?']

In [48]:
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'cat': 6,
 'you': 7,
 'and': 8,
 'do': 9,
 'think': 10,
 'is': 11,
 'amazing': 12}

## text 의 sentence 변환 및 padding

- texts_to_sequences: text list 내의 각 text 를 수열 (sequence of integers) 로 convert


    - 입력 : text (strings) list
    - 반환 : sequence list
    
- pad_sequences: 동일한 길이로 sequence 를 zero padding

In [49]:
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post', truncating='post')

In [50]:
print(sequences)
print()
print(padded)

[[5, 3, 2, 4], [5, 3, 2, 6], [5, 3, 2, 4, 8, 3, 2, 6], [7, 3, 2, 4], [9, 7, 10, 2, 4, 11, 12]]

[[ 5  3  2  4  0  0  0  0]
 [ 5  3  2  6  0  0  0  0]
 [ 5  3  2  4  8  3  2  6]
 [ 7  3  2  4  0  0  0  0]
 [ 9  7 10  2  4 11 12  0]]


In [60]:
tokenizer.texts_to_sequences(['I think she loves my dog'])

[[5, 10, 1, 1, 2, 4]]

In [None]:
[tokenizer.index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'cat',
 7: 'you',
 8: 'and',
 9: 'do',
 10: 'think',
 11: 'is',
 12: 'amazing'}

### sequenced sentence 를 word sentence 로 환원

In [61]:
for sequence in sequences:
    sent = []
    for idx in sequence:
        sent.append(tokenizer.index_word[idx])
    print(' '.join(sent))

i love my dog
i love my cat
i love my dog and love my cat
you love my dog
do you think my dog is amazing


In [63]:
tokenizer.index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'cat',
 7: 'you',
 8: 'and',
 9: 'do',
 10: 'think',
 11: 'is',
 12: 'amazing'}

In [62]:
for sequence in padded:
    sent = []
    for idx in sequence:
        sent.append(tokenizer.index_word[idx])
    print(' '.join(sent))


    # padding을 한 경우, tokenizer에 등록되어 있지 않은 공백 (0)이 있으므로 오류 발생

KeyError: ignored

### One-Hot-Encoding 표현

In [64]:
to_categorical(padded)

array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

    