In [1]:
corpus = [
    "I love NLP",
    "NLP is fun",
    "I love machine learning"
]

In [2]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [4]:
# 단어 목록 생성
words = set()

for sent in corpus:
    for word in sent.split():
        words.add(word)

word_list = sorted(list(words))

In [5]:
print(word_list)

['I', 'NLP', 'fun', 'is', 'learning', 'love', 'machine']


In [6]:
# 원 - 핫 인코더 생성
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(np.array(word_list).reshape(-1,1))

In [9]:
# 원-학 인코딩
for sent in corpus:
    encoded = encoder.transform(np.array(sent.split()).reshape(-1,1))
    print(f"\n문장: {sent}")
    print(encoded)


문장: I love NLP
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]]

문장: NLP is fun
[[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]]

문장: I love machine learning
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0.]]


In [11]:
encoded_sent = encoder.transform(np.array("I love fun NLP".split()).reshape(-1,1))
print(f"\n새로운 문장: {sent}")
print(encoded)


새로운 문장: I love machine learning
[[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0.]]


In [15]:
#원-핫 인코더 - 기본 값: 희소행렬
encoder2 = OneHotEncoder()
encoder2.fit(np.array(word_list).reshape(-1,1))

for sent in corpus:
    X_sparse = encoder2.transform(np.array(sent.split()).reshape(-1,1))

    print(f"\n문장: {sent}")
    print("원-핫 인코딩 행렬 형태:", type(X_sparse))
    print("희소행렬 크기:", X_sparse.shape)
    print(X_sparse)


문장: I love NLP
원-핫 인코딩 행렬 형태: <class 'scipy.sparse._csr.csr_matrix'>
희소행렬 크기: (3, 7)
  (0, 0)	1.0
  (1, 5)	1.0
  (2, 1)	1.0

문장: NLP is fun
원-핫 인코딩 행렬 형태: <class 'scipy.sparse._csr.csr_matrix'>
희소행렬 크기: (3, 7)
  (0, 1)	1.0
  (1, 3)	1.0
  (2, 2)	1.0

문장: I love machine learning
원-핫 인코딩 행렬 형태: <class 'scipy.sparse._csr.csr_matrix'>
희소행렬 크기: (4, 7)
  (0, 0)	1.0
  (1, 5)	1.0
  (2, 6)	1.0
  (3, 4)	1.0


In [23]:
corpus = [
    "I love NLP",
    "NLP is fun",
    "I love NLP, Ypu love NLP! "
]

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
#CountVector 객체 생성
vectorizer = CountVectorizer()

In [27]:
#CountVector 설정 및 수행
X_bow = vectorizer.fit_transform(corpus)

In [28]:
print("Bow Vocabulary:", vectorizer.vocabulary_)
print("Bow Representation:\n", X_bow.toarray())

Bow Vocabulary: {'love': 2, 'nlp': 3, 'is': 1, 'fun': 0, 'ypu': 4}
Bow Representation:
 [[0 0 1 1 0]
 [1 1 0 1 0]
 [0 0 2 2 1]]


In [30]:
#TF-IDF (tfidf Vector)
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)

In [32]:
print("TF-IDF Vocabulary:", tfidf.vocabulary_)
print("TF-IDF Representation:\n", X_tfidf.toarray())

TF-IDF Vocabulary: {'love': 2, 'nlp': 3, 'is': 1, 'fun': 0, 'ypu': 4}
TF-IDF Representation:
 [[0.         0.         0.78980693 0.61335554 0.        ]
 [0.65249088 0.65249088 0.         0.38537163 0.        ]
 [0.         0.         0.70094487 0.54434622 0.46082913]]
