In [1]:
# Build the vocabulary (dictionary)
from nltk import FreqDist
import numpy as np
import re

def buildDict(docs):
    """Tokenize documents and build a vocabulary with id lookup tables.

    Each document is lower-cased and split on runs of whitespace, commas
    and periods. Empty strings produced by a delimiter at the very start
    or end of a document are discarded.

    Args:
        docs: iterable of document strings. May be empty.

    Returns:
        A 4-tuple ``(doc_tokens, vocab, word_to_id, id_to_word)``:

        * doc_tokens: list of token lists, one per document, in input order.
        * vocab: list of ``(word, count)`` pairs as returned by
          ``FreqDist.most_common()``.
        * word_to_id: dict mapping each word to its position in ``vocab``.
        * id_to_word: dict mapping each position in ``vocab`` to its word.
    """
    # Compile once; the pattern does not depend on the document.
    delim = re.compile(r'[\s,.]+')

    doc_tokens = []     # python list
    for doc in docs:
        # re.split yields '' when the text starts or ends with a delimiter;
        # filtering drops both cases (the pattern is greedy, so no empty
        # token can appear in the middle).
        tokens = [tok for tok in delim.split(doc.lower()) if tok]
        doc_tokens.append(tokens)

    # Flatten in plain Python: np.hstack raises on an empty corpus and would
    # turn the tokens into NumPy string scalars for no benefit.
    all_tokens = [tok for tokens in doc_tokens for tok in tokens]
    vocab = FreqDist(all_tokens).most_common()

    word_to_id = {word: idx for idx, (word, _count) in enumerate(vocab)}
    id_to_word = {idx: word for word, idx in word_to_id.items()}
    return doc_tokens, vocab, word_to_id, id_to_word

# Toy corpus: four short documents.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. I am what I am',
    'I think therefore I am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Tokenize the corpus and build the vocabulary plus both lookup tables.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)

In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit a label encoder on the vocabulary words, then show the integer code
# assigned to each word followed by the encoder's full class list.
encoder = LabelEncoder()
vocab_words = [entry[0] for entry in vocab]
labels = encoder.fit_transform(vocab_words)
for code in labels:
    print('[{:2d} : {}]'.format(code, encoder.classes_[code]))
print(encoder.classes_)

[ 3 : do]
[ 1 : be]
[12 : to]
[ 4 : i]
[ 0 : am]
[ 2 : da]
[ 5 : is]
[ 7 : let]
[ 6 : it]
[ 9 : or]
[ 8 : not]
[13 : what]
[11 : think]
[10 : therefore]
['am' 'be' 'da' 'do' 'i' 'is' 'it' 'let' 'not' 'or' 'therefore' 'think'
 'to' 'what']


In [4]:
# Encode every document's tokens as integer label ids.
# The documents have different lengths, so the rows are stored in a 1-D
# object array that is filled explicitly: NumPy deprecated (and from 1.24
# rejects with ValueError) implicit creation of arrays from ragged rows.
encode_data = np.empty(len(doc_tokens), dtype=object)
for idx, doc_token in enumerate(doc_tokens):
    encode_data[idx] = encoder.transform(doc_token)

print(encode_data)
# Round-trip check: decode each id sequence back to its words.
for code in encode_data:
    print(encoder.inverse_transform(code))

[array([12,  3,  5, 12,  1, 12,  1,  5, 12,  3])
 array([12,  1,  9,  8, 12,  1,  4,  0, 13,  4,  0])
 array([ 4, 11, 10,  4,  0,  3,  1,  3,  1,  3])
 array([3, 3, 3, 2, 2, 2, 7, 6, 1, 7, 6, 1])]
['to' 'do' 'is' 'to' 'be' 'to' 'be' 'is' 'to' 'do']
['to' 'be' 'or' 'not' 'to' 'be' 'i' 'am' 'what' 'i' 'am']
['i' 'think' 'therefore' 'i' 'am' 'do' 'be' 'do' 'be' 'do']
['do' 'do' 'do' 'da' 'da' 'da' 'let' 'it' 'be' 'let' 'it' 'be']


  encode_data = np.array([encoder.transform(doc_token) for doc_token in doc_tokens])


In [7]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects a 2-D input, so turn the label ids into a single column.
labels = labels.reshape(-1, 1)

# Learn one category per label id and encode the label column itself.
oh_encoder = OneHotEncoder(categories='auto')
oh_labels = oh_encoder.fit_transform(labels)

print(labels)
# fit_transform's result is densified with toarray() for printing.
print(oh_labels.toarray())

[[ 3]
 [ 1]
 [12]
 [ 4]
 [ 0]
 [ 2]
 [ 5]
 [ 7]
 [ 6]
 [ 9]
 [ 8]
 [13]
 [11]
 [10]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


In [10]:
# One-hot encode each document: one row per token, one column per label id.
oh_vectors = [
    oh_encoder.transform(codes.reshape(-1, 1)).toarray()
    for codes in encode_data
]

# Print each document's words followed by its one-hot matrix.
for codes, oh_matrix in zip(encode_data, oh_vectors):
    print(encoder.inverse_transform(codes))
    print(oh_matrix)

['to' 'do' 'is' 'to' 'be' 'to' 'be' 'is' 'to' 'do']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
['to' 'be' 'or' 'not' 'to' 'be' 'i' 'am' 'what' 'i' 'am']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Same toy corpus, with the pronoun spelled 'II' instead of 'I'.
# NOTE(review): presumably because CountVectorizer's default token pattern
# ignores single-character tokens - confirm against the scikit-learn docs.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. II am what II am',
    'II think therefore II am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Learn the vocabulary and build the document-term count matrix in one pass.
cnt_vectr = CountVectorizer()
vectors = cnt_vectr.fit_transform(docs)

# Mapping from each term to its column index in the count matrix.
print(cnt_vectr.vocabulary_)

{'to': 12, 'do': 3, 'is': 5, 'be': 1, 'or': 9, 'not': 8, 'ii': 4, 'am': 0, 'what': 13, 'think': 11, 'therefore': 10, 'da': 2, 'let': 7, 'it': 6}


In [16]:
# List the learned vocabulary terms in feature (column) order.
print(cnt_vectr.get_feature_names_out())

['am' 'be' 'da' 'do' 'ii' 'is' 'it' 'let' 'not' 'or' 'therefore' 'think'
 'to' 'what']


In [17]:
# Dense view of the document-term count matrix (one row per document).
print(vectors.toarray())

[[0 2 0 2 0 2 0 0 0 0 0 0 4 0]
 [2 2 0 0 2 0 0 0 1 1 0 0 2 1]
 [1 2 0 3 2 0 0 0 0 0 1 1 0 0]
 [0 2 3 3 0 0 2 2 0 0 0 0 0 0]]


In [19]:
# Show the count matrix as a table whose columns are the vocabulary terms.
count_table = pd.DataFrame(
    data=vectors.toarray(),
    columns=cnt_vectr.get_feature_names_out(),
)
print(count_table)

   am  be  da  do  ii  is  it  let  not  or  therefore  think  to  what
0   0   2   0   2   0   2   0    0    0   0          0      0   4     0
1   2   2   0   0   2   0   0    0    1   1          0      0   2     1
2   1   2   0   3   2   0   0    0    0   0          1      1   0     0
3   0   2   3   3   0   0   2    2    0   0          0      0   0     0


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF with the vectorizer's default settings.
tfidf = TfidfVectorizer()
tfidf.fit(docs)

# Dense document-term matrix of TF-IDF weights.
doc_term = tfidf.transform(docs)
dtm = doc_term.toarray()

df = pd.DataFrame(data=dtm, columns=tfidf.get_feature_names_out())
print(df)

# (term, column index) pairs in alphabetical order.
vocab_pairs = list(tfidf.vocabulary_.items())
vocab_pairs.sort()
print(vocab_pairs)

         am        be        da        do        ii        is        it  \
0  0.000000  0.255666  0.000000  0.312717  0.000000  0.489931  0.000000   
1  0.464005  0.307120  0.000000  0.000000  0.464005  0.000000  0.000000   
2  0.251031  0.332310  0.000000  0.609695  0.502063  0.000000  0.000000   
3  0.000000  0.223758  0.643179  0.410533  0.000000  0.000000  0.428786   

        let       not        or  therefore     think        to      what  
0  0.000000  0.000000  0.000000   0.000000  0.000000  0.772535  0.000000  
1  0.000000  0.294266  0.294266   0.000000  0.000000  0.464005  0.294266  
2  0.000000  0.000000  0.000000   0.318401  0.318401  0.000000  0.000000  
3  0.428786  0.000000  0.000000   0.000000  0.000000  0.000000  0.000000  
[('am', 0), ('be', 1), ('da', 2), ('do', 3), ('ii', 4), ('is', 5), ('it', 6), ('let', 7), ('not', 8), ('or', 9), ('therefore', 10), ('think', 11), ('to', 12), ('what', 13)]


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF with L1 normalization: each document row is scaled so that its
# weights sum to 1.
tfidf = TfidfVectorizer(norm='l1')
tfidf.fit(docs)

# Dense document-term matrix of TF-IDF weights.
doc_term = tfidf.transform(docs)
dtm = doc_term.toarray()
print(dtm)

df = pd.DataFrame(data=dtm, columns=tfidf.get_feature_names_out())
print(df)

# (term, column index) pairs in alphabetical order.
vocab_pairs = list(tfidf.vocabulary_.items())
vocab_pairs.sort()
print(vocab_pairs)

[[0.         0.13964363 0.         0.17080421 0.         0.2675978
  0.         0.         0.         0.         0.         0.
  0.42195436 0.        ]
 [0.1797123  0.11894973 0.         0.         0.1797123  0.
  0.         0.         0.11397113 0.11397113 0.         0.
  0.1797123  0.11397113]
 [0.10765092 0.14250608 0.         0.26145809 0.21530184 0.
  0.         0.         0.         0.         0.13654154 0.13654154
  0.         0.        ]
 [0.         0.10480277 0.30124885 0.19228324 0.         0.
  0.20083257 0.20083257 0.         0.         0.         0.
  0.         0.        ]]
         am        be        da        do        ii        is        it  \
0  0.000000  0.139644  0.000000  0.170804  0.000000  0.267598  0.000000   
1  0.179712  0.118950  0.000000  0.000000  0.179712  0.000000  0.000000   
2  0.107651  0.142506  0.000000  0.261458  0.215302  0.000000  0.000000   
3  0.000000  0.104803  0.301249  0.192283  0.000000  0.000000  0.200833   

        let       not       

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF with L2 normalization: each document row is scaled to unit
# Euclidean length.
tfidf = TfidfVectorizer(norm='l2')
tfidf.fit(docs)

# Dense document-term matrix of TF-IDF weights.
doc_term = tfidf.transform(docs)
dtm = doc_term.toarray()
print(dtm)

df = pd.DataFrame(data=dtm, columns=tfidf.get_feature_names_out())
print(df)

# (term, column index) pairs in alphabetical order.
vocab_pairs = list(tfidf.vocabulary_.items())
vocab_pairs.sort()
print(vocab_pairs)

[[0.         0.25566647 0.         0.3127168  0.         0.48993129
  0.         0.         0.         0.         0.         0.
  0.77253491 0.        ]
 [0.4640046  0.30711989 0.         0.         0.4640046  0.
  0.         0.         0.2942655  0.2942655  0.         0.
  0.4640046  0.2942655 ]
 [0.25103134 0.33231014 0.         0.6096945  0.50206267 0.
  0.         0.         0.         0.         0.31840142 0.31840142
  0.         0.        ]
 [0.         0.22375843 0.6431793  0.41053301 0.         0.
  0.4287862  0.4287862  0.         0.         0.         0.
  0.         0.        ]]
         am        be        da        do        ii        is        it  \
0  0.000000  0.255666  0.000000  0.312717  0.000000  0.489931  0.000000   
1  0.464005  0.307120  0.000000  0.000000  0.464005  0.000000  0.000000   
2  0.251031  0.332310  0.000000  0.609695  0.502063  0.000000  0.000000   
3  0.000000  0.223758  0.643179  0.410533  0.000000  0.000000  0.428786   

        let       not      