## Feature Extraction - English

추준호(20224224)

In [3]:
# Toy corpus: three short English documents reused throughout the notebook.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

## 1. Bag of Words

### (1) nltk & gensim

In [4]:
import nltk
import gensim
import string

In [10]:
def tokenize(text):
    """Yield lowercased, Snowball-stemmed word tokens from ``text``.

    The text is lowercased and split with ``nltk.word_tokenize``. Tokens
    made up solely of punctuation characters are dropped; every remaining
    token is stemmed with NLTK's English Snowball stemmer.

    Args:
        text: The raw document string.

    Yields:
        str: One stemmed token at a time. This is a generator function, so
        wrap the call in ``list(...)`` to materialize the tokens.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    punctuation = set(string.punctuation)

    for token in nltk.word_tokenize(text.lower()):
        # Test character by character: ``token in string.punctuation`` is a
        # substring check, so multi-character punctuation tokens such as
        # "..." or "``" would not be filtered out.
        if all(char in punctuation for char in token):
            continue

        yield stemmer.stem(token)

In [11]:
# tokenize() is a generator function, so printing its return value shows the
# generator object itself rather than the tokens it would produce.
for doc in corpus:
    tokens = tokenize(doc)
    print(doc, tokens, sep="\n", end="\n\n")

The elephant sneezed at the sight of potatoes.
<generator object tokenize at 0x177ec0e40>

Bats can see via echolocation. See the bat sight sneeze!
<generator object tokenize at 0x177ec06d0>

Wondering, she opened the door to the studio.
<generator object tokenize at 0x177ec0e40>



In [12]:
# Materialize each generator so the stemmed tokens themselves are printed.
for doc in corpus:
    tokens = [*tokenize(doc)]
    print(f"{doc}\n{tokens}\n")

The elephant sneezed at the sight of potatoes.
['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']

Bats can see via echolocation. See the bat sight sneeze!
['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']

Wondering, she opened the door to the studio.
['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']



In [13]:
# Tokenize every document once and keep the results as concrete lists.
tokenized_corpus = [list(tokens) for tokens in map(tokenize, corpus)]

In [14]:
# Display the stemmed token lists, one inner list per document.
tokenized_corpus

[['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'],
 ['bat',
  'can',
  'see',
  'via',
  'echoloc',
  'see',
  'the',
  'bat',
  'sight',
  'sneez'],
 ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]

In [15]:
# Build a gensim Dictionary from the stemmed documents; it maps integer ids to tokens.
lexicon = gensim.corpora.Dictionary(tokenized_corpus)

In [16]:
# Print every (token_id, token) pair stored in the lexicon.
for token_id, token in lexicon.items():
    print((token_id, token))

(0, 'at')
(1, 'eleph')
(2, 'of')
(3, 'potato')
(4, 'sight')
(5, 'sneez')
(6, 'the')
(7, 'bat')
(8, 'can')
(9, 'echoloc')
(10, 'see')
(11, 'via')
(12, 'door')
(13, 'open')
(14, 'she')
(15, 'studio')
(16, 'to')
(17, 'wonder')


In [17]:
# Show each tokenized document next to its sparse bag-of-words form,
# a list of (token_id, count) pairs.
for doc in tokenized_corpus:
    vec = lexicon.doc2bow(doc)
    print(doc, vec, sep="\n", end="\n\n")

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)]

['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']
[(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)]

['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']
[(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]



### (2) scikit-learn

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# CountVectorizer with default settings; it is fit on the raw strings below, not on the stemmed tokens.
vectorizer = CountVectorizer()

In [20]:
# Learn the vocabulary from the raw corpus and build the document-term count matrix.
results = vectorizer.fit_transform(corpus)

In [21]:
# Densify with the documented .toarray() method. The `.A` shorthand is only an
# alias on sparse *matrix* objects and is not available on SciPy's sparse-array
# types, so .toarray() keeps this cell working for either return type.
print(results.toarray())

[[1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 2 0 0 0]
 [0 1 1 1 0 1 0 0 0 0 2 0 1 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 2 1 0 1]]


In [22]:
# Token -> column-index mapping learned by the vectorizer.
vectorizer.vocabulary_

{'the': 16,
 'elephant': 6,
 'sneezed': 14,
 'at': 0,
 'sight': 12,
 'of': 7,
 'potatoes': 9,
 'bats': 2,
 'can': 3,
 'see': 10,
 'via': 18,
 'echolocation': 5,
 'bat': 1,
 'sneeze': 13,
 'wondering': 19,
 'she': 11,
 'opened': 8,
 'door': 4,
 'to': 17,
 'studio': 15}

# Compare nltk & sklearn

In [24]:
# Vocabulary terms as learned by scikit-learn (unstemmed surface forms).
vectorizer.vocabulary_.keys()

dict_keys(['the', 'elephant', 'sneezed', 'at', 'sight', 'of', 'potatoes', 'bats', 'can', 'see', 'via', 'echolocation', 'bat', 'sneeze', 'wondering', 'she', 'opened', 'door', 'to', 'studio'])

In [25]:
# Vocabulary terms in the gensim lexicon (stemmed by tokenize()).
list(lexicon.values())

['at',
 'eleph',
 'of',
 'potato',
 'sight',
 'sneez',
 'the',
 'bat',
 'can',
 'echoloc',
 'see',
 'via',
 'door',
 'open',
 'she',
 'studio',
 'to',
 'wonder']

#### NLTK only

In [26]:
# Terms present in the gensim lexicon but absent from the scikit-learn vocabulary.
set(lexicon.values()).difference(vectorizer.vocabulary_)

{'echoloc', 'eleph', 'open', 'potato', 'sneez', 'wonder'}

#### SKLearn only

In [28]:
# Terms present in the scikit-learn vocabulary but absent from the gensim lexicon.
set(vectorizer.vocabulary_).difference(lexicon.values())

{'bats',
 'echolocation',
 'elephant',
 'opened',
 'potatoes',
 'sneeze',
 'sneezed',
 'wondering'}

## 2. One-Hot Encoding

### (1) Gensim

In [29]:
for doc in tokenized_corpus:
    vec = lexicon.doc2bow(doc)
    
    print(vec)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)]
[(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)]
[(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


In [32]:
for doc in tokenized_corpus:
    # Keep each token id but replace its count with 1 (presence only).
    vec = [(token_id, 1) for token_id, _count in lexicon.doc2bow(doc)]
    print(vec)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
[(6, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


### (2) Scikit-Learn

In [33]:
from sklearn.preprocessing import Binarizer

In [34]:
# Fresh default CountVectorizer for the one-hot example (rebinds `vectorizer`).
vectorizer = CountVectorizer()

In [35]:
# Raw term counts per document, fit on the raw corpus strings.
vectors = vectorizer.fit_transform(corpus)

In [36]:
# Binarizer with default settings; the output further down shows every non-zero count mapped to 1.
onehot = Binarizer()

In [37]:
# Rebind `vectors` to the binarized (0/1) version of the count matrix.
vectors = onehot.fit_transform(vectors)

In [38]:
# Densify with the documented .toarray() method. The `.A` shorthand is only an
# alias on sparse *matrix* objects and is not available on SciPy's sparse-array
# types, so .toarray() keeps this cell working for either return type.
vectors.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]])

# 3. TF-IDF (Term Frequency–Inverse Document Frequency)

$$
\begin{aligned}
    \mathrm{tf}(t, d) &= 1 + \log f_{t,d} \\
    \mathrm{idf}(t, D) &= \log \left( 1 + \frac{N}{n_t} \right) \\
    \text{tf-idf}(t, d, D) &= \mathrm{tf}(t, d) \cdot \mathrm{idf}(t, D)
\end{aligned}
$$

## (1) Gensim

In [40]:
# gensim TF-IDF model initialized from the lexicon; normalize=True is requested explicitly.
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)

In [41]:
# Print each document as (token_id, tf-idf weight) pairs.
for doc in tokenized_corpus:
    vec = tfidf[lexicon.doc2bow(doc)]
    print(vec)

[(0, 0.4837965208957426), (1, 0.4837965208957426), (2, 0.4837965208957426), (3, 0.4837965208957426), (4, 0.17855490118826325), (5, 0.17855490118826325)]
[(4, 0.10992597952954358), (5, 0.10992597952954358), (7, 0.5956913654963344), (8, 0.2978456827481672), (9, 0.2978456827481672), (10, 0.5956913654963344), (11, 0.2978456827481672)]
[(12, 0.408248290463863), (13, 0.408248290463863), (14, 0.408248290463863), (15, 0.408248290463863), (16, 0.408248290463863), (17, 0.408248290463863)]


In [42]:
# Same weights as above, with token ids translated back to their terms.
for doc in tokenized_corpus:
    weights = tfidf[lexicon.doc2bow(doc)]
    vec = [(tfidf.id2word[token_id], weight) for token_id, weight in weights]
    print(vec)

[('at', 0.4837965208957426), ('eleph', 0.4837965208957426), ('of', 0.4837965208957426), ('potato', 0.4837965208957426), ('sight', 0.17855490118826325), ('sneez', 0.17855490118826325)]
[('sight', 0.10992597952954358), ('sneez', 0.10992597952954358), ('bat', 0.5956913654963344), ('can', 0.2978456827481672), ('echoloc', 0.2978456827481672), ('see', 0.5956913654963344), ('via', 0.2978456827481672)]
[('door', 0.408248290463863), ('open', 0.408248290463863), ('she', 0.408248290463863), ('studio', 0.408248290463863), ('to', 0.408248290463863), ('wonder', 0.408248290463863)]


In [43]:
# Print each tokenized document followed by its (term, tf-idf weight) pairs.
for doc in tokenized_corpus:
    print(doc)
    vec = [
        (tfidf.id2word[token_id], weight)
        for token_id, weight in tfidf[lexicon.doc2bow(doc)]
    ]
    print(vec)

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
[('at', 0.4837965208957426), ('eleph', 0.4837965208957426), ('of', 0.4837965208957426), ('potato', 0.4837965208957426), ('sight', 0.17855490118826325), ('sneez', 0.17855490118826325)]
['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']
[('sight', 0.10992597952954358), ('sneez', 0.10992597952954358), ('bat', 0.5956913654963344), ('can', 0.2978456827481672), ('echoloc', 0.2978456827481672), ('see', 0.5956913654963344), ('via', 0.2978456827481672)]
['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']
[('door', 0.408248290463863), ('open', 0.408248290463863), ('she', 0.408248290463863), ('studio', 0.408248290463863), ('to', 0.408248290463863), ('wonder', 0.408248290463863)]


## (2) Scikit-Learn

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# NOTE(review): this rebinds `tfidf`, which held the gensim TfidfModel in the
# previous section; re-running those gensim cells after this one would index
# into the scikit-learn vectorizer instead. Consider a distinct name.
tfidf = TfidfVectorizer()

In [46]:
# Fit on the raw corpus and compute the TF-IDF matrix (rebinds `vectors` from the one-hot section).
vectors = tfidf.fit_transform(corpus)

In [47]:
# Densify with the documented .toarray() method. The `.A` shorthand is only an
# alias on sparse *matrix* objects and is not available on SciPy's sparse-array
# types, so .toarray() keeps this cell working for either return type.
vectors.toarray()

array([[0.37867627, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37867627, 0.37867627, 0.        , 0.37867627,
        0.        , 0.        , 0.28799306, 0.        , 0.37867627,
        0.        , 0.44730461, 0.        , 0.        , 0.        ],
       [0.        , 0.30251368, 0.30251368, 0.30251368, 0.        ,
        0.30251368, 0.        , 0.        , 0.        , 0.        ,
        0.60502736, 0.        , 0.23006945, 0.30251368, 0.        ,
        0.        , 0.17866945, 0.        , 0.30251368, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.36772387,
        0.        , 0.        , 0.        , 0.36772387, 0.        ,
        0.        , 0.36772387, 0.        , 0.        , 0.        ,
        0.36772387, 0.43436728, 0.36772387, 0.        , 0.36772387]])