## Feature Extraction - English

### 오태건 (20224071)

In [9]:
# Toy corpus: three short English documents that every section of this
# notebook vectorizes.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

## Bag of Words
### 1. nltk & gensim 

In [10]:
import nltk
import string

In [11]:
"""
    * 어근 뽑아내기 (Generator type function)
        1. 소문자 변환
        2. stemmer 인스턴스 생성
        3. 
         3_1) 문장부호는 불필요하기 때문에 pass(continue)
         3_2) stemmer.stem을 사용하여 어근 기준으로 파싱
    * @return 어근으로 쪼개진(=token) 목록을 반환
"""
def tokenize(text):
    """
    Lazily yield the Snowball stems of the word tokens in ``text``.

    Steps:
        1. Lower-case the input.
        2. Create an English Snowball stemmer.
        3. For each token produced by ``nltk.word_tokenize``:
            3_1) skip it when it consists solely of punctuation characters;
            3_2) otherwise reduce it to its stem with ``stemmer.stem``.

    This is a generator function; wrap the call in ``list(...)`` to
    materialize the result.

    @param text: the document to tokenize (a string).
    @return: generator of stem strings, in order of appearance.
    """
    text = text.lower()
    stemmer = nltk.stem.SnowballStemmer('english')

    for token in nltk.word_tokenize(text):
        # Check every character instead of `token in string.punctuation`:
        # that expression is a substring test against one long string, so
        # multi-character tokens such as "...", "--", "``" or "''" are not
        # recognised as punctuation and would be stemmed as words.
        if all(char in string.punctuation for char in token):
            continue

        yield stemmer.stem(token)


In [12]:
# Show every raw sentence followed by its stemmed tokens, with a blank
# line between documents.
for doc in corpus:
    tokens = [*tokenize(doc)]
    print(doc, tokens, sep="\n", end="\n\n")

The elephant sneezed at the sight of potatoes.
['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']

Bats can see via echolocation. See the bat sight sneeze!
['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']

Wondering, she opened the door to the studio.
['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']



In [13]:
# Materialize the tokenize() generator once per document so the token
# lists can be iterated repeatedly by the cells below.
tokenized_corpus = [[*tokenize(sentence)] for sentence in corpus]

In [14]:
# Bare expression: displays the list of per-document token lists.
tokenized_corpus

[['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'],
 ['bat',
  'can',
  'see',
  'via',
  'echoloc',
  'see',
  'the',
  'bat',
  'sight',
  'sneez'],
 ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]

In [16]:
import gensim

In [17]:
# Build the vocabulary: assigns an integer id to every distinct token found
# in the tokenized documents.
lexicon = gensim.corpora.Dictionary(tokenized_corpus)

In [18]:
# Print the vocabulary as (id, token) pairs, one per line.
for token_id, token in lexicon.items():
    print((token_id, token))

(0, 'at')
(1, 'eleph')
(2, 'of')
(3, 'potato')
(4, 'sight')
(5, 'sneez')
(6, 'the')
(7, 'bat')
(8, 'can')
(9, 'echoloc')
(10, 'see')
(11, 'via')
(12, 'door')
(13, 'open')
(14, 'she')
(15, 'studio')
(16, 'to')
(17, 'wonder')


In [19]:
for doc in tokenized_corpus:
    # Document -> bag-of-words. The result is a sparse vector: instead of a
    # dense row with one slot per vocabulary entry, it lists only the
    # (token_id, count) pairs of tokens that occur, which saves memory.
    vec = lexicon.doc2bow(doc)

    print(doc, vec, sep="\n", end="\n\n")
          

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)]

['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']
[(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)]

['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']
[(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]



### 2. Scikit-Learn

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Default settings. The vectorizer is fit on the raw sentences below, so it
# applies its own tokenization rather than the stemmed tokens used above.
vectorizer = CountVectorizer()

In [28]:
# Learn the vocabulary from the corpus and build the document-term count
# matrix in a single step.
results = vectorizer.fit_transform(corpus)

In [29]:
# Densify for display. toarray() is the explicit, documented way to convert
# a SciPy sparse matrix; `.A` is a legacy shorthand for the same call.
results.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 1]],
      dtype=int64)

In [30]:
# (n_documents, vocabulary_size). The sparse matrix exposes its shape
# directly, so there is no need to build a dense copy just to read it.
results.shape

(3, 20)

## One-Hot Encoding

### 1. Gensim

In [31]:
for doc in tokenized_corpus:
    vec = lexicon.doc2bow(doc)
    print(vec)

    # One-hot: keep every token id but force its count to 1, so a token
    # that occurred twice goes from e.g. (6, 2) to (6, 1).
    vec = [(token_id, 1) for token_id, _count in vec]
    print(vec, end="\n\n")


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)]
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]

[(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)]
[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]

[(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]
[(6, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]



### 2. Scikit-Learn

In [32]:
from sklearn.preprocessing import Binarizer

In [33]:
# Start again from raw term counts; the next cell binarizes them.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

In [34]:
# Binarize the counts: in the output below every non-zero count (including
# the 2s) becomes 1 and zeros stay 0.
# NOTE: this rebinds `vectors`, so the raw counts are no longer reachable
# under that name after this cell runs.
onehot = Binarizer()
vectors = onehot.fit_transform(vectors)

In [36]:
# Densify for display. toarray() is the explicit, documented way to convert
# a SciPy sparse matrix; `.A` is a legacy shorthand for the same call.
vectors.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]],
      dtype=int64)

## Tf-Idf

### 1. Gensim


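$$
\begin{aligned}
    \mathrm{tf}(t, d) &= 1 + \log f_{t,d} \\
    \mathrm{idf}(t, D) &= \log \left( 1 + \frac{N}{n_t} \right) \\
    \text{tf-idf}(t, d, D) &= \mathrm{tf}(t, d) \cdot \mathrm{idf}(t, D)
\end{aligned}
$$

Here $f_{t,d}$ is the number of times term $t$ occurs in document $d$, $N$ is the number of documents, and $n_t$ is the number of documents containing $t$. Note that gensim's `TfidfModel` defaults differ from this textbook form: it uses the raw count $f_{t,d}$ and $\log_2 (N / n_t)$, so a term that appears in every document (here `the`) gets weight 0 and is absent from the output below.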

In [45]:
# Build the tf-idf model from the statistics already collected in `lexicon`
# (no second pass over the corpus). With normalize=True each transformed
# vector is scaled to unit length, as the outputs below show.
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)

In [46]:
# Bare expression: shows only the model object's repr, not its weights.
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x24126fedb90>

In [48]:
for doc in tokenized_corpus:
    # Two-step conversion: token list -> bag-of-words counts -> tf-idf
    # weights (indexing the model with a bow vector applies the transform).
    vec = tfidf[lexicon.doc2bow(doc)]

    print(doc, vec, sep="\n", end="\n\n")

['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato']
[(0, 0.4837965208957426), (1, 0.4837965208957426), (2, 0.4837965208957426), (3, 0.4837965208957426), (4, 0.17855490118826325), (5, 0.17855490118826325)]

['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez']
[(4, 0.10992597952954358), (5, 0.10992597952954358), (7, 0.5956913654963344), (8, 0.2978456827481672), (9, 0.2978456827481672), (10, 0.5956913654963344), (11, 0.2978456827481672)]

['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']
[(12, 0.408248290463863), (13, 0.408248290463863), (14, 0.408248290463863), (15, 0.408248290463863), (16, 0.408248290463863), (17, 0.408248290463863)]



### 2. Scikit-Learn

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# NOTE: rebinds `tfidf`, which the Gensim section uses for its TfidfModel;
# re-run that section's model cell before re-running its loop.
tfidf = TfidfVectorizer()

In [43]:
# Fit on the raw sentences and return the tf-idf weighted document-term
# matrix. This rebinds `vectors` from the one-hot section.
vectors = tfidf.fit_transform(corpus)

In [44]:
# Densify for display. toarray() is the explicit, documented way to convert
# a SciPy sparse matrix; `.A` is a legacy shorthand for the same call.
vectors.toarray()

array([[0.37867627, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37867627, 0.37867627, 0.        , 0.37867627,
        0.        , 0.        , 0.28799306, 0.        , 0.37867627,
        0.        , 0.44730461, 0.        , 0.        , 0.        ],
       [0.        , 0.30251368, 0.30251368, 0.30251368, 0.        ,
        0.30251368, 0.        , 0.        , 0.        , 0.        ,
        0.60502736, 0.        , 0.23006945, 0.30251368, 0.        ,
        0.        , 0.17866945, 0.        , 0.30251368, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.36772387,
        0.        , 0.        , 0.        , 0.36772387, 0.        ,
        0.        , 0.36772387, 0.        , 0.        , 0.        ,
        0.36772387, 0.43436728, 0.36772387, 0.        , 0.36772387]])