# Simple Methods for Text Vectorization
* One-Hot Encoding
* CountVectorizer
* TF-IDF Vectorizer

In [8]:
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

!pip show scikit-learn

Name: scikit-learn
Version: 1.0.1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /home/tslab/anaconda3/envs/comet2020/lib/python3.8/site-packages
Requires: threadpoolctl, numpy, scipy, joblib
Required-by: seqeval, sentence-transformers, flair, allennlp


In [9]:
# Example corpus from sklearn: four short documents used to fit the vectorizers.
train_corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Documents that are only transformed, never fitted on; they contain words
# ("fourth", "new") that do not occur anywhere in train_corpus.
test_corpus = [
    "This is the fourth document.",
    "This document is the one.",
    "This document is new.",
]

In [10]:
# Count Vectorizer (unigram): each column is one vocabulary word and each
# cell holds how many times that word occurs in the document.
cv = CountVectorizer()

# The tokenizer on its own only splits a string into word tokens; the printed
# tokens keep their original casing (e.g. 'This').
cv_tokenizer = cv.build_tokenizer()

# Learn the vocabulary from the training documents only, then encode the test
# documents against that same vocabulary. Test words that were never seen
# during fitting (e.g. 'fourth', 'new') have no column and are dropped.
train_cv = cv.fit_transform(train_corpus)
test_cv = cv.transform(test_corpus)

print("Unigram")
print(cv_tokenizer(train_corpus[0]))
print("Train CV", train_cv.toarray())
print("Vocab", cv.get_feature_names_out())
print("Test CV", test_cv.toarray())


Unigram
['This', 'is', 'the', 'first', 'document']
Train CV [[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocab ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Test CV [[0 1 0 1 0 0 1 0 1]
 [0 1 0 1 1 0 1 0 1]
 [0 1 0 1 0 0 0 0 1]]


In [11]:
# One-hot style (binary) encoding using CountVectorizer.
# binary=True reports every non-zero count as 1, so a row marks which
# vocabulary words are present in the document instead of how often they occur.
cv = CountVectorizer(binary=True)

# Fit on the training documents, then reuse the learned vocabulary for the
# test documents.
train_cv = cv.fit_transform(train_corpus)
test_cv = cv.transform(test_corpus)

print("Train CV", train_cv.toarray())
print("Vocab", cv.get_feature_names_out())
print("Test CV", test_cv.toarray())

Train CV [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocab ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Test CV [[0 1 0 1 0 0 1 0 1]
 [0 1 0 1 1 0 1 0 1]
 [0 1 0 1 0 0 0 0 1]]


In [12]:
# Bigram counts: ngram_range=(2, 2) restricts the features to pairs of
# consecutive words, so the vocabulary holds entries such as 'this is'.
cv = CountVectorizer(ngram_range=(2, 2))

# Fit on the training documents, then encode the test documents with the same
# bigram vocabulary; bigrams unseen during fitting have no column.
train_cv = cv.fit_transform(train_corpus)
test_cv = cv.transform(test_corpus)

print("Train CV", train_cv.toarray())
print("Vocab", cv.get_feature_names_out())
print("Test CV", test_cv.toarray())

Train CV [[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]
Vocab ['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']
Test CV [[0 0 0 1 0 0 0 0 0 0 0 1 0]
 [0 1 0 1 0 0 0 0 0 0 1 0 0]
 [0 1 0 0 0 0 0 0 0 0 1 0 0]]


In [13]:
# TF-IDF Vectorizer (unigram, default settings): same vocabulary layout as the
# count vectorizer, but each cell is a float TF-IDF weight instead of a raw count.
tfidf_vec = TfidfVectorizer()

# Fit on the training documents, then transform the test documents with the
# fitted vectorizer so both matrices share the same columns.
train_tfidf = tfidf_vec.fit_transform(train_corpus)
test_tfidf = tfidf_vec.transform(test_corpus)

print("Train TFIDF", train_tfidf.toarray())
print("Vocab", tfidf_vec.get_feature_names_out())
print("Test TFIDF", test_tfidf.toarray())


Train TFIDF [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
Vocab ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Test TFIDF [[0.         0.57684669 0.         0.47160997 0.         0.
  0.47160997 0.         0.47160997]
 [0.         0.42796959 0.         0.34989318 0.67049706 0.
  0.34989318 0.         0.34989318]
 [0.         0.65416415 0.         0.53482206 0.         0.
  0.         0.         0.53482206]]


In [14]:
# Bigram TF-IDF: ngram_range=(2, 2) restricts the features to pairs of
# consecutive words, each weighted by TF-IDF rather than counted.
tfidf_vec = TfidfVectorizer(ngram_range=(2, 2))

# Fit on the training documents, then transform the test documents with the
# fitted vectorizer so both matrices share the same bigram columns.
train_tfidf = tfidf_vec.fit_transform(train_corpus)
test_tfidf = tfidf_vec.transform(test_corpus)

print("Train TFIDF", train_tfidf.toarray())
print("Vocab", tfidf_vec.get_feature_names_out())
print("Test TFIDF", test_tfidf.toarray())

Train TFIDF [[0.         0.         0.52303503 0.42344193 0.         0.
  0.52303503 0.         0.         0.         0.         0.52303503
  0.        ]
 [0.         0.47633035 0.         0.30403549 0.         0.47633035
  0.         0.47633035 0.         0.         0.47633035 0.
  0.        ]
 [0.49819711 0.         0.         0.31799276 0.         0.
  0.         0.         0.49819711 0.49819711 0.         0.39278432
  0.        ]
 [0.         0.         0.43779123 0.         0.55528266 0.
  0.43779123 0.         0.         0.         0.         0.
  0.55528266]]
Vocab ['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']
Test TFIDF [[0.         0.         0.         0.62922751 0.         0.
  0.         0.         0.         0.         0.         0.77722116
  0.        ]
 [0.         0.64450299 0.         0.41137791 0.         0.
  0.         0.         0.         0.  