# Count-based Representation Methods
* Bag-of-Words
    * Binary
    * Count
* TF-IDF

In [8]:
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

!pip show scikit-learn

Name: scikit-learn
Version: 1.0.1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /home/tslab/anaconda3/envs/comet2020/lib/python3.8/site-packages
Requires: threadpoolctl, numpy, scipy, joblib
Required-by: seqeval, sentence-transformers, flair, allennlp


In [9]:
# Toy corpus taken from the scikit-learn documentation.
# Every vectorizer in this notebook is fitted on these four sentences.
train_corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Sentences that are only transformed, never fitted on.
# "fourth" and "new" do not occur anywhere in train_corpus.
test_corpus = [
    "This is the fourth document.",
    "This document is the one.",
    "This document is new.",
]

In [40]:
def print_vector_results(corpus, vector, vocab, is_float=False):
    """Print each document with its vector, then the vector labelled per term.

    Args:
        corpus: Iterable of documents; paired positionally with ``vector``.
        vector: Iterable of per-document vectors (one row per document).
        vocab: Iterable of feature names; paired positionally with the
            entries of each row.
        is_float: When truthy, values are formatted with three decimals
            (``{:.3f}``); otherwise they are printed as-is.

    For every document the output is: the document and its raw vector on one
    line, a tab-separated run of ``name:value`` pairs, and a blank line.
    Pairing uses ``zip``, so extra items on the longer side are ignored.
    """
    # Both branches share the "name:" prefix; only the value formatting differs.
    pair_template = "{}:{:.3f}" if is_float else "{}:{}"

    for text, row in zip(corpus, vector):
        print(text, row)
        for term, weight in zip(vocab, row):
            print(pair_template.format(term, weight), end="\t")
        # Ends the tab-separated line and leaves one empty line after it.
        print("\n")
        

# Bag-of-Words
## Binary
Each column corresponds to an n-gram in the vocabulary and is 1 if that n-gram occurs in the sentence, 0 otherwise.<br>
Implemented with CountVectorizer and the parameter binary=True.

In [34]:
# Binary bag-of-words built with CountVectorizer.
# With binary=True every non-zero count is reported as 1.
cv = CountVectorizer(binary=True)

# Fit the vocabulary on the training sentences, then reuse it for the test sentences.
train_cv = cv.fit_transform(train_corpus).toarray()
vocab = cv.get_feature_names_out()
test_cv = cv.transform(test_corpus).toarray()

print("Train Vectors")
print_vector_results(train_corpus, train_cv, vocab)

print("Test Vectors")
print_vector_results(test_corpus, test_cv, vocab)


Train Vectors
This is the first document. [0 1 1 1 0 0 1 0 1]
and:0	document:1	first:1	is:1	one:0	second:0	the:1	third:0	this:1	

This document is the second document. [0 1 0 1 0 1 1 0 1]
and:0	document:1	first:0	is:1	one:0	second:1	the:1	third:0	this:1	

And this is the third one. [1 0 0 1 1 0 1 1 1]
and:1	document:0	first:0	is:1	one:1	second:0	the:1	third:1	this:1	

Is this the first document? [0 1 1 1 0 0 1 0 1]
and:0	document:1	first:1	is:1	one:0	second:0	the:1	third:0	this:1	

Test Vectors
This is the fourth document. [0 1 0 1 0 0 1 0 1]
and:0	document:1	first:0	is:1	one:0	second:0	the:1	third:0	this:1	

This document is the one. [0 1 0 1 1 0 1 0 1]
and:0	document:1	first:0	is:1	one:1	second:0	the:1	third:0	this:1	

This document is new. [0 1 0 1 0 0 0 0 1]
and:0	document:1	first:0	is:1	one:0	second:0	the:0	third:0	this:1	



## Count
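Each document vector is a column of the term-document matrix (equivalently, a row of the document-term matrix that CountVectorizer returns); its entries are the n-gram counts.<br>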

In [33]:
# Count-based bag-of-words with a default CountVectorizer (unigram features).
print("Unigram")
cv = CountVectorizer()

# Show how the vectorizer's own tokenizer splits the first training sentence.
cv_tokenizer = cv.build_tokenizer()
print(cv_tokenizer(train_corpus[0]))

# Fit the vocabulary on the training sentences, then reuse it for the test sentences.
train_cv = cv.fit_transform(train_corpus).toarray()
vocab = cv.get_feature_names_out()
test_cv = cv.transform(test_corpus).toarray()

print("Train Vectors")
print_vector_results(train_corpus, train_cv, vocab)

print("Test Vectors")
print_vector_results(test_corpus, test_cv, vocab)


Unigram
['This', 'is', 'the', 'first', 'document']
Train Vectors
This is the first document. [0 1 1 1 0 0 1 0 1]
and:0	document:1	first:1	is:1	one:0	second:0	the:1	third:0	this:1	

This document is the second document. [0 2 0 1 0 1 1 0 1]
and:0	document:2	first:0	is:1	one:0	second:1	the:1	third:0	this:1	

And this is the third one. [1 0 0 1 1 0 1 1 1]
and:1	document:0	first:0	is:1	one:1	second:0	the:1	third:1	this:1	

Is this the first document? [0 1 1 1 0 0 1 0 1]
and:0	document:1	first:1	is:1	one:0	second:0	the:1	third:0	this:1	

Test Vectors
This is the fourth document. [0 1 0 1 0 0 1 0 1]
and:0	document:1	first:0	is:1	one:0	second:0	the:1	third:0	this:1	

This document is the one. [0 1 0 1 1 0 1 0 1]
and:0	document:1	first:0	is:1	one:1	second:0	the:1	third:0	this:1	

This document is new. [0 1 0 1 0 0 0 0 1]
and:0	document:1	first:0	is:1	one:0	second:0	the:0	third:0	this:1	



In [41]:
# Count-based bag-of-words restricted to bigrams via ngram_range=(2, 2).
cv = CountVectorizer(ngram_range=(2, 2))

# Fit the bigram vocabulary on the training sentences, then reuse it for the test sentences.
train_cv = cv.fit_transform(train_corpus).toarray()
vocab = cv.get_feature_names_out()
test_cv = cv.transform(test_corpus).toarray()

print("Train Vectors")
print_vector_results(train_corpus, train_cv, vocab)

print("Test Vectors")
print_vector_results(test_corpus, test_cv, vocab)


Train Vectors
This is the first document. [0 0 1 1 0 0 1 0 0 0 0 1 0]
and this:0	document is:0	first document:1	is the:1	is this:0	second document:0	the first:1	the second:0	the third:0	third one:0	this document:0	this is:1	this the:0	

This document is the second document. [0 1 0 1 0 1 0 1 0 0 1 0 0]
and this:0	document is:1	first document:0	is the:1	is this:0	second document:1	the first:0	the second:1	the third:0	third one:0	this document:1	this is:0	this the:0	

And this is the third one. [1 0 0 1 0 0 0 0 1 1 0 1 0]
and this:1	document is:0	first document:0	is the:1	is this:0	second document:0	the first:0	the second:0	the third:1	third one:1	this document:0	this is:1	this the:0	

Is this the first document? [0 0 1 0 1 0 1 0 0 0 0 0 1]
and this:0	document is:0	first document:1	is the:0	is this:1	second document:0	the first:1	the second:0	the third:0	third one:0	this document:0	this is:0	this the:1	

Test Vectors
This is the fourth document. [0 0 0 1 0 0 0 0 0 0 0 1 0]
and this:0	docu

## TF-IDF Vectorizer
TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer.<br>
The IDF values learned from the train corpus are reused when transforming the test documents.<br>

### norm
By default the <i>norm</i> parameter is 'l2': each document vector is scaled so that the sum of the squares of its elements equals 1.<br>
Consequently, the dot product of two 'l2'-normalized vectors equals their <b>cosine similarity</b>.<br>
'l1' normalization instead scales each vector so that the sum of the absolute values of its elements equals 1.

In [42]:
# TF-IDF with a default TfidfVectorizer.
tfidf_vec = TfidfVectorizer()

# Fit on the training sentences, then apply the fitted vectorizer to the test sentences.
train_tfidf = tfidf_vec.fit_transform(train_corpus).toarray()
vocab = tfidf_vec.get_feature_names_out()
test_tfidf = tfidf_vec.transform(test_corpus).toarray()

# is_float=True prints every weight with three decimals.
print("Train TFIDF")
print_vector_results(train_corpus, train_tfidf, vocab, is_float=True)

print("Test TFIDF")
print_vector_results(test_corpus, test_tfidf, vocab, is_float=True)


Train TFIDF
This is the first document. [0.         0.46979139 0.58028582 0.38408524 0.         0.
 0.38408524 0.         0.38408524]
and:0.000	document:0.470	first:0.580	is:0.384	one:0.000	second:0.000	the:0.384	third:0.000	this:0.384	

This document is the second document. [0.         0.6876236  0.         0.28108867 0.         0.53864762
 0.28108867 0.         0.28108867]
and:0.000	document:0.688	first:0.000	is:0.281	one:0.000	second:0.539	the:0.281	third:0.000	this:0.281	

And this is the third one. [0.51184851 0.         0.         0.26710379 0.51184851 0.
 0.26710379 0.51184851 0.26710379]
and:0.512	document:0.000	first:0.000	is:0.267	one:0.512	second:0.000	the:0.267	third:0.512	this:0.267	

Is this the first document? [0.         0.46979139 0.58028582 0.38408524 0.         0.
 0.38408524 0.         0.38408524]
and:0.000	document:0.470	first:0.580	is:0.384	one:0.000	second:0.000	the:0.384	third:0.000	this:0.384	

Test TFIDF
This is the fourth document. [0.         0.57684669 0.  

In [43]:
# TF-IDF restricted to bigrams via ngram_range=(2, 2).
tfidf_vec = TfidfVectorizer(ngram_range=(2, 2))

# Fit on the training sentences, then apply the fitted vectorizer to the test sentences.
train_tfidf = tfidf_vec.fit_transform(train_corpus).toarray()
vocab = tfidf_vec.get_feature_names_out()
test_tfidf = tfidf_vec.transform(test_corpus).toarray()

# is_float=True prints every weight with three decimals.
print("Train TFIDF")
print_vector_results(train_corpus, train_tfidf, vocab, is_float=True)

print("Test TFIDF")
print_vector_results(test_corpus, test_tfidf, vocab, is_float=True)

Train TFIDF
This is the first document. [0.         0.         0.52303503 0.42344193 0.         0.
 0.52303503 0.         0.         0.         0.         0.52303503
 0.        ]
and this:0.000	document is:0.000	first document:0.523	is the:0.423	is this:0.000	second document:0.000	the first:0.523	the second:0.000	the third:0.000	third one:0.000	this document:0.000	this is:0.523	this the:0.000	

This document is the second document. [0.         0.47633035 0.         0.30403549 0.         0.47633035
 0.         0.47633035 0.         0.         0.47633035 0.
 0.        ]
and this:0.000	document is:0.476	first document:0.000	is the:0.304	is this:0.000	second document:0.476	the first:0.000	the second:0.476	the third:0.000	third one:0.000	this document:0.476	this is:0.000	this the:0.000	

And this is the third one. [0.49819711 0.         0.         0.31799276 0.         0.
 0.         0.         0.49819711 0.49819711 0.         0.39278432
 0.        ]
and this:0.498	document is:0.000	first d

In [44]:
# TF-IDF with L1 normalization (norm='l1') in place of the default.
tfidf_vec = TfidfVectorizer(norm='l1')

# Fit on the training sentences, then apply the fitted vectorizer to the test sentences.
train_tfidf = tfidf_vec.fit_transform(train_corpus).toarray()
vocab = tfidf_vec.get_feature_names_out()
test_tfidf = tfidf_vec.transform(test_corpus).toarray()

# is_float=True prints every weight with three decimals.
print("Train TFIDF")
print_vector_results(train_corpus, train_tfidf, vocab, is_float=True)

print("Test TFIDF")
print_vector_results(test_corpus, test_tfidf, vocab, is_float=True)


Train TFIDF
This is the first document. [0.         0.21331533 0.26348688 0.17439926 0.         0.
 0.17439926 0.         0.17439926]
and:0.000	document:0.213	first:0.263	is:0.174	one:0.000	second:0.000	the:0.174	third:0.000	this:0.174	

This document is the second document. [0.         0.33225959 0.         0.13582199 0.         0.26027443
 0.13582199 0.         0.13582199]
and:0.000	document:0.332	first:0.000	is:0.136	one:0.000	second:0.260	the:0.136	third:0.000	this:0.136	

And this is the third one. [0.21903289 0.         0.         0.11430045 0.21903289 0.
 0.11430045 0.21903289 0.11430045]
and:0.219	document:0.000	first:0.000	is:0.114	one:0.219	second:0.000	the:0.114	third:0.219	this:0.114	

Is this the first document? [0.         0.21331533 0.26348688 0.17439926 0.         0.
 0.17439926 0.         0.17439926]
and:0.000	document:0.213	first:0.263	is:0.174	one:0.000	second:0.000	the:0.174	third:0.000	this:0.174	

Test TFIDF
This is the fourth document. [0.         0.28962869 0.  