# Bag of Words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

### Counting the occurrences of words in a text

#### Pros
* Works for any text
* Easy and fast to do
* Does not require a language model (just the corpus)

#### Cons
* Does not apply language knowledge (the only built-in stop-word list is for English)
* Order of words is ignored
* Uniqueness of words is not accentuated (rare, distinctive words get no more weight than common ones)

In [37]:
# The two artists whose lyrics we want to tell apart.
artists = [
    "Ben Howard",
    "The Doors",
]

In [38]:
# One label per document in the corpus: three entries for artists[0]
# followed by three entries for artists[1].
labels = [artists[idx] for idx in (0, 0, 0, 1, 1, 1)]

In [49]:
# The corpus is the collection of documents: here, six short lyrics excerpts,
# each stored as one multi-line string.
corpus = ["""Living without her
Living at all
Seems to slow me down
Living forever
Hell, I don't know
Do I care, do I care
The thunder's rumbled sound""",
          """Wrapped up in dissonance
I'm sorry that I just walked away
Lost in the insignificance of mine
I had no words to say""",
          """Hold it in, the river in your mouth is pouring out
Water takes the shape of all that it surrounds
Yeah I know, I've been trying so hard
To keep in time with all of the hours in your day""",
          """People are strange when you're a stranger
Faces look ugly when you're alone
Women seem wicked when you're unwanted
Streets are uneven when you're down""",
          """You know that it would be untrue
You know that I would be a liar
If I was to say to you
Girl, we couldn't get much higher
""", 
          """This is the end
Beautiful friend
This is the end
My only friend, the end"""]

In [50]:
# Bag-of-words vectorizer with its default behaviour written out explicitly:
# text is lower-cased and no stop-word list is applied.
cv = CountVectorizer(lowercase=True, stop_words=None)

In [51]:
# Learn the vocabulary from the corpus and count every term per document
# in a single call.
vectorized_corpus = cv.fit_transform(raw_documents=corpus)

In [52]:
# Show the summary repr of the document-term matrix (shape, dtype and the
# number of stored, i.e. non-zero, entries).
vectorized_corpus

<6x95 sparse matrix of type '<class 'numpy.int64'>'
	with 113 stored elements in Compressed Sparse Row format>

#### Sparse Matrix
Most of our matrix consists of zeroes. A Sparse Matrix only stores the non-zero values to save memory.

In [57]:
# Expand to a dense matrix purely to look at the raw counts.
# NOTE(review): harmless for this tiny 6x95 matrix, but on a real corpus
# densifying gives up the memory savings of the sparse format.
vectorized_corpus.todense()

matrix([[1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 1, 1, 0, 0, 1, 0, 0, 0,
         0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
         0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
        [2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 1, 1, 0, 4, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 1, 1, 1, 3, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2],
       

In [58]:
# Wrap the counts in a labelled DataFrame: one row per document (indexed by
# artist) and one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. toarray() yields a plain
# ndarray instead of the legacy np.matrix that todense() returns.
df = pd.DataFrame(
    vectorized_corpus.toarray(),
    index=labels,
    columns=cv.get_feature_names_out(),
)
df

Unnamed: 0,all,alone,are,at,away,be,beautiful,been,care,couldn,...,wicked,with,without,women,words,would,wrapped,yeah,you,your
Ben Howard,1,0,0,1,0,0,0,0,2,0,...,0,0,1,0,0,0,0,0,0,0
Ben Howard,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
Ben Howard,2,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,2
The Doors,0,1,2,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,4,0
The Doors,0,0,0,0,0,2,0,0,0,1,...,0,0,0,0,0,2,0,0,3,0
The Doors,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---

### The Tf-Idf Transformer:

#### (Counting plus scaling - both by document and by word)

* TF - Term Frequency (same as Count Vectorizer)
* IDF - Inverse Document Frequency

$TFIDF(w,d) = TF(w,d) \cdot IDF(w)$

$IDF(w) = \ln\left(\frac{1 + N}{1 + DF(w)}\right) + 1$, where $N$ is the number of documents and $DF(w)$ is the number of documents containing word $w$.

##### The steps for calculating TFIDF are:
* For each vector:
    * Calculate the term frequency for each term in the vector
    * Calculate the inverse doc frequency for each term in the vector
    * Multiply the two for each term in the vector
* Then normalise each vector by its Euclidean (L2) norm (numpy.linalg.norm)
    * $v_{norm} = \frac{v}{||v||_2}$

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [59]:
# TF-IDF re-weighting for an existing count matrix, with scikit-learn's
# default settings spelled out.
tfid = TfidfTransformer(
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [63]:
# Use a fresh transformer so this cell can be re-run safely: `tfid` ends up
# bound to the TF-IDF weighted sparse matrix rather than to the transformer,
# so calling `tfid.fit_transform(...)` a second time would raise
# AttributeError.
tfid = TfidfTransformer().fit_transform(df)

In [66]:
# Same layout as the count DataFrame, but holding TF-IDF weights.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out(); toarray() avoids the legacy np.matrix type
# returned by todense().
df_tfid = pd.DataFrame(
    tfid.toarray(),
    index=labels,
    columns=cv.get_feature_names_out(),
)

---

In [68]:
# Peek at the first rows of the TF-IDF weighted frame.
df_tfid.head()

Unnamed: 0,all,alone,are,at,away,be,beautiful,been,care,couldn,...,wicked,with,without,women,words,would,wrapped,yeah,you,your
Ben Howard,0.14604,0.0,0.0,0.178095,0.0,0.0,0.0,0.0,0.35619,0.0,...,0.0,0.0,0.178095,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ben Howard,0.0,0.0,0.0,0.0,0.234289,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.234289,0.0,0.234289,0.0,0.0,0.0
Ben Howard,0.236771,0.0,0.0,0.0,0.0,0.0,0.0,0.14437,0.0,0.0,...,0.0,0.14437,0.0,0.0,0.0,0.0,0.0,0.14437,0.0,0.28874
The Doors,0.0,0.128638,0.257276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.128638,0.0,0.0,0.128638,0.0,0.0,0.0,0.0,0.42194,0.0
The Doors,0.0,0.0,0.0,0.0,0.0,0.361321,0.0,0.0,0.0,0.180661,...,0.0,0.0,0.0,0.0,0.0,0.361321,0.0,0.0,0.444433,0.0


In [91]:
# TfidfVectorizer goes from raw text straight to TF-IDF weights, combining
# the counting and re-weighting steps shown above.
tfid_vec = TfidfVectorizer()

# Learn vocabulary and IDF weights and encode the corpus in a single pass.
tfidtrans_corpus = tfid_vec.fit_transform(corpus)

# Name kept for the cells below; it refers to the same fitted vectorizer.
tfid_vec_trans = tfid_vec

### Building TF-IDF from the ground up

In [92]:
import math

# Worked example: a word with term frequency 1 that occurs in all 4 of 4
# documents.
n_docs = 4
doc_freq = 4
tf = 1.0

# Smoothed IDF, as in the formula above: ln((1 + N) / (1 + DF)) + 1.
# Both numerator and denominator carry the "+1"; a word that appears in
# every document therefore gets an IDF of exactly 1.
idf = math.log((1 + n_docs) / (1 + doc_freq)) + 1

# Divide by the Euclidean norm of the document vector.
# NOTE(review): sqrt(4) presumably stands for a document with four terms that
# each carry this same weight of 1.0 - confirm against the intended example.
(tf * idf) / math.sqrt(4)

0.5

## Using a Classifier

In [75]:
from sklearn.linear_model import LogisticRegression

In [80]:
# Sanity check of the feature matrix dimensions before fitting a model
# (rows = documents, columns = vocabulary terms).
tfidtrans_corpus.shape

(6, 95)

In [82]:
# Fit a logistic-regression classifier on the TF-IDF features, using the
# artist names as class labels.
m = LogisticRegression()
m.fit(X=tfidtrans_corpus, y=labels)

LogisticRegression()

### Prediction for a new song 

In [93]:
# Encode the unseen text with the vocabulary and IDF weights learned from the
# corpus, then ask the classifier which artist it assigns.
new_song_features = tfid_vec_trans.transform(["this is a song"])
m.predict(new_song_features)

array(['The Doors'], dtype='<U10')

---