# Bag of Words and Simple Vectorisation

In [1]:
import spacy

from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy corpus: four short news headlines, one document per list element.
corpus = [
    "Red Bull drops hint on F1 engine.",
    "Honda exits F1, leaving F1 partner Red Bull.",
    "Hamilton eyes record eighth F1 title.",
    "Aston Martin announces sponsor.",
]

### Building a basic bag of words representation of the above corpus
We can use scikit-learn's `CountVectorizer`, which takes a collection of text documents and creates a matrix of token counts.

In [3]:
# Create a CountVectorizer with its default settings
vectorizer = CountVectorizer()

In [4]:
# The fit_transform method does two things:
#   1. It learns a vocabulary dictionary from the corpus.
#   2. It returns a matrix where each row represents a document and each
#      column represents a token (i.e. term).
bow = vectorizer.fit_transform(corpus)

In [5]:
# view features: the learned vocabulary terms, listed in column-index order
print(vectorizer.get_feature_names_out())

['announces' 'aston' 'bull' 'drops' 'eighth' 'engine' 'exits' 'eyes' 'f1'
 'hamilton' 'hint' 'honda' 'leaving' 'martin' 'on' 'partner' 'record'
 'red' 'sponsor' 'title']


In [6]:
# view vocabulary dictionary: maps each term to its column index in the count matrix
vectorizer.vocabulary_

{'red': 17,
 'bull': 2,
 'drops': 3,
 'hint': 10,
 'on': 14,
 'f1': 8,
 'engine': 5,
 'honda': 11,
 'exits': 6,
 'leaving': 12,
 'partner': 15,
 'hamilton': 9,
 'eyes': 7,
 'record': 16,
 'eighth': 4,
 'title': 19,
 'aston': 1,
 'martin': 13,
 'announces': 0,
 'sponsor': 18}

In [7]:
# Specifically, the CountVectorizer generates a sparse matrix using an efficient, compressed representation
# (the printed type shows it is a SciPy CSR matrix).
print(type(bow))

<class 'scipy.sparse.csr.csr_matrix'>


In [8]:
# Printing the sparse matrix lists only its non-zero entries, one per line, as
# "(document index, token ID)  count" - e.g. "(1, 8)  2" means token 8 ('f1') occurs twice in document 1.
print(bow)

  (0, 17)	1
  (0, 2)	1
  (0, 3)	1
  (0, 10)	1
  (0, 14)	1
  (0, 8)	1
  (0, 5)	1
  (1, 17)	1
  (1, 2)	1
  (1, 8)	2
  (1, 11)	1
  (1, 6)	1
  (1, 12)	1
  (1, 15)	1
  (2, 8)	1
  (2, 9)	1
  (2, 7)	1
  (2, 16)	1
  (2, 4)	1
  (2, 19)	1
  (3, 1)	1
  (3, 13)	1
  (3, 0)	1
  (3, 18)	1


In [10]:
# CountVectorizer supports custom tokenisers: for each document it calls the tokeniser and expects a list of tokens back.
# Below is a simple callback that has spaCy tokenise the document, filters the tokens, and returns them.
# Load the spaCy pipeline that the tokeniser callback relies on.
nlp = spacy.load('en_core_web_sm')

# Tokeniser callback: spaCy tokens with the punctuation stripped out
def spacy_tokeniser(doc):
    """Return the text of every non-punctuation token spaCy finds in ``doc``."""
    kept = []
    for token in nlp(doc):
        if token.is_punct:
            continue
        kept.append(token.text)
    return kept

In [12]:
# This time CountVectorizer is built with our custom tokeniser (spacy_tokeniser),
# case-folding is switched off, and binary=True so the matrix holds 1s and 0s
# marking token presence rather than token frequency.
vectorizer = CountVectorizer(
    tokenizer=spacy_tokeniser,
    lowercase=False,
    binary=True,
)
bow = vectorizer.fit_transform(corpus)

In [13]:
# Features now keep their original casing because lowercase=False was passed
print(vectorizer.get_feature_names_out())

['Aston' 'Bull' 'F1' 'Hamilton' 'Honda' 'Martin' 'Red' 'announces' 'drops'
 'eighth' 'engine' 'exits' 'eyes' 'hint' 'leaving' 'on' 'partner' 'record'
 'sponsor' 'title']


In [14]:
# Term -> column index mapping for the case-sensitive vocabulary
vectorizer.vocabulary_

{'Red': 6,
 'Bull': 1,
 'drops': 8,
 'hint': 13,
 'on': 15,
 'F1': 2,
 'engine': 10,
 'Honda': 4,
 'exits': 11,
 'leaving': 14,
 'partner': 16,
 'Hamilton': 3,
 'eyes': 12,
 'record': 17,
 'eighth': 9,
 'title': 19,
 'Aston': 0,
 'Martin': 5,
 'announces': 7,
 'sponsor': 18}

In [15]:
# Dense array representation of the sparse matrix: one row per document, one column per
# vocabulary term, with 1 marking that the term is present (binary=True was used above).
print(bow.toarray())

[[0 1 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0]
 [0 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0]
 [0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1]
 [1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0]]


## Cosine Similarity
We can calculate cosine similarity with SciPy's `spatial` package, a collection of spatial algorithms and data structures. Its `distance.cosine` function returns the cosine *distance*, so to get the cosine similarity we subtract that distance from 1.

In [31]:
# scipy's cosine() only accepts 1-D vectors; recent SciPy releases raise
# "Input vector should be 1-D." for anything else. bow[i].toarray() is 2-D,
# so densify the matrix once and index its rows, which are 1-D.
dense_bow = bow.toarray()

# cosine similarity = 1 - cosine distance
doc1_vs_doc2 = 1 - spatial.distance.cosine(dense_bow[0], dense_bow[1])
doc1_vs_doc3 = 1 - spatial.distance.cosine(dense_bow[0], dense_bow[2])
doc1_vs_doc4 = 1 - spatial.distance.cosine(dense_bow[0], dense_bow[3])

# Show the documents being compared, one per line
for row in corpus:
    print(row)

print(f"Doc 1 vs Doc 2: {doc1_vs_doc2}")
print(f"Doc 1 vs Doc 3: {doc1_vs_doc3}")
print(f"Doc 1 vs Doc 4: {doc1_vs_doc4}")

Red Bull drops hint on F1 engine.
Honda exits F1, leaving F1 partner Red Bull.
Hamilton eyes record eighth F1 title.
Aston Martin announces sponsor.
Doc 1 vs Doc 2: 0.4285714285714286
Doc 1 vs Doc 3: 0.15430334996209194
Doc 1 vs Doc 4: 0.0


In [33]:
# cosine_similarity accepts either array-likes or sparse matrices; given a single
# matrix it returns the pairwise similarity between all of its rows
print(cosine_similarity(bow))

[[1.         0.42857143 0.15430335 0.        ]
 [0.42857143 1.         0.15430335 0.        ]
 [0.15430335 0.15430335 1.         0.        ]
 [0.         0.         0.         1.        ]]


### N-Grams
`CountVectorizer` includes an `ngram_range` parameter to generate different n-grams. The range is specified as a (minimum, maximum) pair. By default it is set to (1, 1), which generates only unigrams. Setting it to (1, 2) generates both unigrams and bigrams.

In [35]:
# ngram_range=(1, 2): the vocabulary holds both unigrams and bigrams
vectorizer = CountVectorizer(
    tokenizer=spacy_tokeniser,
    lowercase=False,
    binary=True,
    ngram_range=(1, 2),
)
bigrams = vectorizer.fit_transform(corpus)

# Inspect the learned features, how many there are, and their column indices
print(vectorizer.get_feature_names_out())
print('Number of features: {}'.format(len(vectorizer.get_feature_names_out())))
print(vectorizer.vocabulary_)

['Aston' 'Aston Martin' 'Bull' 'Bull drops' 'F1' 'F1 engine' 'F1 leaving'
 'F1 partner' 'F1 title' 'Hamilton' 'Hamilton eyes' 'Honda' 'Honda exits'
 'Martin' 'Martin announces' 'Red' 'Red Bull' 'announces'
 'announces sponsor' 'drops' 'drops hint' 'eighth' 'eighth F1' 'engine'
 'exits' 'exits F1' 'eyes' 'eyes record' 'hint' 'hint on' 'leaving'
 'leaving F1' 'on' 'on F1' 'partner' 'partner Red' 'record'
 'record eighth' 'sponsor' 'title']
Number of features: 40
{'Red': 15, 'Bull': 2, 'drops': 19, 'hint': 28, 'on': 32, 'F1': 4, 'engine': 23, 'Red Bull': 16, 'Bull drops': 3, 'drops hint': 20, 'hint on': 29, 'on F1': 33, 'F1 engine': 5, 'Honda': 11, 'exits': 24, 'leaving': 30, 'partner': 34, 'Honda exits': 12, 'exits F1': 25, 'F1 leaving': 6, 'leaving F1': 31, 'F1 partner': 7, 'partner Red': 35, 'Hamilton': 9, 'eyes': 26, 'record': 36, 'eighth': 21, 'title': 39, 'Hamilton eyes': 10, 'eyes record': 27, 'record eighth': 37, 'eighth F1': 22, 'F1 title': 8, 'Aston': 0, 'Martin': 13, 'announces

In [37]:
# With ngram_range=(2, 2) only bigrams are generated.
vectorizer = CountVectorizer(
    tokenizer=spacy_tokeniser,
    lowercase=False,
    binary=True,
    ngram_range=(2, 2),
)
bigrams = vectorizer.fit_transform(corpus)

# Inspect the bigram features and their column indices
print(vectorizer.get_feature_names_out())
print(vectorizer.vocabulary_)

['Aston Martin' 'Bull drops' 'F1 engine' 'F1 leaving' 'F1 partner'
 'F1 title' 'Hamilton eyes' 'Honda exits' 'Martin announces' 'Red Bull'
 'announces sponsor' 'drops hint' 'eighth F1' 'exits F1' 'eyes record'
 'hint on' 'leaving F1' 'on F1' 'partner Red' 'record eighth']
{'Red Bull': 9, 'Bull drops': 1, 'drops hint': 11, 'hint on': 15, 'on F1': 17, 'F1 engine': 2, 'Honda exits': 7, 'exits F1': 13, 'F1 leaving': 3, 'leaving F1': 16, 'F1 partner': 4, 'partner Red': 18, 'Hamilton eyes': 6, 'eyes record': 14, 'record eighth': 19, 'eighth F1': 12, 'F1 title': 5, 'Aston Martin': 0, 'Martin announces': 8, 'announces sponsor': 10}
