In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import nltk.stem
import numpy as np
import sys

# Preprocessing

## Bag of Words (BoW)

In [2]:
# Toy corpus: one unrelated post followed by four posts about imaging
# databases (the last one is the fourth post repeated three times).
dummyTexts = [
  'This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
  'Imaging databases can get huge.',
  'Most imaging databases safe images permanently.',
  'Imaging databases store images.',
  'Imaging databases store images. Imaging databases store images. Imaging databases store images.',
]

# Bag-of-words vectorizer with default settings.
vectorizer = CountVectorizer()

### Fit the bag-of-words transformer

In [3]:
# Learn the vocabulary from the toy corpus; the fitted vectorizer is shown as the cell output.
vectorizer.fit(dummyTexts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### Get unique words list

In [4]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# .tolist() converts the returned ndarray to a plain list of Python strings so
# the printed output keeps the same list form as before.
if hasattr(vectorizer, 'get_feature_names_out'):
  featureNames = vectorizer.get_feature_names_out().tolist()
else:
  featureNames = vectorizer.get_feature_names()
print(featureNames)

['about', 'actually', 'can', 'contains', 'databases', 'get', 'huge', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'safe', 'store', 'stuff', 'this', 'toy']


### Transform data to bag of words representation

In [5]:
# Turn every text into a row of word counts over the fitted vocabulary.
# The result stays sparse because most entries are zero.
dummyTextVectors = vectorizer.transform(dummyTexts)
dummyTextVectors

<5x24 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

### Array representation

In [6]:
# Dense view of the count matrix, one row per text (fine here because the matrix is tiny).
dummyTextVectors.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
        1, 1],
       [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0],
       [0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
        0, 0]])

## Stopwords

In [7]:
# Vectorizer configured with scikit-learn's built-in English stop-word list.
noSWVectorizer = CountVectorizer(stop_words='english')
# Peek at the first 15 stop words in alphabetical order.
sorted(noSWVectorizer.get_stop_words())[:15]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although']

## Stemming

In [8]:
# Snowball stemmer for English, used in the next cell to show a few example stems.
s = nltk.stem.SnowballStemmer('english')

In [9]:
# Show the stem produced for a few sample words. A single loop with real
# f-string interpolation replaces four copy-pasted prints whose f-strings had
# no placeholders; the printed text is unchanged.
for word in ('graphics', 'imaging', 'eraser', 'cake'):
  print(f'Original Word: {word}, Stemmed Value: {s.stem(word)}')

Original Word: graphics, Stemmed Value: graphic
Original Word: imaging, Stemmed Value: imag
Original Word: eraser, Stemmed Value: eras
Original Word: cake, Stemmed Value: cake


## BoW + Stemming

In [10]:
# Module-level Snowball stemmer shared by every StemmedCountVectorizer instance.
englishStemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
  """CountVectorizer variant whose analyzer stems each token with englishStemmer."""

  def build_analyzer(self):
    """Return a callable that runs the parent analyzer and stems every token it yields."""
    baseAnalyzer = super().build_analyzer()

    def stemmedAnalyzer(doc):
      # Generator expression: tokens are stemmed lazily as they are consumed.
      return (englishStemmer.stem(token) for token in baseAnalyzer(doc))

    return stemmedAnalyzer

## Term frequency - Inverse document frequency (TF-IDF)

# Distance measure - relatedness

In [11]:
# Query text, vectorized with the vocabulary that `vectorizer` learned from dummyTexts.
newText = 'imaging databases'
newTextVector = vectorizer.transform([newText])

def analyzeBestText(data, dataVectors, queryVector, measure):
  """Print the distance of every text to the query and report the closest one.

  Args:
    data: Sequence of texts; it is sized with len() and indexed by position.
    dataVectors: Object whose getrow(i) returns the vector belonging to data[i].
    queryVector: Vector of the query; passed as the second argument to measure.
    measure: Callable measure(textVector, queryVector) returning a number,
      where a smaller value means a closer match.

  Returns:
    None. All results are printed.
  """
  if len(data) == 0:
    # Without this guard the summary line would try to format None with
    # '.2f' and raise TypeError.
    print("No posts to compare.")
    return

  bestLoc = None
  bestDist = None
  for i, text in enumerate(data):
    dist = measure(dataVectors.getrow(i), queryVector)
    print(f'=== Post #{i} with dist = {dist:.2f}: {text}')
    # Strict comparison: on ties the earliest post stays the best one.
    if bestDist is None or dist < bestDist:
      bestLoc, bestDist = i, dist

  # Same 'dist = {value:.2f}' layout as the per-post lines above.
  print(f"Best post is #{bestLoc} with dist = {bestDist:.2f}: {data[bestLoc]}")

## Similarity measure

In [12]:
def euclidianMeasure(v1, v2):
  """Return the norm of the difference between two sparse vectors.

  The difference v1 - v2 is converted to a dense array with toarray() and
  handed to np.linalg.norm with its default settings.
  """
  difference = (v1 - v2).toarray()
  return np.linalg.norm(difference)

# Rank the toy posts against the query using the raw (unnormalized) count vectors.
analyzeBestText(dummyTexts, dummyTextVectors, newTextVector, euclidianMeasure)

=== Post #0 with dist = 4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 1.73: Imaging databases can get huge.
=== Post #2 with dist = 2.00: Most imaging databases safe images permanently.
=== Post #3 with dist = 1.41: Imaging databases store images.
=== Post #4 with dist = 5.10: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #3 with dist = 1.41: Imaging databases store images.


In [13]:
def normalizedEuclidianMeasure(v1, v2):
  """Return the norm of the difference of two sparse vectors after scaling each to unit length.

  Each vector is divided by the norm of its dense form, so only the relative
  word proportions matter, not how often a text repeats itself. A vector
  whose norm is zero is left unscaled instead of being divided by zero.

  Args:
    v1: Sparse vector providing toarray(), subtraction and scalar division.
    v2: Sparse vector of the same shape as v1.

  Returns:
    The np.linalg.norm of the dense difference of the two scaled vectors.
  """
  def toUnitLength(v):
    norm = np.linalg.norm(v.toarray())
    # An all-zero vector (e.g. a query with no known words) has no direction;
    # dividing by its zero norm would emit a RuntimeWarning and can yield NaN.
    return v / norm if norm > 0 else v

  delta = toUnitLength(v1) - toUnitLength(v2)
  return np.linalg.norm(delta.toarray())

# Rank the toy posts again, this time comparing length-normalized count vectors.
analyzeBestText(dummyTexts, dummyTextVectors, newTextVector, normalizedEuclidianMeasure)

=== Post #0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 0.86: Imaging databases can get huge.
=== Post #2 with dist = 0.92: Most imaging databases safe images permanently.
=== Post #3 with dist = 0.77: Imaging databases store images.
=== Post #4 with dist = 0.77: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #3 with dist = 0.77: Imaging databases store images.


In [14]:
# Fit the stop-word-filtering vectorizer on the same corpus, vectorize the
# query with it, and rank the posts once per distance measure.
noSWDummyTextVector = noSWVectorizer.fit_transform(dummyTexts)
newNoSWTextVector = noSWVectorizer.transform([newText])
for heading, measure in (("Euclidian measure", euclidianMeasure),
                         ("\nNormalized euclidian measure", normalizedEuclidianMeasure)):
  print(heading)
  analyzeBestText(dummyTexts, noSWDummyTextVector, newNoSWTextVector, measure)

Euclidian measure
=== Post #0 with dist = 3.16: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 1.00: Imaging databases can get huge.
=== Post #2 with dist = 1.73: Most imaging databases safe images permanently.
=== Post #3 with dist = 1.41: Imaging databases store images.
=== Post #4 with dist = 5.10: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #1 with dist = 1.00: Imaging databases can get huge.

Normalized euclidian measure
=== Post #0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 0.61: Imaging databases can get huge.
=== Post #2 with dist = 0.86: Most imaging databases safe images permanently.
=== Post #3 with dist = 0.77: Imaging databases store images.
=== Post #4 with dist = 0.77: Imaging databases store images. Imaging databases store images. Imaging database