In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
import nltk.stem
import numpy as np
import sys

# Preprocessing

## Bag of Words (BoW)

In [2]:
# Toy corpus: one unrelated post plus four near-duplicates about imaging
# databases (the last one is the previous post repeated three times).
data = [
    'This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
    'Imaging databases can get huge.',
    'Most imaging databases safe images permanently.',
    'Imaging databases store images.',
    'Imaging databases store images. Imaging databases store images. Imaging databases store images.',
]

# Plain bag-of-words vectorizer with default settings; fitted in a later cell.
bow = CountVectorizer()

### Fit the bag-of-words transformer

In [3]:
# Learn the vocabulary from the example posts. The call is the cell's last
# expression, so the notebook echoes the fitted vectorizer's repr.
bow.fit(data)

CountVectorizer()

### Get unique words list

In [4]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. It returns a
# NumPy array, so convert to a plain list of str to keep the printed form.
print(bow.get_feature_names_out().tolist())

['about', 'actually', 'can', 'contains', 'databases', 'get', 'huge', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'safe', 'store', 'stuff', 'this', 'toy']


### Transform data to bag of words representation

In [5]:
# Encode every post as a row of word counts over the fitted vocabulary.
dataBow = bow.transform(data)
dataBow  # bare expression so the notebook shows the sparse-matrix summary

<5x24 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

### Array representation

In [6]:
# Densify for display only: one row per post, one column per vocabulary word.
dataBow.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
        1, 1],
       [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0],
       [0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
        0, 0]], dtype=int64)

## Stopwords + BoW

In [7]:
# Bag-of-words again, this time dropping scikit-learn's built-in English stop words.
swBow = CountVectorizer(stop_words='english')
swDataBow = swBow.fit_transform(data)
# Show the first 15 stop words in alphabetical order.
sorted(swBow.get_stop_words())[:15]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although']

## Stemming

In [8]:
# English Snowball stemmer; the stemmed vectorizer classes defined further
# down read this module-level object inside their analyzers.
stemmer = nltk.stem.SnowballStemmer('english')

In [9]:
# Show how the stemmer reduces a few sample words to their stems.
for word in ('graphics', 'imaging', 'eraser', 'cake'):
    print(f'Original Word: {word}, Stemmed Value: {stemmer.stem(word)}')

Original Word: graphics, Stemmed Value: graphic
Original Word: imaging, Stemmed Value: imag
Original Word: eraser, Stemmed Value: eras
Original Word: cake, Stemmed Value: cake


## BoW + Stemming

In [10]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose tokens are stemmed after the regular analysis."""

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens.

        The parent class's analyzer runs first; every token it produces is
        then passed through the module-level ``stemmer``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return (stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens


# Stop-word removal is configured on the vectorizer; stemming comes from the subclass.
stemmedBow = StemmedCountVectorizer(stop_words='english')
stemmedDataBow = stemmedBow.fit_transform(data)

## Term frequency - Inverse document frequency (TF-IDF) + Stemming

In [11]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokens are stemmed after the regular analysis."""

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens.

        Tokens come from the parent class's analyzer and are each run through
        the module-level ``stemmer`` before being handed back.
        """
        parent_analyzer = super().build_analyzer()

        def analyze_and_stem(doc):
            return (stemmer.stem(term) for term in parent_analyzer(doc))

        return analyze_and_stem


# Fit the stemmed TF-IDF model on the same corpus, with English stop words removed.
stemmedTfidf = StemmedTfidfVectorizer(stop_words='english')
stemmedDataTfidf = stemmedTfidf.fit_transform(data)

# Distance measure - relatedness

## Similarity measure

In [12]:
def euclidianMeasure(v1, v2):
    """Return the Euclidean distance between two sparse vectors.

    ``v1 - v2`` must yield an object exposing ``toarray()`` (for example,
    rows taken from a SciPy sparse matrix).
    """
    difference = (v1 - v2).toarray()
    return np.linalg.norm(difference)

def normalizedEuclidianMeasure(v1, v2):
    """Return the Euclidean distance between the unit-length versions of v1 and v2.

    Each vector is divided by its own Euclidean norm before the difference is
    taken, so vectors that differ only in magnitude are at distance 0.

    A vector whose norm is zero (for example a query containing no known
    vocabulary words) has no direction to normalize; it is left unscaled
    instead of being divided by zero.

    Both arguments must expose ``toarray()`` and support subtraction and
    division by a scalar (for example, rows of a SciPy sparse matrix).
    """
    def _to_unit_length(vec):
        norm = np.linalg.norm(vec.toarray())
        # Guard against division by zero for all-zero vectors.
        return vec / norm if norm > 0 else vec

    delta = _to_unit_length(v1) - _to_unit_length(v2)
    return np.linalg.norm(delta.toarray())

# Example

In [13]:
def analyzeBestText(data, dataVectors, queryVector, measure):
    """Print every post's distance to the query, then the closest post.

    Parameters
    ----------
    data : sequence of str
        The original posts; ``data[i]`` is reported alongside row ``i`` of
        ``dataVectors``.
    dataVectors : object providing ``getrow(i)``
        Vectorized posts, one row per entry in ``data``.
    queryVector : object
        Vectorized query; passed unchanged as the second argument of ``measure``.
    measure : callable
        ``measure(textVector, queryVector)`` returning a number; the post with
        the smallest value wins, and ties keep the earliest post.

    The function only prints and returns ``None``. When ``data`` is empty a
    short notice is printed instead of a ranking.
    """
    if len(data) == 0:
        # Nothing to rank; without this guard the summary line below would
        # raise a TypeError trying to format ``None`` as a float.
        print('No posts to analyze.')
        return

    bestText = None
    bestDist = None
    bestLoc = None
    for i, text in enumerate(data):
        dist = measure(dataVectors.getrow(i), queryVector)
        print(f'=== Post #{i} with dist = {dist:.2f}: {text}')
        if bestDist is None or dist < bestDist:
            bestText, bestDist, bestLoc = text, dist, i

    print(f"Best post is #{bestLoc} with dist ={bestDist: .2f}: {bestText}")

## BoW

In [14]:
# Short search phrase, vectorized with the plain bag-of-words model.
newText = 'imaging databases'
newTextBow = bow.transform([newText])

# Two rankings are printed back to back: the first compares length-normalized
# vectors, the second compares the raw count vectors.
analyzeBestText(data, dataBow, newTextBow, normalizedEuclidianMeasure)
analyzeBestText(data, dataBow, newTextBow, euclidianMeasure)

=== Post #0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 0.86: Imaging databases can get huge.
=== Post #2 with dist = 0.92: Most imaging databases safe images permanently.
=== Post #3 with dist = 0.77: Imaging databases store images.
=== Post #4 with dist = 0.77: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #3 with dist = 0.77: Imaging databases store images.
=== Post #0 with dist = 4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 1.73: Imaging databases can get huge.
=== Post #2 with dist = 2.00: Most imaging databases safe images permanently.
=== Post #3 with dist = 1.41: Imaging databases store images.
=== Post #4 with dist = 5.10: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #3 with dist = 1.41: Imaging databases store images.

## BoW + Stopwords

In [18]:
# Vectorize the same query with the stop-word-filtering vectorizer and rank
# the posts against the stop-word-filtered corpus matrix.
swNewTextBow = swBow.transform([newText])
print("\nNormalized euclidian measure")
analyzeBestText(data, swDataBow, swNewTextBow, normalizedEuclidianMeasure)


Normalized euclidian measure
=== Post #0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 0.61: Imaging databases can get huge.
=== Post #2 with dist = 0.86: Most imaging databases safe images permanently.
=== Post #3 with dist = 0.77: Imaging databases store images.
=== Post #4 with dist = 0.77: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #1 with dist = 0.61: Imaging databases can get huge.


## BoW + Stemming

In [16]:
# Vectorize the query with the stemming bag-of-words vectorizer and rank the
# posts against the stemmed corpus matrix.
stemmedNewTextBow = stemmedBow.transform([newText])
print("\nNormalized euclidian measure")
analyzeBestText(data, stemmedDataBow, stemmedNewTextBow, normalizedEuclidianMeasure)


Normalized euclidian measure
=== Post #0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 0.61: Imaging databases can get huge.
=== Post #2 with dist = 0.63: Most imaging databases safe images permanently.
=== Post #3 with dist = 0.52: Imaging databases store images.
=== Post #4 with dist = 0.52: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #3 with dist = 0.52: Imaging databases store images.


## TF-IDF + Stemming

In [17]:
# Vectorize the query with the stemming TF-IDF vectorizer and rank the posts
# against the stemmed TF-IDF corpus matrix.
stemmedNewTextTfidf = stemmedTfidf.transform([newText])
print("\nNormalized euclidian measure")
analyzeBestText(data, stemmedDataTfidf, stemmedNewTextTfidf, normalizedEuclidianMeasure)


Normalized euclidian measure
=== Post #0 with dist = 1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post #1 with dist = 0.87: Imaging databases can get huge.
=== Post #2 with dist = 0.86: Most imaging databases safe images permanently.
=== Post #3 with dist = 0.63: Imaging databases store images.
=== Post #4 with dist = 0.63: Imaging databases store images. Imaging databases store images. Imaging databases store images.
Best post is #3 with dist = 0.63: Imaging databases store images.
