# TF-IDF algorithm Text Similarity

Term frequency (TF): This technique measures the relative frequency of a word in a given document, i.e. the number of times the word occurs divided by the total number of words in that document.

Inverse document frequency (IDF): This technique gives a lower weight to words that are used frequently across documents (a, the, and so on) than to words that are rarely used.

The text similarity problem deals with the challenge of finding how close the given text documents are to one another.

In [1]:
import nltk

In [2]:
import math

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
class TextSimilarityExample:
    """Demonstrate TF-IDF weighting and cosine similarity on a small corpus.

    The corpus is a list of sentences held in ``self.statements``. The class
    computes hand-rolled TF and IDF values with NLTK tokenization, and uses
    scikit-learn's ``TfidfVectorizer`` / ``cosine_similarity`` for the
    pairwise document similarity.
    """

    def __init__(self, statements=None):
        """Store the corpus.

        Args:
            statements: Optional iterable of sentences to use as the corpus.
                When omitted, the built-in four-sentence example is used.
        """
        if statements is None:
            statements = [
                'ruled india',
                'Chalukyas ruled Badami',
                'So many kingdoms ruled India',
                'Lalbagh is a botanical garden in India'
            ]
        # Copy so later changes to the caller's list do not alter the corpus.
        self.statements = list(statements)

    @staticmethod
    def _tokenize(text):
        """Return the lower-cased NLTK word tokens of *text*."""
        return nltk.word_tokenize(text.lower())

    def TF(self, sentence):
        """Return the term frequency of every token in *sentence*.

        Args:
            sentence: The text to analyse.

        Returns:
            A dict mapping each lower-cased token to its occurrence count
            divided by the total number of tokens in the sentence. An
            input with no tokens yields an empty dict.
        """
        words = self._tokenize(sentence)
        total = float(len(words))
        freq = nltk.FreqDist(words)
        return {word: count / total for word, count in freq.items()}

    def IDF(self):
        """Return the inverse document frequency of every corpus token.

        Returns:
            A dict mapping each token to ``1 + ln(N / df)``, where ``N`` is
            the number of statements and ``df`` is the number of statements
            that contain the token.
        """
        num_documents = len(self.statements)
        document_counts = {}
        for sentence in self.statements:
            # A set counts each word once per statement, so a word repeated
            # inside one sentence does not inflate its document frequency.
            for word in set(self._tokenize(sentence)):
                document_counts[word] = document_counts.get(word, 0) + 1
        # float() keeps this a true division on every Python version.
        return {
            word: 1.0 + math.log(float(num_documents) / count)
            for word, count in document_counts.items()
        }

    def TF_IDF(self, query):
        """Return the TF-IDF weight of each query word in every statement.

        Args:
            query: The query text; it is tokenized like the statements.

        Returns:
            A dict mapping each distinct query token to a list holding one
            TF-IDF value per statement, in statement order. A token that is
            absent from a statement, or from the whole corpus, scores 0.0.
        """
        # Keep each query token once (first-seen order); otherwise a repeated
        # word would be appended twice per statement.
        words = []
        seen = set()
        for word in self._tokenize(query):
            if word not in seen:
                seen.add(word)
                words.append(word)

        idf = self.IDF()
        vectors = {}
        for sentence in self.statements:
            tf = self.TF(sentence)
            for word in words:
                weight = tf.get(word, 0.0) * idf.get(word, 0.0)
                vectors.setdefault(word, []).append(weight)
        return vectors

    def displayVectors(self, vectors):
        """Print the statements, then one ``word -> values`` line per entry.

        Args:
            vectors: A mapping as returned by ``TF_IDF``.
        """
        print(self.statements)
        for word, values in vectors.items():
            print("{} -> {}".format(word, values))

    def cosineSimilarity(self):
        """Print the cosine similarity of each statement against all of them.

        The statements are vectorized with scikit-learn's ``TfidfVectorizer``
        and each row of the resulting matrix is compared with the full matrix.
        """
        vec = TfidfVectorizer()
        matrix = vec.fit_transform(self.statements)
        # Iterate over the actual number of rows instead of a fixed count.
        for i in range(matrix.shape[0]):
            print("\tsimilarity of document {} with others".format(i))
            similarity = cosine_similarity(matrix[i:i + 1], matrix)
            print(similarity)

    def demo(self):
        """Run the example, using the first statement as the query.

        Prints the TF-IDF vectors of the query words and then the pairwise
        cosine similarities. Requires at least one statement.
        """
        inputQuery = self.statements[0]
        vectors = self.TF_IDF(inputQuery)
        self.displayVectors(vectors)
        self.cosineSimilarity()

In [6]:
# Create the example object; its constructor fills in the built-in list of sentences.
similarity = TextSimilarityExample()

In [7]:
# Run the demo: prints the TF-IDF vectors for the first sentence used as the query,
# then the cosine similarity of each sentence against all sentences.
similarity.demo()

['ruled india', 'Chalukyas ruled Badami', 'So many kingdoms ruled India', 'Lalbagh is a botanical garden in India']
ruled -> [0.6438410362258904, 0.42922735748392693, 0.2575364144903562, 0.0]
india -> [0.6438410362258904, 0.0, 0.2575364144903562, 0.18395458177882582]
	similarity of document 0 with others
[[ 1.          0.29088811  0.46216171  0.19409143]]
	similarity of document 1 with others
[[ 0.29088811  1.          0.13443735  0.        ]]
	similarity of document 2 with others
[[ 0.46216171  0.13443735  1.          0.08970163]]
	similarity of document 3 with others
[[ 0.19409143  0.          0.08970163  1.        ]]
