## Document similarity

In [None]:
# http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
# https://stackoverflow.com/questions/8897593/similarity-between-two-text-documents

## TF-IDF 

tf-idf, short for term frequency–inverse document frequency, is a numeric measure used to score the importance of a word in a document, based on how often the word appears in that document and in a given collection of documents. The intuition for this measure is: if a word appears frequently in a document, then it should be important and we should give that word a high score. But if a word appears in too many other documents, it’s probably not a unique identifier, so we should assign a lower score to that word.

See [https://ethen8181.github.io/machine-learning/clustering_old/tf_idf/tf_idf.html](https://ethen8181.github.io/machine-learning/clustering_old/tf_idf/tf_idf.html)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a tf-idf model over a tiny toy corpus: each document becomes one row
# of the resulting matrix and each distinct token becomes one column.
vectorizer = TfidfVectorizer()

documents = (
    "The sky is blue",
    "The sun is bright",
    "The sun in the sky is bright",
    "We can see the shining sun, the bright sun",
)

# fit_transform learns the vocabulary from `documents` and returns the
# weighted document-term matrix in a single step.
tfidf = vectorizer.fit_transform(documents)
print('size tfidf matrix (#documents, #unique words):', tfidf.shape)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain Python list of strings.
print('unique words', vectorizer.get_feature_names_out().tolist())

size tfidf matrix (#documents, #unique words): (4, 11)
unique words ['blue', 'bright', 'can', 'in', 'is', 'see', 'shining', 'sky', 'sun', 'the', 'we']


In [43]:
# Inspect the tf-idf matrix. Printing the sparse matrix lists only the stored
# (non-zero) cells, one per line, as "(document, term)  weight".
# The matrix built in the previous cell is named `tfidf`; no `tfidf_matrix`
# is defined in this notebook, so printing that name fails on a fresh kernel.
print(tfidf)
# 'shining' is term 6; it appears only in the last document (index 3):
#   (3, 6) is set; (0, 6), (1, 6) and (2, 6) are not.
# 'sun' is term 8:
#   - it does not appear in doc 0, so (0, 8) is missing;
#   - it appears once in doc 1 and once in doc 2, but doc 1 is shorter, so
#     0.52 (cell 1, 8) is larger than 0.32 (cell 2, 8);
#   - it appears twice in doc 3, but doc 3 is longer: 0.48 (cell 3, 8).

  (0, 9)	0.3439932714296342
  (0, 7)	0.5197138488789809
  (0, 4)	0.42075315164463567
  (0, 0)	0.6591911178676787
  (1, 9)	0.42685800978431027
  (1, 4)	0.5221086219944969
  (1, 8)	0.5221086219944969
  (1, 1)	0.5221086219944969
  (2, 9)	0.5262610401109715
  (2, 7)	0.3975443320946988
  (2, 4)	0.32184639875982174
  (2, 8)	0.32184639875982174
  (2, 1)	0.32184639875982174
  (2, 3)	0.5042345768555538
  (3, 9)	0.39096308821336656
  (3, 8)	0.4782039801500678
  (3, 1)	0.2391019900750339
  (3, 10)	0.37459947122408604
  (3, 2)	0.37459947122408604
  (3, 5)	0.37459947122408604
  (3, 6)	0.37459947122408604


In [44]:
# Pairwise document similarity: entry (i, j) is the dot product of the tf-idf
# rows of documents i and j. The rows are unit length (the diagonal of the
# result is 1), so this dot product is the cosine similarity.
# `@` is the explicit matrix-product operator; unlike `*`, its meaning does
# not depend on whether `tfidf` is a sparse matrix or a sparse array.
pairwise_similarity = tfidf @ tfidf.T
# .toarray() converts the sparse result to a dense NumPy array for display.
# (The `.A` shorthand was removed from SciPy sparse matrices in SciPy 1.14.)
print('matrix\n', pairwise_similarity.toarray())
print('\ncells\n', pairwise_similarity)

matrix
 [[1.         0.36651513 0.52305744 0.13448867]
 [0.36651513 1.         0.72875508 0.54139736]
 [0.52305744 0.72875508 1.         0.43661098]
 [0.13448867 0.54139736 0.43661098 1.        ]]

cells
   (0, 3)	0.13448867172274862
  (0, 2)	0.5230574383703659
  (0, 1)	0.36651513142667
  (0, 0)	0.9999999999999998
  (1, 3)	0.5413973573965388
  (1, 2)	0.728755079459936
  (1, 1)	0.9999999999999998
  (1, 0)	0.36651513142667
  (2, 3)	0.4366109847740327
  (2, 2)	0.9999999999999998
  (2, 1)	0.728755079459936
  (2, 0)	0.5230574383703659
  (3, 3)	1.0
  (3, 2)	0.4366109847740327
  (3, 1)	0.5413973573965388
  (3, 0)	0.13448867172274862
