In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# TF-IDF

# Alternative configuration kept for reference:
# tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='utf-8', ngram_range=(1, 2), stop_words='english')
tfidf = TfidfVectorizer()

# NOTE: despite its name, `transformer` is the value returned by fit_transform,
# i.e. the sparse document-term matrix with one row per sentence below.
transformer = tfidf.fit_transform(["I'd like an apple",
                            "An apple a day keeps the doctor away",
                            "Never compare an apple to an orange",
                            "I prefer scikit-learn to Orange"])

# Pairwise similarity of every sentence with every other sentence.
# `@` is a matrix product for every SciPy sparse type (`*` is element-wise on
# sparse arrays), and `.toarray()` is the portable way to densify (`.A` is a
# legacy shorthand that sparse arrays no longer provide).
(transformer @ transformer.T).toarray()

array([[1.        , 0.25082859, 0.39482963, 0.        ],
       [0.25082859, 1.        , 0.22057609, 0.        ],
       [0.39482963, 0.22057609, 1.        , 0.26264139],
       [0.        , 0.        , 0.26264139, 1.        ]])

In [3]:
import nltk, string

In [4]:
# TF-IDF with pre-processing

# Translation table for str.translate: every ASCII punctuation code point maps
# to None, which deletes that character from the text.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

# Tokenizer data used by nltk.word_tokenize further down.
nltk.download('punkt')

# One shared Porter stemmer instance, reused by stem_tokens.
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens):
    """Return a list with the Porter stem of every token, in input order.

    Relies on the module-level ``stemmer`` instance.
    """
    return list(map(stemmer.stem, tokens))

def normalize(text):
    '''Remove punctuation, lowercase, tokenize and stem ``text``.

    Args:
        text: The raw string to normalize.

    Returns:
        A list of stemmed tokens.
    '''
    lowered = text.lower()
    # Punctuation is deleted (not replaced by a space) *before* tokenizing, so
    # "I'd" becomes "id" and "scikit-learn" becomes "scikitlearn".
    without_punctuation = lowered.translate(remove_punctuation_map)
    return stem_tokens(nltk.word_tokenize(without_punctuation))

# Stop-word filtering is applied to the tokens produced by `normalize`, i.e. to
# stemmed tokens. The built-in English list holds unstemmed words, so any stop
# word whose stem differs from its spelling would never be removed (this is what
# scikit-learn's "stop_words may be inconsistent with your preprocessing"
# warning is about). Extend the list with the normalized form of every entry.
_english_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
_normalized_stop_words = sorted(
    set(_english_stop_words).union(
        token for word in _english_stop_words for token in normalize(word)
    )
)

# token_pattern=None: the default regex is unused once a custom tokenizer is
# supplied; saying so explicitly avoids the "token_pattern will not be used"
# warning on recent scikit-learn versions.
vectorizer = TfidfVectorizer(
    tokenizer=normalize,
    stop_words=_normalized_stop_words,
    token_pattern=None,
)

def cosine_sim(text1, text2):
    """Return the TF-IDF similarity score between two texts.

    The module-level ``vectorizer`` is re-fitted on just these two texts on
    every call, so the vocabulary and weights depend only on this pair and not
    on any document seen in an earlier call.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        Entry ``[0, 1]`` of the 2x2 matrix of row dot products, i.e. the dot
        product of the two TF-IDF row vectors. With the vectorizer's default
        L2 row normalisation this is the cosine similarity.
    """
    pair_tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is a matrix product for every SciPy sparse type (`*` is element-wise
    # on sparse arrays); `.toarray()` is the portable replacement for the
    # legacy `.A` shorthand.
    return (pair_tfidf @ pair_tfidf.T).toarray()[0, 1]

# Score each sentence against the next one, wrapping around so the last
# sentence is compared with the first.
example_sentences = [
    "I'd like an apple",
    'An apple a day keeps the doctor away',
    'Never compare an apple to an orange',
    'I prefer scikit-learn to Orange',
]
for first, second in zip(example_sentences, example_sentences[1:] + example_sentences[:1]):
    print(cosine_sim(first, second))

0.17077611319011649
0.17077611319011649
0.20199309249791833
0.0


[nltk_data] Downloading package punkt to /Users/kittisakp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  'stop_words.' % sorted(inconsistent))


In [6]:
# Spacy

import spacy

# Other model packages that can be swapped in here:
# nlp = spacy.load('en')
# nlp = spacy.load('en_core_web_md')
# nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en_vectors_web_lg')

# Document-level similarity on the example sentences (disabled):
# doc1 = nlp("I'd like an apple")
# doc2 = nlp('An apple a day keeps the doctor away')
# doc3 = nlp('Never compare an apple to an orange')
# doc4 = nlp('I prefer scikit-learn to Orange')
#
# print(doc1.similarity(doc2))
# print(doc2.similarity(doc3))
# print(doc3.similarity(doc4))
# print(doc4.similarity(doc1))

tokens = nlp('dog cat banana')

# Full pairwise comparison, including each token against itself.
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)

dog dog 1.0
dog cat 0.8016855
dog banana 0.2432765
cat dog 0.8016855
cat cat 1.0
cat banana 0.28154364
banana dog 0.2432765
banana cat 0.28154364
banana banana 1.0
