In [5]:
import numpy as np
import re
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Shared NLP resources used by the cells below.
# NLTK's English stop-word list (requires the NLTK 'stopwords' corpus to be installed).
stopwords = nltk.corpus.stopwords.words('english')
ps = PorterStemmer()
tokenize = nltk.tokenize.word_tokenize

# token_pattern=None: the default regex is ignored whenever a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a UserWarning.
# NOTE(review): this vectorizer tokenizes with word_tokenize directly, so
# punctuation tokens become features and words are not stemmed; the
# preprocess_txt helper in this notebook is not used here -- confirm intended.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', token_pattern=None)

In [7]:
def preprocess_txt(corpus):
    """Normalize a document into a list of stemmed, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    corpus : str or iterable of str
        Raw text, which is split with the module-level ``tokenize``, or an
        already tokenized sequence of words (list, tuple, generator, ...).

    Returns
    -------
    list of str
        For every token, in input order: the token is lower-cased, dropped if
        it is in ``stopwords`` or is not purely alphabetic (punctuation,
        numbers, mixed tokens), and otherwise stemmed with ``ps``.
    """
    # Only raw strings need tokenizing; any other iterable is taken as tokens.
    # (The previous `type(corpus) != list` check sent tuples and other
    # sequences through the tokenizer.)
    tokens = tokenize(corpus) if isinstance(corpus, str) else corpus

    # Set membership is O(1); the module-level stop-word collection is a list.
    stop_set = set(stopwords)

    lowered = (word.lower() for word in tokens)
    return [
        ps.stem(word)
        for word in lowered
        if word not in stop_set and word.isalpha()
    ]

In [8]:
# Toy corpus: four short documents used to demonstrate preprocessing,
# TF-IDF weighting and cosine similarity.
docs = [
    "The quick brown fox jumped over the lazy dog!?!",
    "I love animals; cats, dogs, and the like.",
    "Fish are animals too.",
    "Four score and seven years ago our fathers brought forth on this continent...",
]
# Individual names kept so each document can still be referenced on its own.
doc1, doc2, doc3, doc4 = docs

In [9]:
# Run every document through the preprocessing helper and show the
# resulting token lists (one inner list per document).
pre = list(map(preprocess_txt, docs))
print(pre)

[['quick', 'brown', 'fox', 'jump', 'lazi', 'dog'], ['love', 'anim', 'cat', 'dog', 'like'], ['fish', 'anim'], ['four', 'score', 'seven', 'year', 'ago', 'father', 'brought', 'forth', 'contin']]


In [10]:
# Learn the vocabulary and IDF weights from the corpus and build the sparse
# TF-IDF matrix for it.
tfs = tfidf.fit_transform(docs)
# Densify only for display. `.toarray()` is the supported spelling; the `.A`
# shorthand is deprecated in recent SciPy releases.
print(tfs.toarray())

[[0.60302269 0.         0.         0.         0.         0.30151134
  0.         0.         0.         0.30151134 0.         0.
  0.30151134 0.         0.         0.         0.         0.30151134
  0.30151134 0.30151134 0.         0.         0.30151134 0.
  0.         0.        ]
 [0.         0.62490281 0.24634028 0.         0.31245141 0.
  0.         0.24634028 0.         0.         0.31245141 0.
  0.         0.31245141 0.         0.         0.         0.
  0.         0.         0.31245141 0.31245141 0.         0.
  0.         0.        ]
 [0.         0.         0.52640543 0.         0.         0.
  0.         0.52640543 0.         0.         0.         0.
  0.         0.         0.         0.66767854 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.33333333 0.         0.
  0.33333333 0.         0.33333333 0.         0.         0.33333333
  0.         0.         0.33333333 0.         0.33333333 0.
 

In [11]:
# Pairwise cosine similarity between every pair of documents, computed in a
# single call instead of once per row.
similarities = cosine_similarity(tfs)
n_docs = similarities.shape[0]
for i in range(n_docs):
    # Row i holds the similarity of document i to each document (itself included).
    print([round(score, 2) for score in similarities[i].tolist()])

[1.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.26, 0.0]
[0.0, 0.26, 1.0, 0.0]
[0.0, 0.0, 0.0, 1.0]
