In [2]:
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK data used below: the Punkt model backs nltk.word_tokenize
# and the stopwords corpus backs the stop-word filter in preprocess().
# Recent NLTK releases (3.9+) load the Punkt model from the 'punkt_tab'
# package instead of the pickled 'punkt' one, so both are requested to keep
# tokenisation working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess(text, stemming=False):
    """Normalise *text* into a space-separated string of content words.

    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
    deleted, the result is tokenised with ``nltk.word_tokenize`` and English
    stop words are dropped.

    Args:
        text: The raw document string.
        stemming: When true, each remaining token is reduced with NLTK's
            ``PorterStemmer``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = text.lower().translate(strip_punctuation)

    tokens = nltk.word_tokenize(normalised)
    ignored = set(stopwords.words('english'))
    kept = filter(lambda token: token not in ignored, tokens)

    if not stemming:
        return ' '.join(kept)

    stem = PorterStemmer().stem
    return ' '.join(map(stem, kept))

def document_similarity(doc1, doc2, stemming=False):
    """Return the TF-IDF cosine similarity between two documents.

    Both documents are passed through ``preprocess`` and vectorised together
    by a ``TfidfVectorizer`` fitted on just this pair, so the term weights
    reflect only these two texts.

    Args:
        doc1: First document, as a string.
        doc2: Second document, as a string.
        stemming: Forwarded to ``preprocess``; when true, tokens are
            Porter-stemmed before vectorising.

    Returns:
        The cosine similarity as a ``float``. ``0.0`` is returned when
        either document contributes no terms, including the case where
        neither document has any usable term at all.
    """
    # Preprocess documents
    preprocessed1 = preprocess(doc1, stemming)
    preprocessed2 = preprocess(doc2, stemming)

    # Vectorize using TF-IDF
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([preprocessed1, preprocessed2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no
        # document yields a single term, e.g. both are empty or consist only
        # of stop words. Treat that like the zero-vector case below instead
        # of crashing.
        return 0.0

    # Convert the two sparse rows to dense 1-D arrays
    vec1 = tfidf_matrix[0].toarray().flatten()
    vec2 = tfidf_matrix[1].toarray().flatten()

    # Handle zero vectors: cosine distance is undefined for them
    if not vec1.any() or not vec2.any():
        return 0.0

    # scipy's cosine() is a distance, so similarity is its complement.
    # Cast to a plain float so both return paths yield the same type.
    return float(1 - cosine(vec1, vec2))

# Example usage: score two short sentences and report whether the score
# clears the 0.5 cut-off used here to call the documents "similar".
doc1 = "A quick brown fox jumps over a lazy dog."
doc2 = "The dog sat on the rug."
similarity = document_similarity(doc1, doc2, stemming=False)
print(f"Cosine Similarity: {similarity:.2f}")

print(
    "The documents are similar."
    if similarity > 0.5
    else "The documents are not similar."
)

Cosine Similarity: 0.14
The documents are not similar.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
