In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

In [2]:
# Toy corpus: seven short sentences that share a handful of content words
# (sky, sun, bright, stars), so some pairs overlap and others do not.
doc = [
    "The sky is blue.",
    "The sun is bright today.",
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.",
    "The moon is full, the sky full of stars.",
    "The sky was dark, the stars plentiful and bright.",
    "The sun is but a morning star.",
]

In [3]:
# Set up the TF-IDF vectorizer; stop_words='english' selects scikit-learn's
# built-in English stop-word list so common words are dropped from the
# vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
)


In [4]:
# Learn the vocabulary and IDF weights from the corpus and, in the same call,
# encode every document as a row of TF-IDF weights.
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=doc)

In [5]:

# Convert the TF-IDF matrix to a dense array so it can be wrapped in a
# pandas DataFrame and printed in a later cell.
tfidf_matrix_dense = tfidf_matrix.toarray()

In [6]:
# Calculate the pairwise cosine similarity between all documents.
# cosine_similarity accepts scipy sparse input and returns a dense ndarray by
# default (dense_output=True), so the sparse TF-IDF matrix is passed directly
# rather than the densified copy. Entry [i, j] is the similarity between
# document i and document j.
cosine_sim = cosine_similarity(tfidf_matrix)

In [7]:
# Show the TF-IDF weights as a table: one row per document, one column per
# vocabulary term (column labels come from the fitted vectorizer).
print(
    pd.DataFrame(
        data=tfidf_matrix_dense,
        columns=tfidf_vectorizer.get_feature_names_out(),
    )
)

       blue    bright      dark      moon   morning  plentiful   shining  \
0  0.851417  0.000000  0.000000  0.000000  0.000000   0.000000  0.000000   
1  0.000000  0.464479  0.000000  0.000000  0.000000   0.000000  0.000000   
2  0.000000  0.577350  0.000000  0.000000  0.000000   0.000000  0.000000   
3  0.000000  0.361901  0.000000  0.000000  0.000000   0.000000  0.587484   
4  0.000000  0.000000  0.000000  0.695296  0.000000   0.000000  0.000000   
5  0.000000  0.331750  0.538538  0.000000  0.000000   0.538538  0.000000   
6  0.000000  0.000000  0.000000  0.000000  0.648275   0.000000  0.000000   

        sky      star     stars       sun     today  
0  0.524489  0.000000  0.000000  0.000000  0.000000  
1  0.000000  0.000000  0.000000  0.464479  0.754001  
2  0.577350  0.000000  0.000000  0.577350  0.000000  
3  0.000000  0.000000  0.000000  0.723803  0.000000  
4  0.428316  0.000000  0.577156  0.000000  0.000000  
5  0.331750  0.000000  0.447033  0.000000  0.000000  
6  0.000000  

In [8]:
# Show the document-by-document similarity scores, labelling both the rows
# and the columns with the original sentences.
print(
    pd.DataFrame(
        data=cosine_sim,
        index=doc,
        columns=doc,
    )
)

                                                   The sky is blue.  \
The sky is blue.                                           1.000000   
The sun is bright today.                                   0.000000   
The sun in the sky is bright.                              0.302814   
We can see the shining sun, the bright sun.                0.000000   
The moon is full, the sky full of stars.                   0.224647   
The sky was dark, the stars plentiful and bright.          0.173999   
The sun is but a morning star.                             0.000000   

                                                   The sun is bright today.  \
The sky is blue.                                                   0.000000   
The sun is bright today.                                           1.000000   
The sun in the sky is bright.                                      0.536335   
We can see the shining sun, the bright sun.                        0.504287   
The moon is full, the sky full of st