In [1]:
#Loading NLTK
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/schibsted/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/schibsted/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/schibsted/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/schibsted/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Read text from file
def read_text(filename):
    with open(filename, 'r') as f:
        text = f.read().replace('\n', ' ')
    return text

def tokenize(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [w for w in tokens if w.isalpha()]

    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens

In [3]:
t1, t2, t3 = map(read_text, ['text.txt', 'text2.txt', 'text3.txt'])
t1_tokens, t2_tokens, t3_tokens = map(tokenize, [t1, t2, t3])

In [4]:
# Document-term matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([t1, t2, t3])
print(X.toarray())

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [4 0 1 ... 1 2 1]]


In [5]:
# TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([t1, t2, t3])
print(X.toarray())

[[0.         0.03148211 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.10723106 0.         0.02680777 ... 0.02680777 0.05361553 0.02680777]]


In [6]:
# Cosine similarity
print(cosine_similarity(X))

[[1.         0.3946272  0.49580717]
 [0.3946272  1.         0.55454709]
 [0.49580717 0.55454709 1.        ]]
