In [1]:
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK data packages that the cells below rely on. The recorded
# output shows each package being reported as "already up-to-date" when it
# is present locally, so re-running this cell is harmless.
#
# TfidfVectorizer is imported once in the imports cell at the top of the
# notebook; the duplicate import that used to sit here was redundant.
NLTK_RESOURCES = (
    'punkt',                       # models for sent_tokenize / word_tokenize
    'stopwords',                   # stop-word lists (the English list is used)
    'wordnet',                     # lexical database behind WordNetLemmatizer
    'averaged_perceptron_tagger',  # model behind nltk.pos_tag
    'punkt_tab',                   # presumably the punkt data format newer NLTK loads; confirm
)
# NOTE(review): NLTK >= 3.9 appears to look up 'averaged_perceptron_tagger_eng'
# for nltk.pos_tag; confirm against the installed version before relying on
# this list on a fresh machine.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/l00pz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/l00pz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/l00pz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/l00pz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /home/l00pz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Read the whole corpus into memory as a single UTF-8 string named `text`;
# every later cell starts from this variable.
# NOTE(review): the tokenization output recorded further down shows a stray
# `', '` between the two sentences, which suggests data.txt contains
# list-repr residue. Confirm and clean the source file.
with open('data.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [13]:
# Split the raw text into sentences. 'english' is the tokenizer's default
# language and is spelled out here only for readability.
sentences = sent_tokenize(text, language='english')
print("Sentence Tokenization:\n", sentences)

Sentence Tokenization:
 ['Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language.', "', 'In particular, it focuses on how to program computers to process and analyze large amounts of natural language data"]


In [14]:
# Split the raw (uncleaned) text into word and punctuation tokens. These
# tokens keep their original casing and are reused by the POS-tagging cell.
# 'english' is the default language, written out explicitly.
words = word_tokenize(text, language='english')
print("\nWord Tokenization:\n", words)




Word Tokenization:
 ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', '.', "'", ',', "'In", 'particular', ',', 'it', 'focuses', 'on', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data']


In [15]:
# Replace every character that is not an ASCII letter with a single space.
# Besides punctuation, this pattern also blanks digits, whitespace such as
# newlines, and any non-ASCII letters.
non_letter_pattern = re.compile('[^a-zA-Z]')
text_clean = non_letter_pattern.sub(' ', text)
print("\nAfter Removing Punctuation:\n", text_clean)


After Removing Punctuation:
 Natural language processing  NLP  is a subfield of linguistics  computer science  and artificial intelligence concerned with the interactions between computers and human language     In particular  it focuses on how to program computers to process and analyze large amounts of natural language data


In [16]:
# Build the English stop-word set once so membership tests are O(1).
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Tokenize the lower-cased, letter-only text.
lowercased_text = text_clean.lower()
tokens = word_tokenize(lowercased_text)

# Keep purely alphabetic tokens that are not stop words.
filtered_words = [
    token
    for token in tokens
    if token.isalpha() and token not in stop_words
]
print("\nFiltered Words:\n", filtered_words)




Filtered Words:
 ['natural', 'language', 'processing', 'nlp', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'language', 'particular', 'focuses', 'program', 'computers', 'process', 'analyze', 'large', 'amounts', 'natural', 'language', 'data']


In [17]:
# Reduce each filtered word to its Porter stem. Stems are truncated forms
# (e.g. 'natur', 'languag' in the recorded output), not dictionary words.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_words))
print("\nStemmed Words:\n", stemmed)




Stemmed Words:
 ['natur', 'languag', 'process', 'nlp', 'subfield', 'linguist', 'comput', 'scienc', 'artifici', 'intellig', 'concern', 'interact', 'comput', 'human', 'languag', 'particular', 'focus', 'program', 'comput', 'process', 'analyz', 'larg', 'amount', 'natur', 'languag', 'data']


In [18]:
# Map each filtered word to its WordNet lemma.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# it presumably falls back to its default POS (noun, per the NLTK docs).
# The recorded output is consistent with that: plurals are reduced
# ('computers' -> 'computer') while verb forms such as 'concerned' and
# 'processing' pass through unchanged. Confirm whether POS-aware
# lemmatization is wanted.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))
print("\nLemmatized Words:\n", lemmatized)




Lemmatized Words:
 ['natural', 'language', 'processing', 'nlp', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'computer', 'human', 'language', 'particular', 'focus', 'program', 'computer', 'process', 'analyze', 'large', 'amount', 'natural', 'language', 'data']


In [19]:
# Tag the original-case tokens from the word-tokenization cell (not the
# cleaned or filtered ones), so punctuation tokens are tagged too.
# tagset=None and lang='eng' are the function's defaults, written out
# explicitly.
pos_tags = nltk.pos_tag(words, tagset=None, lang='eng')
print("\nPOS Tags:\n", pos_tags)




POS Tags:
 [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('subfield', 'NN'), ('of', 'IN'), ('linguistics', 'NNS'), (',', ','), ('computer', 'NN'), ('science', 'NN'), (',', ','), ('and', 'CC'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('concerned', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('interactions', 'NNS'), ('between', 'IN'), ('computers', 'NNS'), ('and', 'CC'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ("'", "''"), (',', ','), ("'In", "''"), ('particular', 'JJ'), (',', ','), ('it', 'PRP'), ('focuses', 'VBZ'), ('on', 'IN'), ('how', 'WRB'), ('to', 'TO'), ('program', 'NN'), ('computers', 'NNS'), ('to', 'TO'), ('process', 'VB'), ('and', 'CC'), ('analyze', 'VB'), ('large', 'JJ'), ('amounts', 'NNS'), ('of', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS')]


In [20]:
# Vectorize the lemmatized words as ONE document.
# NOTE(review): with a single-document corpus every term has the same
# document frequency, so the IDF factor is constant and the scores reduce
# to L2-normalised term counts. The recorded output agrees: terms occurring
# three times score exactly 3x the terms occurring once. Fit on several
# documents if real IDF weighting is wanted.
tfidf_vectorizer = TfidfVectorizer()
single_document = ' '.join(lemmatized)
tfidf_matrix = tfidf_vectorizer.fit_transform([single_document])
feature_names = tfidf_vectorizer.get_feature_names_out()

print("\nTF-IDF Representation:\n")
document_scores = tfidf_matrix.toarray()[0]  # dense row for the only document
for column, word in enumerate(feature_names):
    score = document_scores[column]
    print(f"{word}: {score}")


TF-IDF Representation:

amount: 0.15811388300841897
analyze: 0.15811388300841897
artificial: 0.15811388300841897
computer: 0.4743416490252569
concerned: 0.15811388300841897
data: 0.15811388300841897
focus: 0.15811388300841897
human: 0.15811388300841897
intelligence: 0.15811388300841897
interaction: 0.15811388300841897
language: 0.4743416490252569
large: 0.15811388300841897
linguistics: 0.15811388300841897
natural: 0.31622776601683794
nlp: 0.15811388300841897
particular: 0.15811388300841897
process: 0.15811388300841897
processing: 0.15811388300841897
program: 0.15811388300841897
science: 0.15811388300841897
subfield: 0.15811388300841897
