In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re



In [2]:
# Fetch the NLTK resources the later cells rely on:
#   - 'stopwords' : word lists read by stopwords.words('english')
#   - 'punkt_tab' : Punkt tokenizer tables that word_tokenize loads on
#                   recent NLTK releases (3.9 and later); without it the
#                   tokenization cell raises LookupError on a fresh install
#   - 'punkt'     : legacy pickled Punkt model used by older NLTK releases
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap. 'punkt' stays last so the cell's displayed value is
# unchanged.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kolek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kolek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Sample passage about Information Retrieval used as input for the whole
# preprocessing pipeline. Written as adjacent string literals (one sentence
# per line) that Python concatenates into a single string; the leading space
# before "Information" is part of the data and is kept on purpose.
text = (
    " Information Retrieval (IR) is the process of obtaining relevant information from a large repository based on a user’s query. "
    "The primary goal of IR is to help users find information that meets their needs by ranking documents according to relevance. "
    "Document processing in IR involves tasks like tokenization, stop word removal, and stemming to improve retrieval. "
    "Information Retrieval indexing structures, such as inverted indexes, allow rapid access to document lists based on keywords. "
    "User queries in IR are transformed into a vector format to facilitate comparison with document vectors. "
    "Retrieval and ranking in IR rely on methods like cosine similarity, TF-IDF, and probabilistic models."
)

In [4]:
# Echo the raw passage so the untouched input is visible before any
# preprocessing step is applied.
text

' Information Retrieval (IR) is the process of obtaining relevant information from a large repository based on a user’s query. The primary goal of IR is to help users find information that meets their needs by ranking documents according to relevance. Document processing in IR involves tasks like tokenization, stop word removal, and stemming to improve retrieval. Information Retrieval indexing structures, such as inverted indexes, allow rapid access to document lists based on keywords. User queries in IR are transformed into a vector format to facilitate comparison with document vectors. Retrieval and ranking in IR rely on methods like cosine similarity, TF-IDF, and probabilistic models.'

In [5]:
# Step 1 - case folding: lowercase the whole passage so the a-z-only
# character filter and the lowercase stopword list used further down treat
# "Information" and "information" alike.
text_lower = str.lower(text)
text_lower

' information retrieval (ir) is the process of obtaining relevant information from a large repository based on a user’s query. the primary goal of ir is to help users find information that meets their needs by ranking documents according to relevance. document processing in ir involves tasks like tokenization, stop word removal, and stemming to improve retrieval. information retrieval indexing structures, such as inverted indexes, allow rapid access to document lists based on keywords. user queries in ir are transformed into a vector format to facilitate comparison with document vectors. retrieval and ranking in ir rely on methods like cosine similarity, tf-idf, and probabilistic models.'

In [6]:
# Step 2 - strip special characters: delete every character that is not a
# lowercase ASCII letter or whitespace. Characters are removed rather than
# replaced by a space, so "tf-idf" collapses to "tfidf" and "user’s" to
# "users".
text_no_specials = re.sub(
    pattern=r'[^a-z\s]',
    repl='',
    string=text_lower,
)
text_no_specials

' information retrieval ir is the process of obtaining relevant information from a large repository based on a users query the primary goal of ir is to help users find information that meets their needs by ranking documents according to relevance document processing in ir involves tasks like tokenization stop word removal and stemming to improve retrieval information retrieval indexing structures such as inverted indexes allow rapid access to document lists based on keywords user queries in ir are transformed into a vector format to facilitate comparison with document vectors retrieval and ranking in ir rely on methods like cosine similarity tfidf and probabilistic models'

In [7]:
# Step 3 - tokenization: split the cleaned passage into a list of word
# tokens. 'english' is word_tokenize's default language, spelled out here so
# the choice is visible to the reader.
words = word_tokenize(text_no_specials, language='english')
words

['information',
 'retrieval',
 'ir',
 'is',
 'the',
 'process',
 'of',
 'obtaining',
 'relevant',
 'information',
 'from',
 'a',
 'large',
 'repository',
 'based',
 'on',
 'a',
 'users',
 'query',
 'the',
 'primary',
 'goal',
 'of',
 'ir',
 'is',
 'to',
 'help',
 'users',
 'find',
 'information',
 'that',
 'meets',
 'their',
 'needs',
 'by',
 'ranking',
 'documents',
 'according',
 'to',
 'relevance',
 'document',
 'processing',
 'in',
 'ir',
 'involves',
 'tasks',
 'like',
 'tokenization',
 'stop',
 'word',
 'removal',
 'and',
 'stemming',
 'to',
 'improve',
 'retrieval',
 'information',
 'retrieval',
 'indexing',
 'structures',
 'such',
 'as',
 'inverted',
 'indexes',
 'allow',
 'rapid',
 'access',
 'to',
 'document',
 'lists',
 'based',
 'on',
 'keywords',
 'user',
 'queries',
 'in',
 'ir',
 'are',
 'transformed',
 'into',
 'a',
 'vector',
 'format',
 'to',
 'facilitate',
 'comparison',
 'with',
 'document',
 'vectors',
 'retrieval',
 'and',
 'ranking',
 'in',
 'ir',
 'rely',
 'on',

In [8]:
# Step 4 - stop word removal: keep only the tokens that are absent from
# NLTK's English stopword list, preserving their original order. The list is
# turned into a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))
no_stopwords = list(filter(lambda token: token not in stop_words, words))
no_stopwords

['information',
 'retrieval',
 'ir',
 'process',
 'obtaining',
 'relevant',
 'information',
 'large',
 'repository',
 'based',
 'users',
 'query',
 'primary',
 'goal',
 'ir',
 'help',
 'users',
 'find',
 'information',
 'meets',
 'needs',
 'ranking',
 'documents',
 'according',
 'relevance',
 'document',
 'processing',
 'ir',
 'involves',
 'tasks',
 'like',
 'tokenization',
 'stop',
 'word',
 'removal',
 'stemming',
 'improve',
 'retrieval',
 'information',
 'retrieval',
 'indexing',
 'structures',
 'inverted',
 'indexes',
 'allow',
 'rapid',
 'access',
 'document',
 'lists',
 'based',
 'keywords',
 'user',
 'queries',
 'ir',
 'transformed',
 'vector',
 'format',
 'facilitate',
 'comparison',
 'document',
 'vectors',
 'retrieval',
 'ranking',
 'ir',
 'rely',
 'methods',
 'like',
 'cosine',
 'similarity',
 'tfidf',
 'probabilistic',
 'models']

In [9]:
# Step 5 - stemming: reduce every remaining token to its Porter stem
# (e.g. "retrieval" -> "retriev"), one output stem per input token, in the
# same order.
stemmer = PorterStemmer()
words_stemmed = list(map(stemmer.stem, no_stopwords))
words_stemmed

['inform',
 'retriev',
 'ir',
 'process',
 'obtain',
 'relev',
 'inform',
 'larg',
 'repositori',
 'base',
 'user',
 'queri',
 'primari',
 'goal',
 'ir',
 'help',
 'user',
 'find',
 'inform',
 'meet',
 'need',
 'rank',
 'document',
 'accord',
 'relev',
 'document',
 'process',
 'ir',
 'involv',
 'task',
 'like',
 'token',
 'stop',
 'word',
 'remov',
 'stem',
 'improv',
 'retriev',
 'inform',
 'retriev',
 'index',
 'structur',
 'invert',
 'index',
 'allow',
 'rapid',
 'access',
 'document',
 'list',
 'base',
 'keyword',
 'user',
 'queri',
 'ir',
 'transform',
 'vector',
 'format',
 'facilit',
 'comparison',
 'document',
 'vector',
 'retriev',
 'rank',
 'ir',
 'reli',
 'method',
 'like',
 'cosin',
 'similar',
 'tfidf',
 'probabilist',
 'model']