# Introduction to preprocessing for text

In [12]:
# Third-party imports: NLTK supplies frequency counting, the stop-word list
# and the Porter stemmer; torchtext supplies the tokenizer factory.
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from torchtext.data.utils import get_tokenizer

# Fetch the stop-word corpus so that stopwords.words(...) can be read later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jorge/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Word frequency analysis

In [13]:
text = "In the city of Dataville, a data analyst named Alex explores hidden insights within vast data. With determination, Alex uncovers patterns, cleanses the data, and unlocks innovation. Join this adventure to unleash the power of data-driven decisions."

# Build the "basic_english" tokenizer and split the passage into tokens
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text)

# Count how many times each token occurs in the passage
freq_dist = FreqDist(tokens)

# Keep every occurrence of a token whose count exceeds `threshold`;
# original order and repeated tokens are preserved.
threshold = 1
common_tokens = list(filter(lambda tok: freq_dist[tok] > threshold, tokens))
print(common_tokens)

['the', 'of', ',', 'data', 'alex', 'data', '.', ',', 'alex', ',', 'the', 'data', ',', '.', 'the', 'of', '.']


## Preprocessing text

In [14]:
# Tokenize the passage again so this cell starts from the unfiltered tokens
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text)

# Discard English stop words; each token is lowercased before the lookup
stop_words = set(stopwords.words("english"))
filtered_tokens = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens)
)

# Reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)

['citi', 'datavil', ',', 'data', 'analyst', 'name', 'alex', 'explor', 'hidden', 'insight', 'within', 'vast', 'data', '.', 'determin', ',', 'alex', 'uncov', 'pattern', ',', 'cleans', 'data', ',', 'unlock', 'innov', '.', 'join', 'adventur', 'unleash', 'power', 'data-driven', 'decis', '.']
