In [1]:
# Sample paragraph used as input for every preprocessing step below.
# Written as adjacent literals with explicit newlines; the resulting string
# has four lines and no trailing newline.
text = (
    "Natural Language Processing (NLP) is a subfield of linguistics, computer science, and artificial\n"
    "intelligence concerned with the interactions between computers and human language. It's used to\n"
    "analyze text, allowing machines to understand, interpret, and manipulate human language. NLP has\n"
    "many real-world applications, including machine translation, sentiment analysis, and chatbots."
)



In [2]:
# 1. Tokenization:
# Split on runs of whitespace (spaces and newlines alike). Punctuation stays
# attached to the neighbouring word at this stage, e.g. 'linguistics,'.
tokens = text.split(sep=None)
tokens

['Natural',
 'Language',
 'Processing',
 '(NLP)',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics,',
 'computer',
 'science,',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language.',
 "It's",
 'used',
 'to',
 'analyze',
 'text,',
 'allowing',
 'machines',
 'to',
 'understand,',
 'interpret,',
 'and',
 'manipulate',
 'human',
 'language.',
 'NLP',
 'has',
 'many',
 'real-world',
 'applications,',
 'including',
 'machine',
 'translation,',
 'sentiment',
 'analysis,',
 'and',
 'chatbots.']

In [3]:
import string

# 2. Lowercasing:
# 3. Punctuation Removal:
# Lowercase each token, then trim punctuation from both ends. ``str.strip``
# only touches the edges, so inner marks ("it's", "real-world") are left in place.
# A token made up solely of punctuation (e.g. a free-standing "-" or "...")
# collapses to "" after stripping; such empty strings are dropped here instead
# of being carried through the rest of the pipeline as bogus tokens.
tokens_clean = [
    cleaned
    for cleaned in (word.lower().strip(string.punctuation) for word in tokens)
    if cleaned
]
tokens_clean

['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics',
 'computer',
 'science',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language',
 "it's",
 'used',
 'to',
 'analyze',
 'text',
 'allowing',
 'machines',
 'to',
 'understand',
 'interpret',
 'and',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'real-world',
 'applications',
 'including',
 'machine',
 'translation',
 'sentiment',
 'analysis',
 'and',
 'chatbots']

In [4]:
# 4. Stop Word Removal:
# Small hand-picked list of high-frequency function words to discard.
stop_words = [
    "the", "a", "an",
    "in", "on", "at", "for", "to", "of",
    "and",
    "is", "are",
]

# Keep every cleaned token that is not one of the stop words (order preserved).
tokens_stop_words_removed = list(
    filter(lambda token: token not in stop_words, tokens_clean)
)
tokens_stop_words_removed

['natural',
 'language',
 'processing',
 'nlp',
 'subfield',
 'linguistics',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'interactions',
 'between',
 'computers',
 'human',
 'language',
 "it's",
 'used',
 'analyze',
 'text',
 'allowing',
 'machines',
 'understand',
 'interpret',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'real-world',
 'applications',
 'including',
 'machine',
 'translation',
 'sentiment',
 'analysis',
 'chatbots']

In [5]:
# 5. Stemming:
def stem(word):
    """Return ``word`` with one common English suffix chopped off.

    The endings 'ing', 'ed', 'es' and 's' are tried in that order; the first
    one that ``word`` ends with, and whose removal leaves more than two
    characters, is removed. If none qualifies, ``word`` is returned unchanged.

    This is a deliberately crude rule-based stemmer: results need not be real
    words (e.g. 'machines' -> 'machin', 'analysis' -> 'analysi').
    """
    endings = ('ing', 'ed', 'es', 's')
    # Pick the first ending that matches and leaves a stem longer than two chars.
    chosen = next(
        (
            ending
            for ending in endings
            if word.endswith(ending) and len(word) - len(ending) > 2
        ),
        None,
    )
    if chosen is None:
        return word
    return word[:len(word) - len(chosen)]

# Run the suffix stripper over every token that survived stop-word removal.
tokens_stemmed = list(map(stem, tokens_stop_words_removed))
tokens_stemmed

['natural',
 'language',
 'process',
 'nlp',
 'subfield',
 'linguistic',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concern',
 'with',
 'interaction',
 'between',
 'computer',
 'human',
 'language',
 "it'",
 'used',
 'analyze',
 'text',
 'allow',
 'machin',
 'understand',
 'interpret',
 'manipulate',
 'human',
 'language',
 'nlp',
 'has',
 'many',
 'real-world',
 'application',
 'includ',
 'machine',
 'translation',
 'sentiment',
 'analysi',
 'chatbot']

In [6]:
# Bonus: Lemmatization:

# Lookup table from irregular/inflected forms to their base form, built from
# (lemma, forms) groups so related forms sit together.
lemmatization_dict = {
    form: lemma
    for lemma, forms in (
        ("be", ("is", "are", "was", "were")),
        ("have", ("has", "had")),
        ("use", ("used",)),
    )
    for form in forms
}


def simple_lemmatize(word):
    """Return the base form listed for ``word`` in ``lemmatization_dict``.

    Words that have no entry in the table are returned unchanged.
    """
    if word in lemmatization_dict:
        return lemmatization_dict[word]
    return word
# Map each stemmed token through the lemma lookup (unknown tokens pass through).
tokens_lemmatized = list(map(simple_lemmatize, tokens_stemmed))
tokens_lemmatized

['natural',
 'language',
 'process',
 'nlp',
 'subfield',
 'linguistic',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concern',
 'with',
 'interaction',
 'between',
 'computer',
 'human',
 'language',
 "it'",
 'use',
 'analyze',
 'text',
 'allow',
 'machin',
 'understand',
 'interpret',
 'manipulate',
 'human',
 'language',
 'nlp',
 'have',
 'many',
 'real-world',
 'application',
 'includ',
 'machine',
 'translation',
 'sentiment',
 'analysi',
 'chatbot']