# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [2]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Download the NLTK resources used in this notebook (only needed once)
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4',
                 'averaged_perceptron_tagger'):
    nltk.download(resource)

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
# NOTE(review): the filter is installed without a category, so every warning
# raised by later cells is hidden, not only a specific known one.
import warnings
warnings.filterwarnings('ignore')

# Current working directory (printed for reference)
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [3]:
# Three short documents about the solar system
d1 = 'The Sun is the center of our solar system and provides light and heat to the planets.'
d2 = 'There are nine planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.'
d3 = 'Earth is the only planet known to support life, with its diverse ecosystems and atmosphere.'

# Combine the documents into one text, separated by single spaces
corpus_01 = ' '.join((d1, d2, d3))
corpus_01

'The Sun is the center of our solar system and provides light and heat to the planets. There are nine planets in the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto. Earth is the only planet known to support life, with its diverse ecosystems and atmosphere.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuation
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [4]:
# Text to lowercase function
def text_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Apply the lowercase step to the combined text and display the result
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the sun is the center of our solar system and provides light and heat to the planets. there are nine planets in the solar system: mercury, venus, earth, mars, jupiter, saturn, uranus, neptune and pluto. earth is the only planet known to support life, with its diverse ecosystems and atmosphere.'

### Removing punctuation

In [5]:
# Remove punctuation function
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to None deletes it
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Strip the punctuation from the lowercased text and display the result
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the sun is the center of our solar system and provides light and heat to the planets there are nine planets in the solar system mercury venus earth mars jupiter saturn uranus neptune and pluto earth is the only planet known to support life with its diverse ecosystems and atmosphere'

### Tokenize text & removal of stopwords

In [6]:
# Show english stopwords (as a set, for display only)
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:", eng_stopwords, sep="\n")

List of english stopwords:
{"you'd", 'again', 'any', 'when', 'shouldn', 't', 'the', "that'll", 'yours', 'should', 'what', "you're", 'into', 'hers', 'theirs', 'here', 'won', 'whom', 'does', 'each', 'an', 'for', 'once', 'were', 'you', 'ours', 'to', 'such', 'mustn', 'will', 'now', 'by', 'do', 'most', "didn't", 'up', 'haven', 'while', 'out', 'or', 'with', 'o', 'same', "won't", 'during', "hadn't", 'him', 'further', 'he', 'are', 'don', 'mightn', 'it', "aren't", 'himself', "doesn't", 'll', 'above', 'about', "weren't", 'who', 'nor', 'how', "isn't", "couldn't", 'from', 'be', 'those', "mustn't", 'weren', 'they', "it's", 'had', 'so', 'she', "she's", 'itself', 'after', 'her', "wasn't", 'of', 'because', 'doesn', 'other', 'than', 'but', 'doing', 'through', 'am', 'ain', 'i', 'myself', 're', 'their', 'your', 'm', 'some', 'hadn', 'our', 'my', 'been', 'until', 'on', 'and', 'off', 'no', 'having', 'isn', 'aren', 'where', 'these', 'wouldn', 'this', 'couldn', 'under', 'both', 's', 'needn', 'shan', "you've",

In [7]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stopwords.

    The result is a list of tokens in their original order.
    """
    english_stops = set(stopwords.words("english"))
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in english_stops:
            kept_tokens.append(token)
    return kept_tokens
 
# Tokenize the punctuation-free text and drop the stopwords
corpus_04 = remove_stopwords(corpus_03)
# end="" suppresses the trailing newline after the printed token list
print(corpus_04, end="")

['sun', 'center', 'solar', 'system', 'provides', 'light', 'heat', 'planets', 'nine', 'planets', 'solar', 'system', 'mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune', 'pluto', 'earth', 'planet', 'known', 'support', 'life', 'diverse', 'ecosystems', 'atmosphere']

### Lemmatization

In [8]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text, pos='v'):
    """Tokenize ``text`` and lemmatize every token with the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Text to tokenize and lemmatize.
    pos : str, default 'v'
        Part-of-speech hint forwarded to ``lemmatizer.lemmatize``. The default
        'v' (verb) keeps the previous behaviour: verb forms are reduced
        ('provides' -> 'provide') while plural nouns such as 'planets' stay
        unchanged. Pass 'n' to lemmatize the tokens as nouns instead.

    Returns
    -------
    list of str
        One lemma per token, in token order.
    """
    return [lemmatizer.lemmatize(token, pos=pos) for token in word_tokenize(text)]

# Lemmatize every token; each call returns a list of lemmas
lem = [lemmatize_word(token) for token in corpus_04]

# Nested list to list: join the lemmas of each token into one string
corpus_05 = [' '.join(map(str, lemma_list)) for lemma_list in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['sun', 'center', 'solar', 'system', 'provides', 'light', 'heat', 'planets', 'nine', 'planets', 'solar', 'system', 'mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune', 'pluto', 'earth', 'planet', 'known', 'support', 'life', 'diverse', 'ecosystems', 'atmosphere'] 

After lemmatization:
['sun', 'center', 'solar', 'system', 'provide', 'light', 'heat', 'planets', 'nine', 'planets', 'solar', 'system', 'mercury', 'venus', 'earth', 'mar', 'jupiter', 'saturn', 'uranus', 'neptune', 'pluto', 'earth', 'planet', 'know', 'support', 'life', 'diverse', 'ecosystems', 'atmosphere']

## Redefine the text corpus (pre-processed)

In [9]:
# Manually adjusted corpus with lemmatized words (one string per document)
corpus = ['sun center solar system provide light heat planet', 
          'nine planet solar system mercury venus earth mars jupiter saturn uranus neptune pluto', 
          'earth planet know support life diverse ecosystem atmosphere']


## Document-term matrix with ngram_range=(1,1)

In [10]:
# Vectorizer with ngram_range=(1,1), i.e. single words only
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=0.0)

# Fit the vectorizer on the corpus and transform it into a count matrix
count = vectorizer.fit_transform(corpus)

# Create dataframe with one column per feature name
df_count = pd.DataFrame(data=count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix', df_count, sep='\n')

Document-term matrix
   atmosphere  center  diverse  earth  ecosystem  heat  jupiter  know  life  \
0           0       1        0      0          0     1        0     0     0   
1           0       0        0      1          0     0        1     0     0   
2           1       0        1      1          1     0        0     1     1   

   light  ...  planet  pluto  provide  saturn  solar  sun  support  system  \
0      1  ...       1      0        1       0      1    1        0       1   
1      0  ...       1      1        0       1      1    0        0       1   
2      0  ...       1      0        0       0      0    0        1       0   

   uranus  venus  
0       0      0  
1       1      1  
2       0      0  

[3 rows x 24 columns]


## Document-term matrix with ngram_range=(2,2)

In [11]:
# Vectorizer with ngram_range=(2,2), i.e. pairs of adjacent words only
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=0.0)

# Fit the vectorizer on the corpus and transform it into a count matrix
count = vectorizer.fit_transform(corpus)

# Create dataframe with one column per feature name
df_count = pd.DataFrame(data=count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix', df_count, sep='\n')

Document-term matrix
   center solar  diverse ecosystem  earth mars  earth planet  \
0             1                  0           0             0   
1             0                  0           1             0   
2             0                  1           0             1   

   ecosystem atmosphere  heat planet  jupiter saturn  know support  \
0                     0            1               0             0   
1                     0            0               1             0   
2                     1            0               0             1   

   life diverse  light heat  ...  planet solar  provide light  saturn uranus  \
0             0           1  ...             0              1              0   
1             0           0  ...             1              0              1   
2             1           0  ...             0              0              0   

   solar system  sun center  support life  system mercury  system provide  \
0             1           1             0  

## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [12]:
# Compute Term Frequency (TF)
# Vocabulary: every distinct whitespace-separated token across all documents
words_set = set()
for doc in corpus:
    words = doc.split()
    words_set.update(words)

print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus
n_words_set = len(words_set)

# One row per document, one column per vocabulary word, initialised with 0.0
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)),
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split()
    for w in words:
        # Every occurrence contributes 1 / (number of words in the document).
        # .loc writes into df_tf itself; the chained form df_tf[w][i] = ...
        # assigns to an intermediate object and is not guaranteed to update
        # the dataframe.
        df_tf.loc[i, w] += 1 / len(words)

print(df_tf.round(4))

Number of words in the corpus: 24 

The words in the corpus: 
 {'life', 'heat', 'earth', 'ecosystem', 'atmosphere', 'venus', 'support', 'neptune', 'mercury', 'uranus', 'system', 'center', 'pluto', 'planet', 'nine', 'light', 'diverse', 'provide', 'solar', 'know', 'sun', 'saturn', 'mars', 'jupiter'}

Term Frequency (TF):
    life   heat   earth  ecosystem  atmosphere   venus  support  neptune  \
0  0.000  0.125  0.0000      0.000       0.000  0.0000    0.000   0.0000   
1  0.000  0.000  0.0769      0.000       0.000  0.0769    0.000   0.0769   
2  0.125  0.000  0.1250      0.125       0.125  0.0000    0.125   0.0000   

   mercury  uranus  ...    nine  light  diverse  provide   solar   know  \
0   0.0000  0.0000  ...  0.0000  0.125    0.000    0.125  0.1250  0.000   
1   0.0769  0.0769  ...  0.0769  0.000    0.000    0.000  0.0769  0.000   
2   0.0000  0.0000  ...  0.0000  0.000    0.125    0.000  0.0000  0.125   

     sun  saturn    mars  jupiter  
0  0.125  0.0000  0.0000   0.0000  
1

### Inverse Document Frequency (IDF)

In [13]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    # k = number of documents that contain this word as a token
    k = sum(w in corpus[j].split() for j in range(n_docs))

    # IDF = log10(number of documents / k), rounded to 4 decimals
    idf[w] = np.round(np.log10(n_docs / k), 4)

    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
           life:     0.4771
           heat:     0.4771
          earth:     0.1761
      ecosystem:     0.4771
     atmosphere:     0.4771
          venus:     0.4771
        support:     0.4771
        neptune:     0.4771
        mercury:     0.4771
         uranus:     0.4771
         system:     0.1761
         center:     0.4771
          pluto:     0.4771
         planet:        0.0
           nine:     0.4771
          light:     0.4771
        diverse:     0.4771
        provide:     0.4771
          solar:     0.1761
           know:     0.4771
            sun:     0.4771
         saturn:     0.4771
           mars:     0.4771
        jupiter:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [14]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    # Scale the whole TF column of word w by its IDF weight. Assigning the
    # column directly updates df_tf_idf itself; the chained form
    # df_tf_idf[w][i] = ... assigns to an intermediate object and is not
    # guaranteed to update the dataframe.
    df_tf_idf[w] = df_tf[w] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
     life    heat   earth  ecosystem  atmosphere   venus  support  neptune  \
0  0.0000  0.0596  0.0000     0.0000      0.0000  0.0000   0.0000   0.0000   
1  0.0000  0.0000  0.0135     0.0000      0.0000  0.0367   0.0000   0.0367   
2  0.0596  0.0000  0.0220     0.0596      0.0596  0.0000   0.0596   0.0000   

   mercury  uranus  ...    nine   light  diverse  provide   solar    know  \
0   0.0000  0.0000  ...  0.0000  0.0596   0.0000   0.0596  0.0220  0.0000   
1   0.0367  0.0367  ...  0.0367  0.0000   0.0000   0.0000  0.0135  0.0000   
2   0.0000  0.0000  ...  0.0000  0.0000   0.0596   0.0000  0.0000  0.0596   

      sun  saturn    mars  jupiter  
0  0.0596  0.0000  0.0000   0.0000  
1  0.0000  0.0367  0.0367   0.0367  
2  0.0000  0.0000  0.0000   0.0000  

[3 rows x 24 columns]


## Part-of-Speech (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [15]:
text = '''The Sun is the center of our solar system and provides 
light and heat to the planets. There are nine planets in the solar 
system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, 
Neptune and Pluto. Earth is the only planet known to support life, 
with its diverse ecosystems and atmosphere.'''

def preprocess(sent):
    """Tokenize ``sent`` and return the POS-tagged tokens."""
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# Tokenize and POS-tag the example text
sent = preprocess(text)
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT), followed by
# any number of adjectives (JJ) and exactly one singular noun (NN)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Build the chunk parser from the grammar and parse the tagged tokens
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

# Flatten the parse result into (word, POS-tag, IOB-chunk-tag) triples
iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('The', 'DT', 'O'),
 ('Sun', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('the', 'DT', 'B-NP'),
 ('center', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('our', 'PRP$', 'O'),
 ('solar', 'JJ', 'B-NP'),
 ('system', 'NN', 'I-NP'),
 ('and', 'CC', 'O'),
 ('provides', 'VBZ', 'O'),
 ('light', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('heat', 'NN', 'B-NP'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'O'),
 ('planets', 'NNS', 'O'),
 ('.', '.', 'O'),
 ('There', 'EX', 'O'),
 ('are', 'VBP', 'O'),
 ('nine', 'CD', 'O'),
 ('planets', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('solar', 'JJ', 'I-NP'),
 ('system', 'NN', 'I-NP'),
 (':', ':', 'O'),
 ('Mercury', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Venus', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Earth', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Mars', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Jupiter', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Saturn', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Uranus', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Neptune', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Pluto', 'NNP', 'O'),
 ('.',

### f) In the section ‘Part-of-Speech (POS) tagging’, choose your own text example and derive POS tags for this example. In the Jupyter notebook, briefly explain the meaning of at least 5 POS tags from the output of the POS tagging.

('The', 'DT', 'O') DT (Determiner): The word "The" is tagged as a determiner. 

('Sun', 'NNP', 'O') NNP (Proper Noun, Singular): "Sun" is tagged as a proper noun in singular form.

('is', 'VBZ', 'O') VBZ (Verb, 3rd Person Singular Present): The word "is" is tagged as a present-tense verb for the 3rd person singular.

('center', 'NN', 'I-NP') NN (Noun, Singular): "Center" is tagged as a singular noun, indicating a thing or place, in this case, the center of something.

('of', 'IN', 'O') IN (Preposition/Subordinating Conjunction): The word "of" is tagged as a preposition.

'O' (Outside): "of" is outside any noun-phrase chunk here; the chunk grammar used above only defines noun phrases (NP), not named entities.

'I-NP' (Inside Noun Phrase): This tag shows that "center" is inside a noun phrase and follows the determiner "the," continuing the noun phrase that started with "the."

'B-NP' (Beginning of Noun Phrase): This indicates the beginning of a noun phrase (NP), with "the" as the first word of this chunked phrase.

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [16]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: operating system, timestamp and Python version of this run
print('-----------------------------------')
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(f"Python Version: {python_version()}")
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-10-30 16:42:06
Python Version: 3.11.10
-----------------------------------
