# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [101]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [102]:
# Defining documents (=sentenses)
d1 = 'i loved the smell of Flowers in the Room.'
d2 = 'i loved the smell of Rain in my Chamber.'
d3 = 'i always despised the taste of ginger Lily Candles in that House.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'i loved the smell of Flowers in the Room. i loved the smell of Rain in my Chamber. i always despised the taste of ginger Lily Candles in that House.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [103]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'i loved the smell of flowers in the room. i loved the smell of rain in my chamber. i always despised the taste of ginger lily candles in that house.'

hier werden die Grossbuchstaben in Kleinbuchstaben umgewandelt


### Removing punctuation

In [104]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'i loved the smell of flowers in the room i loved the smell of rain in my chamber i always despised the taste of ginger lily candles in that house'

komma und Punkte und Zeichen werden entfernt

### Tokenize text & removal of stopwords

In [105]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'about', 'out', 'a', 'ours', 'their', 'having', 'what', 'some', 'yours', 'mustn', 'just', 'can', 'who', 'against', 'on', 'all', 'that', 'by', "don't", 'yourselves', 'at', 'whom', 'm', "mightn't", "needn't", 'aren', 'hasn', 'was', 'ma', 'd', "weren't", 'did', 'before', 'both', 'weren', 'isn', 'with', 'more', 'than', 'o', "shan't", 'how', 'me', 'doesn', 'from', 'i', 'to', "mustn't", "won't", 'him', 'will', "shouldn't", "isn't", 're', 'should', 'any', 'her', "hasn't", 'if', 'now', 'most', 'an', 'of', 'don', 'himself', 'once', 'until', "you'll", 'the', 'y', 'were', 'my', 'theirs', 'too', 'why', 'needn', "you'd", 'there', 'very', 'own', 'been', 'shouldn', 'ourselves', 'again', 'herself', 'his', 'mightn', 'being', 'only', 'they', 'didn', 'down', 'our', "hadn't", 'wouldn', "you're", 'those', 'you', 'same', 'it', 'and', 'while', 'yourself', 'but', 'here', 'had', 'am', 'its', 'we', 'such', "wasn't", 't', 'between', 'no', 'won', 'not', 'above', 'or', 'are', 'which', '

In [106]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['loved', 'smell', 'flowers', 'room', 'loved', 'smell', 'rain', 'chamber', 'always', 'despised', 'taste', 'ginger', 'lily', 'candles', 'house']

die stopwords werden entfernt -> hier i, the, of

### Lemmatization

In [107]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['loved', 'smell', 'flowers', 'room', 'loved', 'smell', 'rain', 'chamber', 'always', 'despised', 'taste', 'ginger', 'lily', 'candles', 'house'] 

After lemmatization:
['love', 'smell', 'flower', 'room', 'love', 'smell', 'rain', 'chamber', 'always', 'despise', 'taste', 'ginger', 'lily', 'candle', 'house']

alle unnötigen Enden werden entfernt. ed, 

## Redefine the text corpus (pre-processed)

In [108]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['love smell flower room', 
          'love smell rain chamber', 
          'despise taste ginger lily candle house']

jetzt haben wir die sätze nach der lemmatization eingefügt

## Document-term matrix with ngram_range=(1,1)

In [109]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   candle  chamber  despise  flower  ginger  house  lily  love  rain  room  \
0       0        0        0       1       0      0     0     1     0     1   
1       0        1        0       0       0      0     0     1     1     0   
2       1        0        1       0       1      1     1     0     0     0   

   smell  taste  
0      1      0  
1      1      0  
2      0      1  


## erklärung

Der CountVectorizer zählt, wie oft jedes Wort in jedem Satz vorkommt. Danach entsteht eine Tabelle (Dokument-Term-Matrix)

Zeilen: Jeder Satz in deiner Sammlung.
Spalten: Jedes einzelne Wort, das in den Sätzen vorkommt.
Werte: Wie oft das Wort in dem jeweiligen Satz vorkommt.

Dies hilf die Sätze in numerischen Darstellung zu erstellen. diese kann man dann für weitere Analysen gebrauchen. 


### ngram_range=(1,1)
N-Gramme: N-Gramme sind Kombinationen von "n" Wörtern, die zusammen auftreten. Zum Beispiel:
Unigramme (n=1): Einzelne Wörter (z. B. "Liebe", "Blume")
Bigramme (n=2): Kombinationen von 2 Wörtern (z. B. "Liebe Blume")
Trigramme (n=3): Kombinationen von 3 Wörtern (z. B. "Liebe die Blume")

Im Fall von ngram_range=(1,1) bedeutet es, dass wir nur einzelne Wörter betrachten also Unigramme.


## Document-term matrix with ngram_range=(2,2)

In [110]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   candle house  despise taste  flower room  ginger lily  lily candle  \
0             0              0            1            0            0   
1             0              0            0            0            0   
2             1              1            0            1            1   

   love smell  rain chamber  smell flower  smell rain  taste ginger  
0           1             0             1           0             0  
1           1             1             0           1             0  
2           0             0             0           0             1  


## erklärung
jetzt haben wir ngram_range=(2,2)
wir suchen / finden nun Pärchen. Die Spalten repräsentieren nun nicht mehr einzelne Wörter sondern Paare von Wörter, die in den Sätzen vorkommen.

Zeilen: Repräsentieren die einzelnen Sätze.
Spalten: Repräsentieren die Bigramme (Pärchen von Wörtern) aus den Sätzen.
Werte: Zeigen, wie oft jedes Bigramm in jedem Satz vorkommt.

also Paare wurden gebildet und in die Matrix wird mit 1 oder 0 angegeben, ob diese Paare in den 3 Sätzen vorkommen. 

Gibt detailliertere Analyse der Kombinationen von Wörtern, die gemeinsam auftreten, hilft um die Bedeutung der Wörter zu ermitteln.

## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [111]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 12 

The words in the corpus: 
 {'flower', 'lily', 'ginger', 'chamber', 'smell', 'rain', 'love', 'house', 'room', 'despise', 'taste', 'candle'}

Term Frequency (TF):
   flower    lily  ginger  chamber  smell  rain  love   house  room  despise  \
0    0.25  0.0000  0.0000     0.00   0.25  0.00  0.25  0.0000  0.25   0.0000   
1    0.00  0.0000  0.0000     0.25   0.25  0.25  0.25  0.0000  0.00   0.0000   
2    0.00  0.1667  0.1667     0.00   0.00  0.00  0.00  0.1667  0.00   0.1667   

    taste  candle  
0  0.0000  0.0000  
1  0.0000  0.0000  
2  0.1667  0.1667  


misst jetzt wie jäufig ein Wort in den einzelnen Sätzen vorkommt. 

### Inverse Document Frequency (IDF)

In [112]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
         flower:     0.4771
           lily:     0.4771
         ginger:     0.4771
        chamber:     0.4771
          smell:     0.1761
           rain:     0.4771
           love:     0.1761
          house:     0.4771
           room:     0.4771
        despise:     0.4771
          taste:     0.4771
         candle:     0.4771


misst jetzt wie wichtig ein Wort in der gesamten MAtrix ist


Ein Wort, das in vielen Dokumenten vorkommt, hat eine geringe Inverse Document Frequency, weil es weniger informativ ist. Ein Wort, das nur in wenigen Dokumenten vorkommt, hat eine hohe IDF, weil es spezifisch und damit informativer ist.

hier zum beispiel: smell -> ist tiefer, weil es öfter vorkommt.

### Term Frequency - Inverse Document Frequency (TF-IDF)

In [113]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   flower    lily  ginger  chamber  smell    rain   love   house    room  \
0  0.1193  0.0000  0.0000   0.0000  0.044  0.0000  0.044  0.0000  0.1193   
1  0.0000  0.0000  0.0000   0.1193  0.044  0.1193  0.044  0.0000  0.0000   
2  0.0000  0.0795  0.0795   0.0000  0.000  0.0000  0.000  0.0795  0.0000   

   despise   taste  candle  
0   0.0000  0.0000  0.0000  
1   0.0000  0.0000  0.0000  
2   0.0795  0.0795  0.0795  


ist nun der Wert, der die Bedeutung eines Worts im Kontext der Sätze und der ganzen MAtrix bewertet. Ein Wort, das in allen Sätzen vorkommt, hat keinen Informationswert, weil es nicht hilft, zwischen den Sätzen zu unterscheiden.

TF-IDF kombiniert die beiden vorhin erhaltenen Matrizen, um zu bewerten, wie bedeutend ein Wort ist im Satz im vergleich zu den anderen Sätzen.

jetzt merkt man, dass so individuelle wörter wie chamber, flower oder room am höchsten gewertet werden. hingegen smell und love viel tiefer, da es in fast allen Sätzen vorkommt.

## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [114]:
text = '''Regulators imposed a $4.8 billion penalty on Apple Tuesday for violating antitrust laws in its App Store policies.'''


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Regulators', 'NNS', 'O'),
 ('imposed', 'VBD', 'O'),
 ('a', 'DT', 'O'),
 ('$', '$', 'O'),
 ('4.8', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('penalty', 'NN', 'B-NP'),
 ('on', 'IN', 'O'),
 ('Apple', 'NNP', 'O'),
 ('Tuesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('violating', 'VBG', 'O'),
 ('antitrust', 'JJ', 'O'),
 ('laws', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('its', 'PRP$', 'O'),
 ('App', 'NNP', 'O'),
 ('Store', 'NNP', 'O'),
 ('policies', 'NNS', 'O'),
 ('.', '.', 'O')]


- NNS: sind Plural Nouns -> Regulators, laws, policies. Nomen in der Mehrzahl
- VBD: Past tense verbs -> imposed. vergangenheit Verb
- PRP$: Possessive Pronomen -> its... , gehört zu App store policies
- CD: Cardinal Number -> einfach Zahlen, Numerische Werte! auch ausgeschrieben -> two
- NN: Singular Nomen. -> dog, apple fe.
- NNP: Proper Noun  Refers to specific names of people, places, organizations, or titles. Proper nouns are always capitalized, like "Apple" and "Tuesday."

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [115]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-26 09:13:33
Python Version: 3.11.10
-----------------------------------
