## Normalization

In [66]:
import string
import re

def normalize_text(text):
    """Normalize text: strip punctuation, lowercase, and collapse whitespace.

    Every character listed in ``string.punctuation`` is removed, and so is any
    character whose Unicode general category is punctuation (``P*``). The
    second rule covers typographic marks such as „ ” – … that the ASCII-only
    ``string.punctuation`` table misses.

    Args:
        text: Input string.

    Returns:
        The lowercased string with punctuation removed, every run of
        whitespace collapsed to a single space, and no leading or trailing
        whitespace.
    """
    # Local import so the function does not depend on another cell's imports.
    import unicodedata

    # Remove punctuation (ASCII table first, then any Unicode punctuation).
    ascii_punctuation = set(string.punctuation)
    text = "".join(
        char
        for char in text
        if char not in ascii_punctuation
        and not unicodedata.category(char).startswith("P")
    )

    # Convert to lowercase.
    text = text.lower()

    # Collapse redundant whitespace and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Sample text containing punctuation and a doubled space
sample_text = "To jest przykład tekstu, który potrzebuje normalizacji!  Mamy tu nieco znaków interpunkcyjnych i niepotrzebnych spacji."

# Normalize it, then show the text before and after
normalized_text = normalize_text(sample_text)
for shown in (sample_text, normalized_text):
    print(f"{shown}")

To jest przykład tekstu, który potrzebuje normalizacji!  Mamy tu nieco znaków interpunkcyjnych i niepotrzebnych spacji.
to jest przykład tekstu który potrzebuje normalizacji mamy tu nieco znaków interpunkcyjnych i niepotrzebnych spacji


In [86]:
import nltk
import pandas as pd

from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, casual_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
# NLTK data packages used by the cells below. NLTK 3.9+ looks up 'punkt_tab'
# and 'averaged_perceptron_tagger_eng' instead of the older 'punkt' and
# 'averaged_perceptron_tagger' packages, so both generations are fetched to
# keep word_tokenize() and pos_tag() working regardless of the installed
# NLTK version.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
]

# A list (not a generator) so every download is attempted even if an earlier
# one fails; the cell displays True only when all of them succeeded.
all([nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES])

True

## Cleaning

In [69]:
def clean_text(text, stop_words=None):
    """Lowercase text and strip punctuation, digits, and stop words.

    The text is split on whitespace and each token is handled on its own.
    A token is first compared with the stop words while its inner apostrophe
    is still present, so contractions such as "don't" or "it's" are actually
    filtered out. It is then stripped of punctuation and digits and compared
    once more, so tokens that only become stop words after cleaning are
    dropped as well.

    Args:
        text: Input string.
        stop_words: Optional iterable of stop words. When omitted, NLTK's
            English stop word list is used.

    Returns:
        The remaining cleaned tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Lowercase so the comparison matches the lowercased tokens below.
    stop_words = {word.lower() for word in stop_words}

    punctuation_table = str.maketrans('', '', string.punctuation)

    kept_tokens = []
    for raw_token in text.lower().split():
        # Check before the apostrophe is stripped: afterwards "don't" would
        # turn into "dont" and no longer match its stop word entry.
        if raw_token.strip(string.punctuation) in stop_words:
            continue

        # Remove punctuation, then digits.
        token = re.sub(r'\d+', '', raw_token.translate(punctuation_table))

        # Skip tokens that were nothing but punctuation/digits, and tokens
        # that turn into a stop word only once cleaned.
        if token and token not in stop_words:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [70]:
# Sentence with contractions, quotes, and stop words to clean up
sample_text = "Here's an example: NLP involves various preprocessing techniques. For instance, removing stopwords like 'and', 'the', etc., is common."

# Run the cleaner and show the raw text followed by the cleaned text
cleaned_text = clean_text(sample_text)
print(sample_text, cleaned_text, sep="\n")

Here's an example: NLP involves various preprocessing techniques. For instance, removing stopwords like 'and', 'the', etc., is common.
heres example nlp involves various preprocessing techniques instance removing stopwords like etc common


## Tokenization

In [71]:
sample_text = "Tokenization is the process of splitting text into words or phrases. It's a crucial step in NLP."

# Print the sentence, then its tokens from each tokenizer in turn
# (casual_tokenize first, word_tokenize second) for side-by-side comparison
print(sample_text)
for tokenizer in (casual_tokenize, word_tokenize):
    print(tokenizer(sample_text))

Tokenization is the process of splitting text into words or phrases. It's a crucial step in NLP.
['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'words', 'or', 'phrases', '.', "It's", 'a', 'crucial', 'step', 'in', 'NLP', '.']
['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'text', 'into', 'words', 'or', 'phrases', '.', 'It', "'s", 'a', 'crucial', 'step', 'in', 'NLP', '.']


## Stopword removal

In [72]:
# Sentence used to demonstrate stop word filtering
text = "This is an example sentence demonstrating stop word filtration."

# English stop words shipped with NLTK, held in a set for membership tests
stop_words = set(stopwords.words('english'))

# Split the sentence into tokens
words = word_tokenize(text)

# Keep only the tokens whose lowercase form is not a stop word
filtered_text = [word for word in words if word.lower() not in stop_words]

# Original sentence, then the sentence rebuilt from the surviving tokens
print("Text: ")
for shown in (text, " ".join(filtered_text)):
    print("  ", shown)

# Token list before and after filtering
print("Words:")
for shown in (words, filtered_text):
    print("  ", shown)

Text: 
   This is an example sentence demonstrating stop word filtration.
   example sentence demonstrating stop word filtration .
Words:
   ['This', 'is', 'an', 'example', 'sentence', 'demonstrating', 'stop', 'word', 'filtration', '.']
   ['example', 'sentence', 'demonstrating', 'stop', 'word', 'filtration', '.']


## Stemming

In [73]:
# Porter stemmer instance used for every token
stemmer = PorterStemmer()

# Sentence to stem
text = "The boys are playing football and one boy is scoring goals."

# Split the sentence into tokens
words = word_tokenize(text)

# Stem every token
stemmed_words = list(map(stemmer.stem, words))

# Original sentence, then the sentence rebuilt from the stems
print("Text: ")
for shown in (text, " ".join(stemmed_words)):
    print("  ", shown)

# Token list before and after stemming
print("Words:")
for shown in (words, stemmed_words):
    print("  ", shown)

Text: 
   The boys are playing football and one boy is scoring goals.
   the boy are play footbal and one boy is score goal .
Words:
   ['The', 'boys', 'are', 'playing', 'football', 'and', 'one', 'boy', 'is', 'scoring', 'goals', '.']
   ['the', 'boy', 'are', 'play', 'footbal', 'and', 'one', 'boy', 'is', 'score', 'goal', '.']


## Lemmatization

In [74]:
# WordNet lemmatizer instance used for every token
lemmatizer = WordNetLemmatizer()

# Sentence to lemmatize
text = "The boys are playing football and one boy is scoring goals."

# Split the sentence into tokens
words = word_tokenize(text)

# Lemmatize every token. No part-of-speech is passed, so NLTK falls back to
# its default (noun); that is why verb forms such as 'playing' come back
# unchanged while plural nouns such as 'boys' are reduced.
lemmatized_words = list(map(lemmatizer.lemmatize, words))

# Original sentence, then the sentence rebuilt from the lemmas
print("Text: ")
for shown in (text, " ".join(lemmatized_words)):
    print("  ", shown)

# Token list before and after lemmatization
print("Words:")
for shown in (words, lemmatized_words):
    print("  ", shown)

Text: 
   The boys are playing football and one boy is scoring goals.
   The boy are playing football and one boy is scoring goal .
Words:
   ['The', 'boys', 'are', 'playing', 'football', 'and', 'one', 'boy', 'is', 'scoring', 'goals', '.']
   ['The', 'boy', 'are', 'playing', 'football', 'and', 'one', 'boy', 'is', 'scoring', 'goal', '.']


## Tagging

In [75]:
# Sentence to tag
text = "The quick brown fox jumps over the lazy dog"

# Split the sentence into tokens
words = word_tokenize(text)

# Attach a part-of-speech tag to every token
tagged_words = nltk.pos_tag(words)

# Show the plain tokens, then the (token, tag) pairs
for heading, shown in (("Words:", words), ("Tagged words:", tagged_words)):
    print(heading)
    print("  ", shown)

Words:
   ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Tagged words:
   [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


## N-grams

In [76]:
# Sentence to slice into n-grams
text = "Hello world, this is a test sentence for generating n-grams"

# Split the sentence into tokens
tokens = word_tokenize(text)


def generate_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` produced by NLTK's ``ngrams`` as a list."""
    return [gram for gram in ngrams(tokens, n)]


# Build bigrams (n=2), trigrams (n=3), and 4-grams
bigrams, trigrams, fourgrams = (generate_ngrams(tokens, size) for size in (2, 3, 4))

# Print each n-gram list under its own heading
for heading, grams in (("Bigrams:", bigrams), ("Trigrams:", trigrams), ("4-grams:", fourgrams)):
    print(heading)
    print(f"  {grams}")

Bigrams:
  [('Hello', 'world'), ('world', ','), (',', 'this'), ('this', 'is'), ('is', 'a'), ('a', 'test'), ('test', 'sentence'), ('sentence', 'for'), ('for', 'generating'), ('generating', 'n-grams')]
Trigrams:
  [('Hello', 'world', ','), ('world', ',', 'this'), (',', 'this', 'is'), ('this', 'is', 'a'), ('is', 'a', 'test'), ('a', 'test', 'sentence'), ('test', 'sentence', 'for'), ('sentence', 'for', 'generating'), ('for', 'generating', 'n-grams')]
4-grams:
  [('Hello', 'world', ',', 'this'), ('world', ',', 'this', 'is'), (',', 'this', 'is', 'a'), ('this', 'is', 'a', 'test'), ('is', 'a', 'test', 'sentence'), ('a', 'test', 'sentence', 'for'), ('test', 'sentence', 'for', 'generating'), ('sentence', 'for', 'generating', 'n-grams')]


## Frequency encoding

In [81]:

# Toy column holding three fruit categories
data = {'Category': ['Apple', 'Banana', 'Apple', 'Orange', 'Banana', 'Orange']}
df = pd.DataFrame(data)

# Count how many rows each category occupies, then attach that count to
# every row as a new column
category_counts = df['Category'].value_counts()
frequency_encoding = category_counts.to_dict()
df = df.assign(Category_Freq=df['Category'].map(frequency_encoding))

print(df)

  Category  Category_Freq
0    Apple              2
1   Banana              2
2    Apple              2
3   Orange              2
4   Banana              2
5   Orange              2


## One-hot encoding 

In [87]:
import pandas as pd

# Toy column holding three fruit categories
data = {'Category': ['Apple', 'Banana', 'Apple', 'Orange', 'Banana', 'Orange']}
df = pd.DataFrame(data)

# Expand the column into one indicator column per category
category_column = df['Category']
one_hot_encoded = category_column.pipe(pd.get_dummies, prefix='Category')

print(one_hot_encoded)


   Category_Apple  Category_Banana  Category_Orange
0            True            False            False
1           False             True            False
2            True            False            False
3           False            False             True
4           False             True            False
5           False            False             True


## TF-IDF
$$\text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t)$$

In [88]:
# Sample documents
documents = [
    "The sky is blue.",
    "The sun is bright.",
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun."
]

# Create a TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary from the documents and compute their TF-IDF weights
# (the result is a sparse matrix with one row per document)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vocabulary terms, in the same order as the matrix columns
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert to a dense ndarray and wrap it in a DataFrame for readability.
# toarray() is used rather than todense() because the latter returns a
# numpy.matrix, a type NumPy no longer recommends. pandas comes from the
# notebook's import cell, so no import is repeated here.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print(df)

       blue    bright       can        in        is       see   shining  \
0  0.659191  0.000000  0.000000  0.000000  0.420753  0.000000  0.000000   
1  0.000000  0.522109  0.000000  0.000000  0.522109  0.000000  0.000000   
2  0.000000  0.321846  0.000000  0.504235  0.321846  0.000000  0.000000   
3  0.000000  0.239102  0.374599  0.000000  0.000000  0.374599  0.374599   

        sky       sun       the        we  
0  0.519714  0.000000  0.343993  0.000000  
1  0.000000  0.522109  0.426858  0.000000  
2  0.397544  0.321846  0.526261  0.000000  
3  0.000000  0.478204  0.390963  0.374599  


## Continuous word representation

In [95]:
from gensim.models import Word2Vec, FastText

# Tiny corpus used to train the embedding
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "I love dogs and foxes",
    "The dog is in the garden",
]

# Lowercase each sentence, then split it into tokens
tokenized_sentences = [word_tokenize(lowered) for lowered in map(str.lower, sentences)]

# Train a Word2Vec model with 100-dimensional vectors; min_count=1 keeps
# words that occur only once in this small corpus
model = Word2Vec(
    tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Look up and show the learned vector for one word
vector = model.wv['fox']
print(vector)

[-0.00950012  0.00956222 -0.00777076 -0.00264551 -0.00490641 -0.0049667
 -0.00802359 -0.00778358 -0.00455321 -0.00127536 -0.00510299  0.00614054
 -0.00951662 -0.0053071   0.00943715  0.00699133  0.00767582  0.00423474
  0.00050709 -0.00598114  0.00601878  0.00263503  0.00769943  0.00639384
  0.00794257  0.00865741 -0.00989575 -0.0067557   0.00133757  0.0064403
  0.00737382  0.00551698  0.00766163 -0.00512557  0.00658441 -0.00410837
 -0.00905534  0.00914168  0.0013314  -0.00275968 -0.00247784 -0.00422048
  0.00481234  0.00440022 -0.00265336 -0.00734188 -0.00356585 -0.00033661
  0.00609589 -0.00283734 -0.00012089  0.00087973 -0.00709565  0.002065
 -0.00143242  0.00280215  0.00484222 -0.00135202 -0.00278014  0.00773865
  0.0050456   0.00671352  0.00451564  0.00866716  0.00747497 -0.00108189
  0.00874764  0.00460172  0.00544063 -0.00138608 -0.00204132 -0.00442435
 -0.0085152   0.00303773  0.00888319  0.00891974 -0.00194235  0.00608616
  0.00377972 -0.00429597  0.00204292 -0.00543789  0.008

## Word embeddings

### Word2Vec

In [96]:
# Tiny corpus used to train the embedding
sentences = [
    "I love machine learning",
    "I am exploring NLP",
    "I am a beginner in machine learning",
]

# Lowercase each sentence, then split it into tokens
tokenized_sentences = [word_tokenize(lowered) for lowered in map(str.lower, sentences)]

# Train Word2Vec; min_count=1 keeps words that occur only once
model = Word2Vec(tokenized_sentences, min_count=1)

# Show the learned vector for one word
print(model.wv['machine'])

[-8.2426779e-03  9.2993546e-03 -1.9766092e-04 -1.9672764e-03
  4.6036304e-03 -4.0953159e-03  2.7431143e-03  6.9399667e-03
  6.0654259e-03 -7.5107943e-03  9.3823504e-03  4.6718083e-03
  3.9661205e-03 -6.2435055e-03  8.4599797e-03 -2.1501649e-03
  8.8251876e-03 -5.3620026e-03 -8.1294188e-03  6.8245591e-03
  1.6711927e-03 -2.1985089e-03  9.5136007e-03  9.4938548e-03
 -9.7740470e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227872e-03  4.3050171e-04  6.7363144e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888723e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759365e-03 -9.4026709e-03  9.7643770e-03
  3.4297847e-03  5.1661171e-03  6.2823449e-03 -2.8042626e-03
  7.3227035e-03  2.8302716e-03  2.8710044e-03 -2.3803699e-03
 -3.1282497e-03 -2.3701417e-03  4.2764368e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481940e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843508e-03 -4.2906962e-03
  2.7831673e-04  4.9643586e-03  7.6983096e-03 -1.1442233e-03
  4.3234206e-03 -5.81437

### GloVe

In [101]:
def load_glove_model(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is read as a word followed by the components of its
    vector. Blank lines (for example a stray empty line at the end of the
    file) are skipped instead of raising ``IndexError``.

    Args:
        glove_file: Path to the UTF-8 encoded vectors file.

    Returns:
        A dict mapping each word to its embedding as a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    model = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Nothing on this line: no word, no vector.
                continue
            word, *values = split_line
            model[word] = [float(val) for val in values]
    return model

# Location of the pre-trained vectors (replace with your path to GloVe file)
GLOVE_PATH = 'data/glove/glove.6B.50d.txt'
glove_model = load_glove_model(GLOVE_PATH)

# Show the embedding stored for one word
print(glove_model['machine'])

[-0.34165, -0.81267, 1.4513, 0.05914, -0.080801, 0.39567, 0.10064, -0.5468, -0.18887, 0.11364, -0.040956, -0.5637, -0.32191, 0.15968, -0.59756, -0.14571, -0.77074, 1.2955, -0.72002, -0.90818, 0.76644, 0.05346, -0.0031632, -0.15341, 0.22065, -1.191, -1.0775, -0.29768, 1.327, -0.51359, 2.6229, -0.67411, -0.82558, 0.14283, -0.014214, 0.90775, 0.66828, 0.48431, 0.1543, 0.26044, 1.0191, 0.015872, -0.75325, 0.58992, 0.4546, -0.19678, 0.42138, -0.43168, 0.11985, 0.14094]


In [102]:
# Number of words for which an embedding was loaded
len(glove_model)

400000

### fastText

In [98]:
# Tiny corpus used to train the embedding
sentences = [
    "I love machine learning",
    "I am exploring NLP",
    "I am a beginner in machine learning",
]

# Lowercase each sentence, then split it into tokens
tokenized_sentences = [word_tokenize(lowered) for lowered in map(str.lower, sentences)]

# Train fastText; min_count=1 keeps words that occur only once
model = FastText(tokenized_sentences, min_count=1)

# Show the learned vector for one word
print(model.wv['machine'])

[-3.3493250e-04 -3.2697237e-05  1.5724794e-03 -4.3578114e-04
 -1.0922313e-03 -2.7957156e-03 -9.5741020e-04 -5.5972039e-04
 -1.2796433e-05 -9.3388435e-04  1.9502465e-03  6.0458242e-05
  2.4036055e-04  8.6894573e-04  8.0683391e-04 -5.5166340e-04
 -1.0096406e-03  7.2244304e-04  1.0901949e-03  1.2013994e-03
 -1.0446154e-03  4.9545517e-04 -3.3387766e-04  7.2457740e-04
 -1.6949677e-03  1.7823363e-04  2.5758869e-03 -8.3602348e-04
 -6.9080963e-04  1.6026372e-03  3.0975667e-04  4.6462650e-04
  4.1846634e-04  3.9506372e-04  8.9535628e-05  2.1438985e-03
  3.5811459e-05  1.1164135e-03 -4.3014187e-04  2.8502289e-04
  1.1015051e-03  4.1251557e-04  6.0182286e-04 -2.0248196e-03
 -9.8156133e-06 -1.2037809e-03 -8.1958412e-04  1.5580680e-04
 -1.5764225e-03 -7.0900656e-05  7.9386518e-04  2.8888283e-03
  1.0871483e-03 -3.7539122e-04 -2.4249775e-03 -1.1455865e-03
  5.2204775e-04 -4.4789197e-04 -3.2897203e-04 -3.6794288e-04
  6.4575358e-04 -1.9342760e-05 -1.9213838e-04 -4.3491388e-04
  5.4921664e-04  3.15767