In [1]:
import nltk

In [2]:
# Sample text: three short sentences used as input for the tokenizer.
sentences = (
    "The cat is in the box. "
    "The cat likes the box. "
    "The box is over the cat."
)

## Preprocessing

### Tokenizacja

In [3]:
# Download the 'punkt' package into the local NLTK data directory.
# Presumably required by the word tokenizer used in the tokenization step — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Tokenization: split the raw text into word and punctuation tokens.
from nltk.tokenize import word_tokenize

tokens = word_tokenize(sentences)
print(tokens)

['The', 'cat', 'is', 'in', 'the', 'box', '.', 'The', 'cat', 'likes', 'the', 'box', '.', 'The', 'box', 'is', 'over', 'the', 'cat', '.']


### Lowercase

In [6]:
# Normalize case so that e.g. 'The' and 'the' become the same token.
tokens = list(map(str.lower, tokens))
print(tokens)

['the', 'cat', 'is', 'in', 'the', 'box', '.', 'the', 'cat', 'likes', 'the', 'box', '.', 'the', 'box', 'is', 'over', 'the', 'cat', '.']


### Usunięcie tokenów, które nie są alfabetyczne

In [8]:
# Keep only purely alphabetic tokens; this drops punctuation such as '.'.
tokens = list(filter(str.isalpha, tokens))
print(tokens)

['the', 'cat', 'is', 'in', 'the', 'box', 'the', 'cat', 'likes', 'the', 'box', 'the', 'box', 'is', 'over', 'the', 'cat']


### Usunięcie stopwords

**Stopwords** - najczęściej występujące słowa w języku, które nie niosą ze sobą żadnej konkretnej treści.

In [9]:
# Download the 'stopwords' corpus into the local NLTK data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

# Load the English stopword list shipped with NLTK and display it.
stopwords_list = stopwords.words('english')
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Remove stopwords from the token list.
# Membership is tested against a set, so each lookup is O(1) on average
# instead of a linear scan of the stopword list for every token.
stopwords_set = set(stopwords_list)
tokens = [token for token in tokens if token not in stopwords_set]
print(tokens)

['cat', 'box', 'cat', 'likes', 'box', 'box', 'cat']


### Lematyzacja

In [12]:
# Download the 'wordnet' package into the local NLTK data directory.
# Presumably the data the WordNet lemmatizer relies on — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
from nltk.stem import WordNetLemmatizer

# Create the WordNet-based lemmatizer.
wordnet_lemmatizer = WordNetLemmatizer()

# Replace every token with its lemma (e.g. 'likes' -> 'like').
tokens = list(map(wordnet_lemmatizer.lemmatize, tokens))
print(tokens)

['cat', 'box', 'cat', 'like', 'box', 'box', 'cat']


## Model BoW

In [14]:
from collections import Counter

# Bag of words: tally how many times each token occurs.
c = Counter()
c.update(tokens)
print(c)

Counter({'cat': 3, 'box': 3, 'like': 1})


In [15]:
# The two most frequent tokens in the text
print(c.most_common(2))

[('cat', 3), ('box', 3)]


### Biblioteka Gensim

Biblioteka Gensim jest inną popularną biblioteką do przetwarzania języka naturalnego, która pozwala nam w prosty sposób budować korpusy i słowniki. Korpus to zbiór tekstów służących do wykonywania zadań przetwarzania języka naturalnego.

In [16]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.3.1-cp311-cp311-win_amd64.whl (23.9 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.1 smart-open-6.3.0



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
from nltk.tokenize import word_tokenize
from gensim.corpora.dictionary import Dictionary

# Our documents (a list of strings)
my_documents = [
    'Nearly all great ideas follow a similar creative process and this article explains how this process works. Understanding this is important because creative thinking is one of the most useful skills you can possess.',
    'Not doing something will always be faster than doing it. This statement reminds me of the old computer programming saying, Remember that there is no code faster than no code.',
    'He went on to become a trailblazer in the field of photography and held over 70 patents by the end of his career. His story of creativity and innovation, which I will share now, is a useful case study for understanding the 5 key steps of the creative process.',
    'He spent the rest of the decade experimenting with new photography techniques and learning about cameras, printers, and optics.',
]

# Preprocessing: lowercase and tokenize every document, then keep only purely
# alphabetic tokens (drops punctuation and numbers such as '70' and '5') that
# are not stopwords. The result is built in a single pass instead of
# overwriting list entries by index, so re-running the cell is side-effect free.
# A set makes each stopword lookup O(1) on average instead of a list scan.
stopwords_set = set(stopwords_list)
tokenized_docs = [
    [
        token
        for token in word_tokenize(article.lower())
        if token.isalpha() and token not in stopwords_set
    ]
    for article in my_documents
]

print(tokenized_docs)

[['nearly', 'great', 'ideas', 'follow', 'similar', 'creative', 'process', 'article', 'explains', 'process', 'works', 'understanding', 'important', 'creative', 'thinking', 'one', 'useful', 'skills', 'possess'], ['something', 'always', 'faster', 'statement', 'reminds', 'old', 'computer', 'programming', 'saying', 'remember', 'code', 'faster', 'code'], ['went', 'become', 'trailblazer', 'field', 'photography', 'held', 'patents', 'end', 'career', 'story', 'creativity', 'innovation', 'share', 'useful', 'case', 'study', 'understanding', 'key', 'steps', 'creative', 'process'], ['spent', 'rest', 'decade', 'experimenting', 'new', 'photography', 'techniques', 'learning', 'cameras', 'printers', 'optics']]


In [34]:
# NOTE(review): prints the same stopword list that was already displayed when
# it was loaded; looks like a leftover debugging cell — confirm it can be removed.
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Z tak przygotowanego zestawu tokenów możemy stworzyć słownik.

In [37]:
# Dictionary: a mapping between each unique token and an integer id
dictionary = Dictionary(tokenized_docs)
print(dictionary.token2id)

{'article': 0, 'creative': 1, 'explains': 2, 'follow': 3, 'great': 4, 'ideas': 5, 'important': 6, 'nearly': 7, 'one': 8, 'possess': 9, 'process': 10, 'similar': 11, 'skills': 12, 'thinking': 13, 'understanding': 14, 'useful': 15, 'works': 16, 'always': 17, 'code': 18, 'computer': 19, 'faster': 20, 'old': 21, 'programming': 22, 'remember': 23, 'reminds': 24, 'saying': 25, 'something': 26, 'statement': 27, 'become': 28, 'career': 29, 'case': 30, 'creativity': 31, 'end': 32, 'field': 33, 'held': 34, 'innovation': 35, 'key': 36, 'patents': 37, 'photography': 38, 'share': 39, 'steps': 40, 'story': 41, 'study': 42, 'trailblazer': 43, 'went': 44, 'cameras': 45, 'decade': 46, 'experimenting': 47, 'learning': 48, 'new': 49, 'optics': 50, 'printers': 51, 'rest': 52, 'spent': 53, 'techniques': 54}


Ze słownika możemy otrzymać specjalny korpus.

In [38]:
# Convert every tokenized document into its bag-of-words form:
# a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, tokenized_docs))
print(corpus)

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(17, 1), (18, 2), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)], [(1, 1), (10, 1), (14, 1), (15, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1)], [(38, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)]]


Powyższy korpus jest czymś trochę innym niż to, co przeważnie mamy na myśli, mówiąc „korpus języka”, czyli zestaw dokumentów. Gensim używa prostego modelu BoW, za pomocą którego przekształca każdy dokument w BoW, używając id tokenów i częstości występowania tokenu w dokumencie. Za pomocą Gensim w kilku linijkach możemy otrzymać nowy korpus i BoW.

I ten korpus można łatwo zapisywać, aktualizować i ponownie wykorzystywać dzięki narzędziom biblioteki Gensim.

## Model TF-IDF

In [39]:
from gensim.models.tfidfmodel import TfidfModel

# Build a TF-IDF model from the bag-of-words corpus.
# NOTE: despite its name, `tfidf_corpus` holds the TfidfModel instance,
# not an already transformed corpus.
tfidf_corpus = TfidfModel(corpus)

In [40]:
# Raw bag-of-words (token_id, count) pairs of the first document.
print(corpus[0])

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)]


In [41]:
# Apply the TF-IDF model to the first document: yields (token_id, weight) pairs.
print(tfidf_corpus[corpus[0]])

[(0, 0.254000254000381), (1, 0.254000254000381), (2, 0.254000254000381), (3, 0.254000254000381), (4, 0.254000254000381), (5, 0.254000254000381), (6, 0.254000254000381), (7, 0.254000254000381), (8, 0.254000254000381), (9, 0.254000254000381), (10, 0.254000254000381), (11, 0.254000254000381), (12, 0.254000254000381), (13, 0.254000254000381), (14, 0.1270001270001905), (15, 0.1270001270001905), (16, 0.254000254000381)]


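Mimo że token o id 15 występuje w dokumencie 1 raz, to wartość przyporządkowana mu w TF-IDF jest znacznie niższa niż dla innych tokenów występujących jeden raz, ponieważ pojawia się on również w innym dokumencie korpusu (ma więc niższe IDF).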

In [42]:
# Look up the token that corresponds to id 15.
dictionary[15]

'useful'