In [2]:
# tokenize input
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

example_text = "Hello there, how are you doing today? The weather is great today. The sky is blue. python is awesome"
# Apply the sentence-level tokenizer first, then the word-level one,
# printing each result on its own line.
for tokenize in (sent_tokenize, word_tokenize):
    print(tokenize(example_text))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kintal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['Hello there, how are you doing today?', 'The weather is great today.', 'The sky is blue.', 'python is awesome']
['Hello', 'there', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'today', '.', 'The', 'sky', 'is', 'blue', '.', 'python', 'is', 'awesome']


In [4]:
# show stop words
from nltk.corpus import stopwords
nltk.download('stopwords')

# Fetch the English stop-word list once, then display it.
stop_word_list = stopwords.words('english')
print(stop_word_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kintal\AppData\Roaming\nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# delete stop words from input
sentence = 'he is a good boy. he is very good in coding'
# Keep the raw string and its tokens under separate names so `text`
# is only ever a list of tokens.
text = word_tokenize(sentence)
# Build the stop-word set once: calling stopwords.words('english') inside
# the comprehension produced a new list for every token and searched it
# linearly.
english_stopwords = set(stopwords.words('english'))
# The stop-word list is lowercase, so compare on the lowercased token;
# otherwise capitalised stop words (e.g. at the start of a sentence) slip through.
text_with_no_stopwords = [word for word in text if word.lower() not in english_stopwords]
text_with_no_stopwords

['good', 'boy', '.', 'good', 'coding']

In [6]:
# find word stems
from nltk.stem import PorterStemmer

ps = PorterStemmer()    # create the PorterStemmer object
example_words = ['earn', 'earning', 'earned', 'earns']  # sample words
for word in example_words:
    stem = ps.stem(word)    # reduce the word to its stem using ps
    print(stem)

earn
earn
earn
earn


In [8]:
# lemmatization
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()  # create the WordNetLemmatizer object
example_words = ['history', 'formality', 'changes']
# Lemmatize every sample word and print one result per line.
print(*(lemmatizer.lemmatize(word) for word in example_words), sep='\n')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kintal\AppData\Roaming\nltk_data...


history
formality
change


In [11]:
# find synonyms and antonyms using wordnet
from nltk.corpus import wordnet

# Flatten every lemma of every synset found for the word, in synset order.
happy_lemmas = [lemma for synset in wordnet.synsets('happy') for lemma in synset.lemmas()]

# Every lemma name counts as a synonym.
synonyms = [lemma.name() for lemma in happy_lemmas]
# For lemmas that have antonyms, keep only the first one.
antonyms = [lemma.antonyms()[0].name() for lemma in happy_lemmas if lemma.antonyms()]

# Convert to sets so each value is shown only once.
print(set(synonyms))
print(set(antonyms))

{'felicitous', 'happy', 'well-chosen', 'glad'}
{'unhappy'}


In [12]:
# mark parts of speech
import nltk
from nltk.tokenize import word_tokenize

nltk.download('averaged_perceptron_tagger')
sample_text = '''
An sincerity so extremity he additions. Her yet there truth merit. Mrs all projecting favourable now unpleasing. Son law garden chatty temper. Oh children provided to mr elegance marriage strongly. Off can admiration prosperous now devonshire diminution law.
'''
# Tokenize first: pos_tag is given the list of tokens, not the raw string.
words = word_tokenize(sample_text)
tagged_words = nltk.pos_tag(words)
print(tagged_words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kintal\AppData\Roaming\nltk_data...


[('An', 'DT'), ('sincerity', 'NN'), ('so', 'RB'), ('extremity', 'NN'), ('he', 'PRP'), ('additions', 'VBZ'), ('.', '.'), ('Her', 'PRP$'), ('yet', 'RB'), ('there', 'EX'), ('truth', 'NN'), ('merit', 'NN'), ('.', '.'), ('Mrs', 'NNP'), ('all', 'DT'), ('projecting', 'VBG'), ('favourable', 'JJ'), ('now', 'RB'), ('unpleasing', 'VBG'), ('.', '.'), ('Son', 'NNP'), ('law', 'NN'), ('garden', 'NN'), ('chatty', 'JJ'), ('temper', 'NN'), ('.', '.'), ('Oh', 'UH'), ('children', 'NNS'), ('provided', 'VBD'), ('to', 'TO'), ('mr', 'VB'), ('elegance', 'NN'), ('marriage', 'NN'), ('strongly', 'RB'), ('.', '.'), ('Off', 'CC'), ('can', 'MD'), ('admiration', 'VB'), ('prosperous', 'JJ'), ('now', 'RB'), ('devonshire', 'VBP'), ('diminution', 'NN'), ('law', 'NN'), ('.', '.')]


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [15]:
# Converting text to vector for ml - example
#
# sent1 = he is a good boy
# sent2 = she is a good girl
# sent3 = boy and girl are good 
#         |
#         |
#   After removal of stopwords, lemmatization or stemming
# sent1 = good boy
# sent2 = good girl
# sent3 = boy girl good  
#         | ### Now we will calculate the frequency for each word by
#         |     calculating the occurrence of each word
# word  frequency
# good     3
# boy      2
# girl     2
#          | ## Then, for each sentence, we assign 0 or 1 to every word
#          |    according to whether it occurs in that sentence
#          | ## 1 for present and 0 for not present
#          f1  f2   f3
#         girl good boy   
# sent1    0    1    1     
# sent2    1    1    0
# sent3    1    1    1
# ### After this we pass the vector form to machine learning model

In [None]:
# Converting text to vector for ml - code
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

sent = pd.DataFrame(['he is a good boy', 'she is a good girl', 'boy and girl are good'], columns=['text'])

# Build the stop-word set once; the original rebuilt
# set(stopwords.words('english')) for every single token.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column itself instead of a hard-coded range(0, 3),
# so the loop keeps working when sentences are added or removed.
for sentence in sent['text']:
    words = word_tokenize(sentence)
    # The stop-word list is lowercase, so match on the lowercased token;
    # the surviving tokens are lemmatized in their original form.
    texts = [lemmatizer.lemmatize(word) for word in words if word.lower() not in english_stopwords]
    text = ' '.join(texts)
    corpus.append(text)
print(corpus)   # cleaned data

cv = CountVectorizer()  # create the CountVectorizer object
X = cv.fit_transform(corpus).toarray()
X  # vector form