In [169]:
"""Simple processing textual data with TextBlob"""

In [None]:
from textblob import TextBlob
from textblob import Sentence, Word
import nltk

In [149]:
text = 'Today is a beautiful day. Tomorrow looks like bad weather.'

In [150]:
blob = TextBlob(text)

In [151]:
blob

TextBlob("Today is a beautiful day. Tomorrow looks like bad weather.")

In [152]:
blob.sentences

[Sentence("Today is a beautiful day."),
 Sentence("Tomorrow looks like bad weather.")]

In [153]:
blob.words

WordList(['Today', 'is', 'a', 'beautiful', 'day', 'Tomorrow', 'looks', 'like', 'bad', 'weather'])

In [154]:
blob.tags

[('Today', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('beautiful', 'JJ'),
 ('day', 'NN'),
 ('Tomorrow', 'NNP'),
 ('looks', 'VBZ'),
 ('like', 'IN'),
 ('bad', 'JJ'),
 ('weather', 'NN')]

In [155]:
TextBlob('My dog is cute').tags

[('My', 'PRP$'), ('dog', 'NN'), ('is', 'VBZ'), ('cute', 'JJ')]

In [156]:
blob.noun_phrases

WordList(['beautiful day', 'tomorrow', 'bad weather'])

In [157]:
blob.sentiment

Sentiment(polarity=0.07500000000000007, subjectivity=0.8333333333333333)

In [158]:
%precision 3

'%.3f'

In [159]:
blob.sentiment.polarity

0.075

In [160]:
blob.sentiment.subjectivity

0.833

In [161]:
# Print each sentence of `blob` followed by that sentence's own sentiment
# (rather than the sentiment of the whole text) in square brackets.
for sentence in blob.sentences:
    print('{} [{}]'.format(sentence, sentence.sentiment))

Today is a beautiful day. [Sentiment(polarity=0.85, subjectivity=1.0)]
Tomorrow looks like bad weather. [Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)]


In [162]:
from textblob.sentiments import NaiveBayesAnalyzer

In [163]:
blob2 = TextBlob(text, analyzer=NaiveBayesAnalyzer())

In [164]:
blob2

TextBlob("Today is a beautiful day. Tomorrow looks like bad weather.")

In [165]:
blob2.sentiment

Sentiment(classification='neg', p_pos=0.47662917962091056, p_neg=0.5233708203790892)

In [166]:
# Same per-sentence report as for `blob`, but on `blob2`, which was built
# with NaiveBayesAnalyzer, so each sentiment is whatever that analyzer returns.
for sentence in blob2.sentences:
    print(sentence, f'[{sentence.sentiment}]')

Today is a beautiful day. [Sentiment(classification='pos', p_pos=0.8117563121751951, p_neg=0.18824368782480477)]
Tomorrow looks like bad weather. [Sentiment(classification='neg', p_pos=0.174363226578349, p_neg=0.8256367734216521)]


In [173]:
index = Word('index')

In [174]:
index.pluralize()

'indices'

In [175]:
cacti = Word('cacti')

In [176]:
cacti.singularize()

'cactus'

In [177]:
animals = TextBlob('dog cat fish bird').words

In [178]:
animals.pluralize()

WordList(['dogs', 'cats', 'fish', 'birds'])

In [179]:
Word('children').singularize()

'child'

In [180]:
Word('focus').pluralize()

'foci'

In [181]:
word = Word('theyr')


In [182]:
%precision 2

'%.2f'

In [183]:
word.spellcheck()

[('they', 0.57), ('their', 0.43)]

In [184]:
word.correct()

'they'

In [185]:
sentence = TextBlob('Ths sentense has missplled wrds.')

In [186]:
sentence.correct()

TextBlob("The sentence has misspelled words.")

In [187]:
Sentence('I canot beleive I misspeled thees werds').correct()

Sentence("I cannot believe I misspelled these words")

In [188]:
word = Word('varieties')

In [189]:
word.stem()

'varieti'

In [191]:
word.lemmatize()

'variety'

In [192]:
word2 = Word('strawberries')

In [193]:
word2.stem()

'strawberri'

In [194]:
word2.lemmatize()

'strawberry'

In [196]:
from pathlib import Path

In [197]:
# Load the play's text and wrap it in a TextBlob for the word-frequency
# queries that follow.
# NOTE: this rebinds `blob2`, which earlier held the NaiveBayesAnalyzer blob.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding (assumes the file is UTF-8 -- confirm).
blob2 = TextBlob(Path('Input/Romeo_Juliett.txt').read_text(encoding='utf-8'))

In [198]:
blob2.word_counts['juliet']

190

In [199]:
blob2.word_counts['romeo']

315

In [200]:
blob2.word_counts['thou']

278

In [201]:
blob2.words.count('joy')

14

In [202]:
blob2.noun_phrases.count('lady capulet')

46

In [203]:
blob2.word_counts['joy']

14

In [204]:
blob2.word_counts['a']

483

In [205]:
happy = Word('happy')

In [209]:
happy.definitions

['enjoying or showing or marked by joy or pleasure',
 'marked by good fortune',
 'eagerly disposed to act or to be of service',
 'well expressed and to the point']

In [210]:
happy.synsets

[Synset('happy.a.01'),
 Synset('felicitous.s.02'),
 Synset('glad.s.02'),
 Synset('happy.s.04')]

In [211]:
blob.sentiment

Sentiment(polarity=0.07500000000000007, subjectivity=0.8333333333333333)

In [212]:
# Gather the distinct lemma names found across every synset of `happy`;
# a set is used so names shared by several synsets appear only once.
synonyms = {
    lemma.name()
    for synset in happy.synsets
    for lemma in synset.lemmas()
}

In [213]:
synonyms

{'felicitous', 'glad', 'happy', 'well-chosen'}

In [214]:
lemmas = happy.synsets[0].lemmas()

In [215]:
lemmas[0].antonyms()

[Lemma('unhappy.a.01.unhappy')]

In [216]:
word3 = Word('boat')

In [217]:
word3.synsets

[Synset('boat.n.01'), Synset('gravy_boat.n.01'), Synset('boat.v.01')]

In [218]:
word3.definitions

['a small vessel for travel on water',
 'a dish (often boat-shaped) for serving gravy or sauce',
 'ride in a boat on water']

In [219]:
import nltk

In [220]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/drphyl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [221]:
from nltk.corpus import stopwords

In [222]:
stops = stopwords.words('english')

In [223]:
# Keep only the words of `blob` that are not stop words.
# The NLTK English stop-word list is lowercase, so the comparison is done on
# the lowercased word; otherwise capitalized stop words (e.g. a sentence-
# initial "The") would slip through. The original casing is kept in the result.
[word for word in blob.words if word.lower() not in stops]

['Today', 'beautiful', 'day', 'Tomorrow', 'looks', 'like', 'bad', 'weather']

In [225]:
blob.ngrams()

[WordList(['Today', 'is', 'a']),
 WordList(['is', 'a', 'beautiful']),
 WordList(['a', 'beautiful', 'day']),
 WordList(['beautiful', 'day', 'Tomorrow']),
 WordList(['day', 'Tomorrow', 'looks']),
 WordList(['Tomorrow', 'looks', 'like']),
 WordList(['looks', 'like', 'bad']),
 WordList(['like', 'bad', 'weather'])]

In [226]:
blob.ngrams(n=5)

[WordList(['Today', 'is', 'a', 'beautiful', 'day']),
 WordList(['is', 'a', 'beautiful', 'day', 'Tomorrow']),
 WordList(['a', 'beautiful', 'day', 'Tomorrow', 'looks']),
 WordList(['beautiful', 'day', 'Tomorrow', 'looks', 'like']),
 WordList(['day', 'Tomorrow', 'looks', 'like', 'bad']),
 WordList(['Tomorrow', 'looks', 'like', 'bad', 'weather'])]

In [227]:
blob.ngrams(n=8)

[WordList(['Today', 'is', 'a', 'beautiful', 'day', 'Tomorrow', 'looks', 'like']),
 WordList(['is', 'a', 'beautiful', 'day', 'Tomorrow', 'looks', 'like', 'bad']),
 WordList(['a', 'beautiful', 'day', 'Tomorrow', 'looks', 'like', 'bad', 'weather'])]

In [228]:
items = blob2.word_counts.items()

In [229]:
# Drop stop words from the (word, count) pairs.
# A set is built once so each membership test is a hash lookup instead of a
# scan of the whole `stops` list for every distinct word in the text.
stop_set = set(stops)
items = [(word, count) for word, count in items if word not in stop_set]

In [230]:
from operator import itemgetter

In [231]:
# Order the (word, count) pairs by count, highest first. Work on a copy so
# `items` itself is left untouched.
sorted_items = list(items)
sorted_items.sort(key=itemgetter(1), reverse=True)

In [232]:
# Take 20 entries starting at index 1, i.e. the highest-count entry
# sorted_items[0] is skipped.
# NOTE(review): presumably element 0 is a non-word token produced by
# tokenization (e.g. a stray apostrophe) -- confirm by inspecting
# sorted_items[0] before relying on this slice.
top20 = sorted_items[1:21]

In [233]:
top20

[('romeo', 315),
 ('thou', 278),
 ('juliet', 190),
 ('thy', 170),
 ('capulet', 163),
 ('nurse', 149),
 ('love', 148),
 ('thee', 138),
 ('lady', 117),
 ('shall', 110),
 ('friar', 105),
 ('come', 94),
 ('mercutio', 88),
 ('lawrence', 82),
 ('good', 80),
 ('benvolio', 79),
 ('tybalt', 79),
 ('enter', 75),
 ('go', 75),
 ('night', 73)]

In [234]:
import pandas as pd

In [235]:
df = pd.DataFrame(top20, columns=['word', 'count'])

In [236]:
df

Unnamed: 0,word,count
0,romeo,315
1,thou,278
2,juliet,190
3,thy,170
4,capulet,163
5,nurse,149
6,love,148
7,thee,138
8,lady,117
9,shall,110


In [237]:
from textatistic import Textatistic

In [252]:
text

'Today is a beautiful day. Tomorrow looks like bad weather.'

In [253]:
readability = Textatistic(text)

In [254]:
%precision 3

'%.3f'

In [255]:
readability.dict()

{'char_count': 49,
 'word_count': 10,
 'sent_count': 2,
 'sybl_count': 15,
 'notdalechall_count': 0,
 'polysyblword_count': 2,
 'flesch_score': 74.860,
 'fleschkincaid_score': 4.060,
 'gunningfog_score': 10.000,
 'smog_score': 8.842,
 'dalechall_score': 0.248}

In [256]:
readability.word_count / readability.sent_count

5.000

In [257]:
readability.char_count / readability.word_count

4.900