# Removing HTML Tags

In [9]:
# Sample review snippets wrapped in assorted HTML tags (<p>, <b>, <div>, <br>,
# <span>, <i>, <a>), used as input for the tag-stripping demo.
raw_data = """
<p>This movie was <b>amazing</b>! I loved the plot and the visuals.</p>
<div>Worst movie ever. <br>Don't waste your time.</div>
<span style='color:red'>Absolutely loved</span> the characters!
It was <i>okay</i>, not great, not terrible.
Check out the trailer <a href='https://example.com'>here</a>.
"""

In [10]:
import re
# Compiled once instead of on every call. ``[^>]*`` (unlike ``.*?``) also
# matches newlines, so a tag whose attributes wrap onto several lines is
# still removed as a whole.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def striphtml(data):
    """Return ``data`` with every ``<...>`` tag removed.

    Args:
        data: Text that may contain HTML/XML-style tags.

    Returns:
        A copy of ``data`` in which each span running from ``<`` to the next
        ``>`` has been deleted. Text between tags, including whitespace and
        newlines, is left untouched.

    Note:
        This is a plain regex substitution: character entities such as
        ``&amp;`` are not decoded, and a literal ``<`` followed later by a
        ``>`` in ordinary text is treated as a tag.
    """
    return _HTML_TAG_RE.sub('', data)

In [11]:
# Strip the tags from the sample text; only the markup is removed, so the
# newlines separating the original lines remain in the result.
striphtml(raw_data)

"\nThis movie was amazing! I loved the plot and the visuals.\nWorst movie ever. Don't waste your time.\nAbsolutely loved the characters!\nIt was okay, not great, not terrible.\nCheck out the trailer here.\n"

# Unicode Normalization -> exposing emojis and other non-ASCII characters as UTF-8 bytes

In [12]:
# Sample mixing several kinds of non-ASCII text: accented letters, a combining
# acute accent (U+0301), a full-width space, a curly apostrophe, styled
# mathematical letters, and emojis (one with a skin-tone modifier).
# NOTE(review): "Café\u0301" is an already-accented é followed by a combining
# accent, so it is not equivalent to "Café"; "Cafe\u0301" was probably
# intended — confirm before relying on the "is the same as" claim.
raw_unicode = "Café\u0301 is the same as Café. I love it! 　Here’s some weird space. 𝓣𝓮𝔁𝔱 😍 👍🏼"

In [13]:
# Show the UTF-8 byte representation: every non-ASCII character appears as a
# multi-byte \x.. escape sequence. This only encodes the string; it does not
# normalize it or remove any characters.
raw_unicode.encode('utf-8')

b'Caf\xc3\xa9\xcc\x81 is the same as Caf\xc3\xa9. I love it! \xe3\x80\x80Here\xe2\x80\x99s some weird space. \xf0\x9d\x93\xa3\xf0\x9d\x93\xae\xf0\x9d\x94\x81\xf0\x9d\x94\xb1 \xf0\x9f\x98\x8d \xf0\x9f\x91\x8d\xf0\x9f\x8f\xbc'

# Spell Check

In [14]:
# Deliberately misspelled review text used as input for the spelling
# correction demo.
spell_data = """
This movi was absolutly amazngg! I cudn't belive how beautifl the scenes were.
The dirrector did a greta job, but the actresess were a bit unconvincing.
Overal, I'd recomend it for a casual watch, but not if you're luking for somthing serius.
"""

In [20]:
from textblob import TextBlob

# Wrap the text in a TextBlob and display its spell-corrected version.
# correct() yields a new TextBlob; `textblb` itself is not reassigned here.
# The corrections are not always right: in the recorded output "Overal"
# became "Several" and "unconvincing" became "convincing", so review the
# result before using it downstream.
textblb = TextBlob(spell_data)
textblb.correct()


TextBlob("
His move was absolutely amazing! I can't believe how beautiful the scenes were.
The director did a great job, but the actresses were a bit convincing.
Several, I'd recommend it for a casual watch, but not if you're liking for something serious.
")

# Tokenization

In [21]:
# Short multi-sentence paragraph (ending in both '.' and '?') used as input
# for the sentence and word tokenization demos.
dummy_text = """
Artificial intelligence is transforming the world in remarkable ways. From self-driving cars to real-time language translation, AI technologies are becoming a part of everyday life. 
However, with great power comes great responsibility — ethical concerns, data privacy, and bias in algorithms are all challenges that must be addressed. 
Still, the potential benefits are enormous. Will machines ever fully understand human emotions? Only time will tell.
"""


In [23]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [28]:
# Split the paragraph into a list of sentences.
sent = sent_tokenize(dummy_text)
sent  # split at sentence-ending punctuation ('.' and '?'); the first item keeps the leading '\n'

['\nArtificial intelligence is transforming the world in remarkable ways.',
 'From self-driving cars to real-time language translation, AI technologies are becoming a part of everyday life.',
 'However, with great power comes great responsibility — ethical concerns, data privacy, and bias in algorithms are all challenges that must be addressed.',
 'Still, the potential benefits are enormous.',
 'Will machines ever fully understand human emotions?',
 'Only time will tell.']

In [29]:
# Tokenize each sentence separately and print one token list per sentence.
# Punctuation marks come out as their own tokens, while hyphenated words such
# as "self-driving" stay together.
for sen in sent:
    print(word_tokenize(sen))

['Artificial', 'intelligence', 'is', 'transforming', 'the', 'world', 'in', 'remarkable', 'ways', '.']
['From', 'self-driving', 'cars', 'to', 'real-time', 'language', 'translation', ',', 'AI', 'technologies', 'are', 'becoming', 'a', 'part', 'of', 'everyday', 'life', '.']
['However', ',', 'with', 'great', 'power', 'comes', 'great', 'responsibility', '—', 'ethical', 'concerns', ',', 'data', 'privacy', ',', 'and', 'bias', 'in', 'algorithms', 'are', 'all', 'challenges', 'that', 'must', 'be', 'addressed', '.']
['Still', ',', 'the', 'potential', 'benefits', 'are', 'enormous', '.']
['Will', 'machines', 'ever', 'fully', 'understand', 'human', 'emotions', '?']
['Only', 'time', 'will', 'tell', '.']
