<a href="https://colab.research.google.com/github/krishna110597/Sentiment-Analysis/blob/main/Tokenizing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import html.entities as htmlentitydefs

In [None]:
# Pattern table for the tokenizer.
#
# Every string below is meant to be compiled with re.VERBOSE, so whitespace
# and "#" comments *inside* the patterns are ignored by the regex engine
# (which is why a literal "#" is written as "\#").
# The entries of `regex_strings` are later joined with "|", so ORDER MATTERS:
# at each position in the text the first alternative that matches wins.

# Western-style emoticons, written in either direction (":-)" or "(-:").
# Kept as a separate name so that it can also be compiled on its own.
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?            
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?    
      \d{3}          # exchange
      [\-\s.]*   
      \d{4}          # base
    )"""
    ,
    # Emoticons (index 1 of this tuple; other code refers to it by position):
    emoticon_string
    ,
    # URLs, with or without a scheme.
    # The negative lookahead (?!(?:https?|ftp):\/\/) stops a candidate from
    # absorbing the beginning of a scheme. Without it, text such as
    # "patients.https://t.co/x" (no space after the full stop) was split into
    # "patients.https", ":/" and "/t.co/x" instead of keeping the URL whole.
    r"""(?:(?:https?|ftp):\/\/)?"""
    r"""(?:(?!(?:https?|ftp):\/\/)[\w/\-?=%.])+"""
    r"""\."""
    r"""(?:(?!(?:https?|ftp):\/\/)[\w/\-&?=%.])+"""
    ,
    # HTML tags:
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots. 
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
    )

In [None]:
# Master tokenizing pattern: all entries of regex_strings joined as
# alternatives inside a single capturing group, so findall() yields one
# string per token.
word_re = re.compile(
    r"""(%s)""" % "|".join(regex_strings),
    flags=re.X | re.IGNORECASE | re.U,
)

# Stand-alone emoticon pattern (entry 1 of regex_strings). It is used to
# detect tokens whose case must be left untouched, e.g. ":D" vs ":d".
emoticon_re = re.compile(
    regex_strings[1],
    flags=re.X | re.IGNORECASE | re.U,
)

# Helpers for turning HTML entities into Unicode characters:
#   numeric entities such as "&#38;"
html_entity_digit_re = re.compile(r"&#\d+;")
#   named entities such as "&lt;"
html_entity_alpha_re = re.compile(r"&\w+;")
#   the ampersand entity, which is handled separately from the other names
amp = "&amp;"

In [None]:

class Tokenizer:
    """Regex-based tokenizer for tweets and similar informal text.

    The class relies on names defined at module level: ``word_re`` (the
    tokenizing regex), ``emoticon_re`` (emoticon detector),
    ``html_entity_digit_re`` / ``html_entity_alpha_re`` (HTML entity
    finders), ``amp`` (the ``&amp;`` literal) and ``htmlentitydefs``
    (``html.entities``).
    """

    def __init__(self, preserve_case=False):
        """
        Argument: preserve_case -- if False (default), tokens are lower-cased,
        except tokens in which an emoticon is found.
        """
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """
        Argument: s -- a string; bytes are decoded as UTF-8, any other object
                  is converted with str()
        Value: a list of token strings, in order of appearance. Whitespace
               between tokens is discarded. Unless preserve_case is True,
               every token is lower-cased except those in which emoticon_re
               finds a match.
        """
        # Ensure we are working on text. str(b"...") would produce the
        # "b'...'" repr, so bytes have to be decoded explicitly.
        if isinstance(s, (bytes, bytearray)):
            s = bytes(s).decode("utf-8", errors="replace")
        else:
            s = str(s)
        # Fix HTML character entities:
        s = self.__html2unicode(s)
        # Tokenize:
        words = word_re.findall(s)
        # Possibly alter the case, but avoid changing emoticons like :D into :d.
        # A list (not a lazy map object) is built so that both branches return
        # the same type and the result can be indexed or iterated repeatedly.
        if not self.preserve_case:
            words = [w if emoticon_re.search(w) else w.lower() for w in words]
        return words

    def tokenize_random_tweet(self):
        """
        If the twitter library is installed and a twitter connection
        can be established, then tokenize a random tweet.

        Value: the token list of the first tweet whose user language is 'en'.
        Raises: ImportError if the twitter library is not installed;
                Exception if no English-language tweet was obtained.
        """
        try:
            import twitter
        except ImportError as err:
            # Fail with the explanatory message instead of printing it and
            # then crashing on the unbound name `twitter` a few lines later.
            raise ImportError("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/") from err
        # NOTE(review): Api() is created without credentials and
        # GetPublicTimeline() is assumed to exist in the installed
        # python-twitter version -- confirm before relying on this method.
        api = twitter.Api()
        tweets = api.GetPublicTimeline()
        for tweet in tweets or ():
            if tweet.user.lang == 'en':
                return self.tokenize(tweet.text)
        # Reached both when no tweets came back and when none was English.
        raise Exception("Apologies. I couldn't get Twitter to give me a public English-language tweet. Perhaps try again")

    def __html2unicode(self, s):
        """
        Internal method that seeks to replace all the HTML entities in
        s with their corresponding unicode characters. Entities that
        cannot be resolved are left unchanged; "&amp;" becomes " and ".
        """
        # First the digits:
        for ent in set(html_entity_digit_re.findall(s)):
            try:
                s = s.replace(ent, chr(int(ent[2:-1])))
            except (ValueError, OverflowError):
                # Number is not a valid code point: keep the entity text.
                pass
        s = s.replace(amp, " and ")
        # Now the alpha versions:
        for ent in set(html_entity_alpha_re.findall(s)):
            try:
                s = s.replace(ent, chr(htmlentitydefs.name2codepoint[ent[1:-1]]))
            except KeyError:
                # Unknown entity name: keep the entity text.
                pass
        return s


In [None]:

if __name__ == '__main__':
    # Demo: run the tokenizer (lower-casing enabled) over two sample tweets
    # and print each tweet followed by its tokens, one token per line.
    tok = Tokenizer(preserve_case=False)
    samples = (
        u"Extremely disturbing reports from UttarPradesh. The spread in rural areas is much higher in #COVIDSecondWave. With #PanchayatElections starting from tomorrow - people are afraid of rapid spread due to political activity. They should consider postponing the polls. #COVID19",
        u"As Delhi reported the highest single-day spike with 17,282 new #COVID19 cases and 104 deaths, amidst a shortage of beds, hotels and banquets set aside beds for Covid patients.https://t.co/1jZQAasQ0v")

    for sample in samples:
        # Separator line, then the raw tweet text.
        print("======================================================================", sample, sep="\n")
        tokenized = tok.tokenize(sample)
        # One token per line.
        print(*tokenized, sep="\n")


Extremely disturbing reports from UttarPradesh. The spread in rural areas is much higher in #COVIDSecondWave. With #PanchayatElections starting from tomorrow - people are afraid of rapid spread due to political activity. They should consider postponing the polls. #COVID19
extremely
disturbing
reports
from
uttarpradesh
.
the
spread
in
rural
areas
is
much
higher
in
#covidsecondwave
.
with
#panchayatelections
starting
from
tomorrow
-
people
are
afraid
of
rapid
spread
due
to
political
activity
.
they
should
consider
postponing
the
polls
.
#covid19
As Delhi reported the highest single-day spike with 17,282 new #COVID19 cases and 104 deaths, amidst a shortage of beds, hotels and banquets set aside beds for Covid patients.https://t.co/1jZQAasQ0v
as
delhi
reported
the
highest
single-day
spike
with
17,282
new
#covid19
cases
and
104
deaths
,
amidst
a
shortage
of
beds
,
hotels
and
banquets
set
aside
beds
for
covid
patients.https
:/
/t.co/1jzqaasq0v
