## Punctuation features


In [None]:
import itertools
import re
from collections import Counter

from json_io import tweet_iterate
from basic_nlp import nlp, PUNCTUATION_RE

# Relative path of the JSON file handed to tweet_iterate() in the preview loop below.
PATH_SARCASTIC_TWEETS = "../json/sarcastic/unique.json"
# Result of calling basic_nlp.nlp().
# NOTE(review): `n` is not referenced in the cells visible here -- confirm it is still needed.
n = nlp()

In [None]:
# is_surrogate_escaped() and remove_surrogate_escaping() are explained in the
# "Issue: Surrogate escaped strings" section further down.
# Both helpers are the methods suggested by the author of the article linked there.

def remove_surrogate_escaping(s, method='ignore'):
    """
    Strip or replace the characters of a string that cannot be encoded as UTF-8.

    The string is encoded to UTF-8 with the given error handler and decoded
    back, so every character the encoder rejects is either dropped or replaced.

    s: input string
    method: 'ignore' drops each unencodable character,
            'replace' substitutes a '?' for it
    returns a string that can be encoded as UTF-8 without errors
    raises ValueError if method is neither 'ignore' nor 'replace'
    """
    # Validate explicitly: an assert would be stripped when Python runs with -O,
    # letting other error handlers (e.g. 'strict') slip through unchecked.
    if method not in ('ignore', 'replace'):
        raise ValueError('invalid removal method')
    return s.encode('utf-8', method).decode('utf-8')

def is_surrogate_escaped(s):
    """
    Tell whether a string contains surrogate code points that block UTF-8 encoding.

    s: input string
    returns True if encoding s as UTF-8 fails with the reason
            'surrogates not allowed', False if encoding succeeds
    raises UnicodeEncodeError (re-raised unchanged) for any other encoding failure
    """
    try:
        s.encode('utf-8')
    except UnicodeEncodeError as err:
        # Only the surrogate failure is the condition being probed for;
        # anything else is a genuine error and must propagate.
        if err.reason != 'surrogates not allowed':
            raise
        return True
    else:
        return False

In [None]:
def punctuation_features(s):
    r"""
    Process a string for punctuation features.

    Punctuation is whatever PUNCTUATION_RE (imported from basic_nlp) matches.
    According to the original notes for this function, that regular expression is:
    [\'\!\"\#\$\%\&\/\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\}\|\~\u2026]
    i.e. the punctuation in string.punctuation plus the unicode char U+2026
    (ellipsis) -- confirm against basic_nlp if the exact set matters.

    s: input string
    returns {punctuation_mark: (raw #, fraction of len(s), fraction of all punctuation marks found in s)}
            Both fractions are rounded to 4 decimal places. Keys appear in order
            of first occurrence in s. A string without punctuation gives {}.

    example:
    punctuation_features("Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed consequat magna eu facilisis!!?")
    {',': (1, 0.0109, 0.2),
     '.': (1, 0.0109, 0.2),
     '!': (2, 0.0217, 0.4),
     '?': (1, 0.0109, 0.2)}
    """

    marks = re.findall(PUNCTUATION_RE, s)
    total_marks = len(marks)
    text_length = len(s)
    # Tally every mark in a single pass instead of calling list.count() once
    # per match; Counter keeps first-occurrence order, so key order is unchanged.
    # With no matches the comprehension is empty, so neither division runs.
    return {
        mark: (count,
               round(count / text_length, 4),
               round(count / total_marks, 4))
        for mark, count in Counter(marks).items()
    }

In [None]:
# Print the punctuation features of the first 25 tweets yielded by tweet_iterate().
first_tweets = itertools.islice(tweet_iterate(PATH_SARCASTIC_TWEETS), 25)
for tweet in first_tweets:
    text = tweet["text"]
    # Text containing surrogates cannot be printed as-is, so clean it first.
    text = remove_surrogate_escaping(text) if is_surrogate_escaped(text) else text
    features = punctuation_features(text)
    print("text: {} \n punctuation features: {} \n".format(text, features))

## Issue: Surrogate escaped strings

Read [http://lucumr.pocoo.org/2013/7/2/the-updated-guide-to-unicode/](http://lucumr.pocoo.org/2013/7/2/the-updated-guide-to-unicode/)

#### TL;DR
These are Unicode strings that cannot be encoded with a Unicode encoding such as UTF-8 because they are actually invalid: they contain surrogate code points. Such strings are created by APIs that assume the data is in a specific encoding but cannot guarantee it, because the underlying system does not fully enforce that encoding. This functionality is provided by the 'surrogateescape' error handler.

#### Example tweet

Ears lowered big ears to lower \ud83d\udc87\u200d\u2642\ufe0f\ud83d\ude02 #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm\u2026 https://t.co/oXK7RHlODU
_______
[https://twitter.com/jasonstats09/status/827589626944770049](https://twitter.com/jasonstats09/status/827589626944770049)

![alt text](https://image.ibb.co/jPkKBF/Screen_Shot_2017_03_16_at_4_03_36_PM.png)

In [None]:
# Text of the example tweet shown in the markdown cell above, written with \uXXXX escapes.
# NOTE(review): in a Python str literal each \ud83d / \udc87 / \ude02 escape presumably
# becomes a separate surrogate code point, which is why UTF-8 encoding rejects it -- confirm.
tweet = "Ears lowered big ears to lower \ud83d\udc87\u200d\u2642\ufe0f\ud83d\ude02 #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm\u2026 https://t.co/oXK7RHlODU"
# print(tweet)
# Uncomment the print above to reproduce the problem: it fails with UnicodeEncodeError.
# (As observed by the author, the error is dumped to the terminal running Jupyter
# Notebook and the notebook kernel crashes.)

In [None]:
# Printing only works after a UTF-8 round trip with a lenient error handler:
# first with "ignore", then with "replace".
for handler in ("ignore", "replace"):
    print(tweet.encode("utf-8", handler).decode("utf-8"))