# Imports 

In [1]:
# Numpy
import numpy as np

# Pandas
import pandas as pd

# Regular expression
import re

# Natural Language Toolkit (NLTK)
import nltk
from nltk.corpus import stopwords

# If you have never downloaded the NLTK stopwords corpus, uncomment the line below
# nltk.download('stopwords')

In [2]:
# Convert tweet to lowercase
def to_lower_case(tweet):
    """Return a lowercased copy of ``tweet``."""
    return tweet.lower()

In [3]:
to_lower_case('Test LOWER caSe')

'test lower case'

In [4]:
# Remove some characters at the beginning and the end of a tweet
def process_begin_and_end(tweet):
    """Trim spaces, double quotes and single quotes from both ends of ``tweet``.

    Only the two ends are affected; the same characters inside the text stay.
    """
    edge_chars = ' "\''
    return tweet.strip(edge_chars)

In [6]:
process_begin_and_end('" the Beging of the tweet is " and white space ')

'the Beging of the tweet is " and white space'

In [12]:
# Replace 2+ dots with a space
""" 
re{ n,}
Matches n or more occurrences of preceding expression.
"""
def process_dots(tweet):
    """Replace each run of two or more consecutive dots with a single space.

    A lone dot is left as it is.
    """
    dot_run = re.compile(r'\.{2,}')
    return dot_run.sub(' ', tweet)

In [13]:
process_dots('Two dots...')

'Two dots '

In [14]:
# Replace 2+ spaces with a single one
"""
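\s = any whitespace character (space, tab, newline, ...)
\s+ = one or more whitespace characters
(process_spaces itself only matches the literal space character)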
"""
def process_spaces(tweet):
    """Squeeze each run of two or more space characters down to one space.

    Only the literal space character is matched; tabs and newlines are kept.
    """
    space_run = re.compile(r'\ {2,}')
    return space_run.sub(' ', tweet)

In [15]:
process_spaces('Tweet    with     spaces')

'Tweet with spaces'

In [16]:
# Remove numbers from a tweet
"""
\d = any digit
\D = anything but a digit
"""
def process_numbers(tweet):
    """Delete every digit from ``tweet``; the surrounding spaces are kept."""
    digit = re.compile(r'\d')
    return digit.sub('', tweet)

In [17]:
process_numbers('Tweet with 2 numbers')

'Tweet with  numbers'

In [19]:
# Custom stop-word list: taken from NLTK's stop words, with words that carry
# nuance or negation (e.g. 'no', 'not') left out so they stay in the tweets.
# NOTE: this assignment rebinds the name `stopwords`, shadowing the
# `stopwords` object imported from `nltk.corpus` at the top of the notebook.
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
             "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
             'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 
             'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
             'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'of', 'at', 
             'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
             'below', 'to', 'from', 'up', 'down', 'in', 'on', 'under', 'again', 'further', 'then', 'once', 
             'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'other', 'some', 
             'such', 'own', 'same', 'so', 'than', 'too', 'will', 'just', 'now']

In [20]:
# Remove all stop words from a tweet
def process_stopwords(tweet, stop_words=None):
    """Remove stop words from ``tweet``.

    Args:
        tweet: Text to filter. It is split on whitespace, so the result has
            single spaces between the remaining words.
        stop_words: Optional iterable of lowercase words to drop. When omitted,
            the module-level ``stopwords`` list is used. Passing a prebuilt
            ``set`` or ``frozenset`` avoids rebuilding the lookup set on
            every call.

    Returns:
        The remaining words joined by single spaces. Each word is lowercased
        before the lookup, so matching ignores the case of the tweet.
        Punctuation attached to a word counts as part of that word.
    """
    if stop_words is None:
        stop_words = stopwords
    if not isinstance(stop_words, (set, frozenset)):
        # Set membership is O(1) per word, versus a linear scan of a list.
        stop_words = set(stop_words)
    return ' '.join(word for word in tweet.split() if word.lower() not in stop_words)

In [21]:
process_stopwords('i like doing some nlp, not else')

'like nlp, not else'

In [22]:
# Replace URL with the word 'URL'
def process_url(tweet):
    """Substitute every http:// or https:// link in ``tweet`` with the token 'URL'."""
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub('URL', tweet)

In [24]:
process_url('some links https://pythonprogramming.net/ https://www.facebook.com/')

'some links URL URL'

In [48]:
# Replace user mentions (@name) with the word 'USER_MENTION'
def process_user_mention(tweet):
    """Swap each '@' followed by non-whitespace characters for 'USER_MENTION'.

    The match runs up to the next whitespace, so punctuation glued to the
    mention is replaced along with it.
    """
    mention = re.compile(r'@[\S]+')
    return mention.sub('USER_MENTION', tweet)

In [49]:
process_user_mention('@happy')

'USER_MENTION'

In [78]:
# remove hashtag and replace it with the word without the #
def process_hashtag(tweet):
    """Drop the '#' in front of hashtags while keeping the tag text.

    Every '#' immediately followed by one or more non-whitespace characters is
    replaced by those characters; a '#' followed by whitespace is left alone.
    """
    hashtag = re.compile(r'#(\S+)')
    return hashtag.sub(r'\1', tweet)

In [82]:
process_hashtag('#hashtag0 #hashtag1 #hashtag2 #hashtag3')

'hashtag0 hashtag1 hashtag2 hashtag3'

In [83]:
# Remove retweets
"""
\b = word boundary (the position between a word character and a non-word character)
"""
def process_retweet(tweet):
    """Remove the standalone retweet marker 'rt' from ``tweet``.

    The marker is matched as a whole word and in any letter case ('rt', 'RT',
    'Rt'), so raw tweets are handled as well as lowercased ones. Words that
    merely contain 'rt' (e.g. 'art', 'retweet') are untouched.

    Args:
        tweet: Text to clean.

    Returns:
        The text with every standalone marker deleted. The whitespace around
        a removed marker is kept.
    """
    # \b anchors both sides on word boundaries so only the bare token matches.
    return re.sub(r'\brt\b', '', tweet, flags=re.IGNORECASE)

In [84]:
process_retweet(' rt retweet')

'  retweet'

In [88]:
# Remove punctuation
def process_punctuation(tweet):
    """Strip the characters , ; : . ? ! / wherever they occur in ``tweet``."""
    punctuation = re.compile('[,;:.?!/]')
    return punctuation.sub('', tweet)

In [89]:
process_punctuation('It\' a tweet: with, some punctuation! :')

"It' a tweet with some punctuation "

In [13]:
# Classify emoticons into positive and negative ones
def process_emojis(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment tokens.

    Positive emoticons (smile, laugh, love, wink) become ' EMO_POS ' and
    negative ones (sad, cry) become ' EMO_NEG '. Every token is padded with a
    space on both sides so it never fuses with a neighbouring word.

    The patterns are case-sensitive: the laugh patterns and the wink patterns
    ending in 'D' need an uppercase 'D', so they do not match text that was
    lowercased beforehand.

    Args:
        tweet: Text to scan.

    Returns:
        The text with each recognised emoticon replaced by its token.
    """
    # (pattern, replacement) pairs, applied from top to bottom.
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, token in emoticon_rules:
        tweet = re.sub(pattern, token, tweet)
    return tweet

In [14]:
# Process a single word
def process_word(word):
    """Normalise a single word and return it.

    Three substitutions run in order:
      1. delete the punctuation characters ' " ? ! , . ( ) : ;
      2. shrink any run of one repeated character to exactly two of it
         (for example 'sooo' becomes 'soo')
      3. delete any quote or hyphen characters that remain
    Underscores are kept.
    """
    substitutions = (
        (r'[\'"?!,.():;]', ''),
        (r'(.)\1+', r'\1\1'),
        (r'[\'"-]', ''),
    )
    for pattern, replacement in substitutions:
        word = re.sub(pattern, replacement, word)
    return word

In [15]:
# Check that a word does not contain weird symbols
def is_valid_word(word):
    """Tell whether ``word`` is a letter followed only by letters, digits, '.' or '_'.

    Returns True when the whole word has that shape, False otherwise.
    """
    valid_shape = re.compile(r'^[a-zA-Z][a-zA-Z0-9\._]*$')
    if valid_shape.search(word):
        return True
    return False

In [16]:
# Process words of a tweet
def process_words(tweet):
    """Keep only the valid words of ``tweet`` and normalise each one.

    The tweet is split on whitespace. Words rejected by ``is_valid_word`` are
    dropped; the rest go through ``process_word`` and are joined back together
    with single spaces.
    """
    kept = [process_word(token) for token in tweet.split() if is_valid_word(token)]
    return ' '.join(kept)

In [17]:
def process_tweet(tweet):
    """Run the full cleaning pipeline on one tweet and return the cleaned text.

    Each step receives the output of the previous one, in the order listed
    in ``pipeline`` below.
    """
    # NOTE(review): lowercasing runs before process_emojis, whose laugh/wink
    # patterns require an uppercase 'D' - confirm whether ':D'-style emoticons
    # are meant to be left unmatched here.
    pipeline = (
        to_lower_case,
        process_url,
        process_user_mention,
        process_hashtag,
        process_retweet,
        process_emojis,
        process_numbers,
        process_dots,
        process_begin_and_end,
        process_punctuation,
        process_stopwords,
        process_spaces,
        process_words,
    )
    for step in pipeline:
        tweet = step(tweet)
    return tweet

In [18]:
# Load the training set; its 'tweet' column is overwritten with cleaned text in the next cell.
df = pd.read_csv('../data/train_set.csv')

In [19]:
import time

# Clean every tweet with a single Series.map call instead of a Python loop
# over range(len(df)); map does not depend on the frame having a default
# 0..n-1 integer index.
# Note: this overwrites the raw text, so re-running the cell re-cleans
# tweets that were already cleaned.
start_time = time.time()
df['tweet'] = df['tweet'].map(process_tweet)
print("--- %s seconds ---" % (time.time() - start_time))

--- 22.82310390472412 seconds ---


In [20]:
# Print the cleaned text of the rows labelled 0 through 19, one per line.
cleaned = df['tweet']
for row in range(20):
    print(cleaned[row])

USER_MENTION father dysfunctional selfish drags kids dysfunction run
USER_MENTION USER_MENTION thanks lyft credit use cause offer wheelchair vans pdx disapointed getthanked
bihday majesty
model love u take u time
factsguide society motivation
huge fan fare big talking leave chaos pay disputes get allshowandnogo
USER_MENTION camping tomorrow USER_MENTION USER_MENTION USER_MENTION USER_MENTION USER_MENTION USER_MENTION USER_MENTION
next school year year think school exams hate imagine actorslife revolutionschool girl
love land allin cavs champions cleveland clevelandcavaliers
USER_MENTION USER_MENTION welcome gr
ireland consumer price index climbed previous may blog silver gold forex
selfish orlando standwithorlando pulseshooting orlandoshooting biggerproblems selfish heabreaking values love
get see daddy today days gettingfed
USER_MENTION cnn calls michigan middle school chant tcot
comment australia opkillingbay seashepherd helpcovedolphins thecove helpcovedolphins
ouch junior junior yu

In [21]:
# Example illustrating the \1 and \2 backreferences in a replacement string:
# group 1 captures the month name, group 2 the day number.
regex = r"([a-zA-Z]+) (\d+)"
sample_dates = "June 24, August 9, Dec 12"

# Swapping the two groups reorders each date, so this prints:
#   24 of June, 9 of August, 12 of Dec
reordered = re.sub(regex, r"\2 of \1", sample_dates)
print(reordered)

24 of June, 9 of August, 12 of Dec


In [22]:

# Raw string: \( and \) are regex escapes, and in a normal string literal they
# are invalid escape sequences that newer Python versions warn about.
# The pattern matches a string made up only of one or more ":)" / ":(" smileys.
smiley_pattern = r'^(:\(|:\))+$'

def test_match(s):
    """Print ``s`` together with whether it matches the module-level ``smiley_pattern``."""
    if re.match(smiley_pattern, s):
        verdict = 'Matches!'
    else:
        verdict = 'Doesn\'t match.'
    print('Value: %s; Result: %s' % (s, verdict))

# Strings built only from back-to-back ":)" / ":(" smileys.
should_match = [
    ':)',   # Single smile
    ':(',   # Single frown
    ':):)', # Two smiles
    ':(:(', # Two frowns
    ':):(', # Mix of a smile and a frown
]
# Strings that contain anything besides back-to-back smileys.
should_not_match = [
    '',         # Empty string
    ':(foo',    # Extraneous characters appended
    'foo:(',    # Extraneous characters prepended
    ':( :(',    # Space between frowns
    ':( (',     # Extraneous characters and space appended
    ':(('       # Extraneous duplicate of final character appended
]

# Report each group under its heading, with a blank line between the groups.
case_groups = (
    ('The following should all match:', should_match),
    ('The following should all not match:', should_not_match),
)
for position, (heading, candidates) in enumerate(case_groups):
    if position:
        print('')   # Newline for output clarity
    print(heading)
    for candidate in candidates:
        test_match(candidate)

The following should all match:
Value: :); Result: Matches!
Value: :(; Result: Matches!
Value: :):); Result: Matches!
Value: :(:(; Result: Matches!
Value: :):(; Result: Matches!

The following should all not match:
Value: ; Result: Doesn't match.
Value: :(foo; Result: Doesn't match.
Value: foo:(; Result: Doesn't match.
Value: :( :(; Result: Doesn't match.
Value: :( (; Result: Doesn't match.
Value: :((; Result: Doesn't match.
