# Imports 

In [1]:
# Numpy
import numpy as np

# Pandas
import pandas as pd

# Regular expression
import re

# Natural Language Toolkit
import nltk
from nltk.corpus import stopwords

# If you have never downloaded the NLTK stop-word list, uncomment the following line
# nltk.download('stopwords')

# For processing accented chars
import unidecode

In [2]:
# Convert tweet to lowercase
def to_lower_case(tweet):
    """Return a lowercased copy of the tweet.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The same text with every cased character in lowercase.
    """
    return tweet.lower()

In [3]:
# Sanity check: mixed-case input should come back entirely in lowercase.
to_lower_case('Test LOWER caSe')

'test lower case'

In [4]:
# Remove some chars at the beginning and the end of a tweet
def process_begin_and_end(tweet):
    """Trim spaces and quote characters from both ends of a tweet.

    Only the leading and trailing runs of space, double quote and single
    quote are removed; the same characters inside the text are kept.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The trimmed text.
    """
    edge_chars = ' "\''
    return tweet.strip(edge_chars)

In [5]:
# Sanity check: the outer quote and spaces are trimmed, the inner quote stays.
process_begin_and_end('" the Beging of the tweet is " and white space ')

'the Beging of the tweet is " and white space'

In [6]:
# Replace 2+ dots with a space
""" 
re{n,}
Matches n or more occurrences of preceding expression.
"""
def process_dots(tweet):
    """Collapse every run of two or more dots into a single space.

    A lone dot is left in place.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text with dot runs replaced by one space each.
    """
    dot_run = r'\.{2,}'
    return re.sub(dot_run, ' ', tweet)

In [7]:
# Sanity check: the trailing ellipsis becomes a single space.
process_dots('Two dots...')

'Two dots '

In [8]:
# Replace 2+ spaces with a single one
"""
\s = space
\s+ one or more space
"""
def process_spaces(tweet):
    """Squeeze every run of two or more space characters into one space.

    Only the literal space character is matched; tabs and newlines are
    left untouched.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text with space runs collapsed.
    """
    space_run = r'\ {2,}'
    return re.sub(space_run, ' ', tweet)

In [9]:
# Sanity check: each run of spaces shrinks to exactly one space.
process_spaces('Tweet    with     spaces')

'Tweet with spaces'

In [10]:
# Remove numbers from a tweet
"""
\d = any number
\D = anything but a number
"""
def process_digits(tweet):
    """Delete every digit character from a tweet.

    Surrounding whitespace is not adjusted, so removing a standalone number
    leaves the spaces that were around it.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text without digits.
    """
    single_digit = r'\d'
    return re.sub(single_digit, '', tweet)

In [11]:
# Sanity check: the digit is removed, leaving the two spaces that surrounded it.
process_digits('Tweet with 2 numbers')

'Tweet with  numbers'

In [12]:
# I took the stopwords from the nltk library and removed some words with a nuanced sense, like 'no', 'dont', etc.
# NOTE: this assignment rebinds the name `stopwords`, hiding the
# `nltk.corpus.stopwords` object imported at the top of the notebook.
stopwords = [
    # Personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # Interrogatives and demonstratives
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
    # Auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # Articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    # Prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'on', 'under',
    # Adverbs
    'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how',
    # Quantifiers and remaining function words
    'all', 'any', 'both', 'each', 'other', 'some', 'such', 'own', 'same',
    'so', 'than', 'too', 'will', 'just', 'now',
]

In [13]:
# Remove all stop words from a tweet
def process_stopwords(tweet):
    """Drop every stop word from a tweet.

    The tweet is split on whitespace and each token is compared, in
    lowercase, against the module-level ``stopwords`` list. Surviving tokens
    keep their original casing and are re-joined with single spaces.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The tweet without stop words.
    """
    # A set gives constant-time membership checks for each token.
    excluded = set(stopwords)
    kept_tokens = []
    for token in tweet.split():
        if token.lower() in excluded:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [14]:
# Sanity check: 'i', 'doing' and 'some' are dropped, while 'not' is kept.
process_stopwords('i like doing some nlp, not else')

'like nlp, not else'

In [15]:
# Replace URL with the word 'URL'
def process_url(tweet):
    """Swap every http/https URL in a tweet for the placeholder ``URL``.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text with each matched URL replaced by the literal 'URL'.
    """
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_regex, 'URL', tweet)

In [16]:
# Sanity check: both links are replaced by the 'URL' placeholder.
process_url('some links https://pythonprogramming.net/ https://www.facebook.com/')

'some links URL URL'

In [17]:
# Replace tags with the word 'USER_MENTION'
def process_user_mention(tweet):
    """Swap every @mention in a tweet for the placeholder ``USER_MENTION``.

    A mention is an '@' followed by one or more non-whitespace characters,
    so punctuation glued to the handle is consumed as well.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text with each mention replaced by 'USER_MENTION'.
    """
    mention_regex = r'@[\S]+'
    return re.sub(mention_regex, 'USER_MENTION', tweet)

In [18]:
# Sanity check: a bare mention is replaced by the 'USER_MENTION' placeholder.
process_user_mention('@happy')

'USER_MENTION'

In [19]:
# remove hashtag and replace it with the word without the #
def process_hashtag(tweet):
    """Strip the '#' marker from hashtags, keeping the tag text.

    Every '#' character is replaced by a single space, so glued hashtags
    ('#a#b') become separate words. Leading or doubled spaces may remain in
    the result.

    The previous implementation split the tweet on '#' and then ran a
    '#'-anchored regex over each fragment. That substitution could never
    match, because the fragments no longer contained '#', so the whole
    function reduces to this single replacement with identical output.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text with each '#' replaced by a space.
    """
    return tweet.replace('#', ' ')

In [20]:
# Sanity check: each '#' turns into a space, so leading and doubled spaces remain.
process_hashtag('#hashtag#hashtag1#hashtag2 #hashtag3')

' hashtag hashtag1 hashtag2  hashtag3'

In [21]:
# Remove retweets
"""
\b = word boundary (the edge between a word character and a non-word character)
"""
def process_retweet(tweet):
    """Delete the standalone retweet marker ``rt`` from a tweet.

    Only a lowercase 'rt' delimited by word boundaries is removed; words that
    merely contain 'rt' are untouched. The spaces around the marker are kept.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text without standalone 'rt' tokens.
    """
    retweet_marker = r'\brt\b'
    return re.sub(retweet_marker, '', tweet)

In [22]:
# Sanity check: the standalone 'rt' is removed while the word 'retweet' is untouched.
process_retweet(' rt retweet')

'  retweet'

In [23]:
# Remove punctuation
def process_punctuation(tweet):
    """Delete the characters , ; : . ? ! / from a tweet.

    Other punctuation, such as quotes, is left in place.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text without the listed punctuation characters.
    """
    punctuation_class = '[,;:.?!/]'
    return re.sub(punctuation_class, '', tweet)

In [24]:
# Sanity check: ':', ',' and '!' are removed, while the apostrophe is kept.
process_punctuation('It\' a tweet: with, some punctuation! :')

"It' a tweet with some punctuation "

In [25]:
# Classify emojis into positive and negative ones
def process_emojis(tweet):
    """Replace text emoticons with sentiment placeholders.

    Positive emoticons become ' EMO_POS ' and negative ones ' EMO_NEG '. The
    rules are applied in the order listed below. Every placeholder is padded
    with a space on both sides so it never fuses with a neighbouring token.
    The laugh rule previously lacked the leading space, which turned 'lol:D'
    into 'lolEMO_POS '.

    Matching is case-sensitive: the laugh and wink rules need an uppercase
    'D'.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The text with emoticons replaced by placeholders.
    """
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, placeholder in emoticon_rules:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet

In [26]:
# Sanity check: the smile maps to EMO_POS and the sad face to EMO_NEG.
process_emojis(':) :(')

' EMO_POS   EMO_NEG '

In [27]:
# Replace accented chars with their closest ASCII equivalents
def process_accented_chars(tweet):
    """Transliterate a tweet with ``unidecode.unidecode``.

    Args:
        tweet (str): Tweet text, possibly containing accented characters.

    Returns:
        str: Whatever ``unidecode.unidecode`` returns for the input.
    """
    return unidecode.unidecode(tweet)

In [28]:
# Sanity check: accented letters are replaced by their unaccented counterparts.
process_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [29]:
# Process a single word
def process_word(word):
    """Clean a single word: strip punctuation and squeeze repeated letters.

    Three substitutions run in this order:
      1. remove the characters ' " ? ! , . ( ) : ;
      2. shorten any run of the same character repeated two or more times
         to exactly two occurrences ('doooon' -> 'doon')
      3. remove ' " and -

    Hyphens are removed only after step 2, so characters on either side of a
    hyphen are not squeezed together.

    Args:
        word (str): A single token.

    Returns:
        str: The cleaned token.
    """
    without_punctuation = re.sub(r'[\'"?!,.():;]', '', word)
    squeezed = re.sub(r'(.)\1+', r'\1\1', without_punctuation)
    return re.sub(r'[\'"-]', '', squeezed)

In [30]:
# Sanity check: the repeated 'o' is squeezed to two and the apostrophe is dropped.
process_word("doooon't")

'doont'

In [31]:
# Process special chars from word
def process_special_chars(word):
    """Remove special characters from a word.

    Only ASCII letters, digits, underscores and whitespace are kept.

    The previous character class used the range ``A-z``, which also spans
    the ASCII characters between 'Z' and 'a' ([ \\ ] ^ _ `), so those symbols
    slipped through. The range is now ``A-Z``. The underscore is kept on
    purpose so placeholder tokens such as USER_MENTION and EMO_POS stay
    intact when they carry attached punctuation.

    Args:
        word (str): A single token.

    Returns:
        str: The token without special characters.
    """
    pattern_special_chars = r'[^a-zA-Z0-9_\s]'
    return re.sub(pattern_special_chars, '', word)

In [32]:
# Sanity check: the trailing '$' is stripped from the word.
process_special_chars("dedje$")

'dedje'

In [33]:
# Check that a word does not contain weird symbols
def is_valid_word(word):
    """Tell whether a token looks like a plain word.

    A valid word starts with an ASCII letter and continues only with ASCII
    letters, digits, dots or underscores.

    Args:
        word (str): A single token.

    Returns:
        bool: True when the token matches, False otherwise.
    """
    word_regex = r'^[a-zA-Z][a-zA-Z0-9\._]*$'
    # The pattern is anchored with '^', so matching from the start of the
    # string is the same as searching for it.
    return bool(re.match(word_regex, word))

In [34]:
# Sanity check: a purely alphabetic token is reported as valid.
is_valid_word("dededhe")

True

In [35]:
# Process words of a tweet
def process_words(tweet):
    """Clean every whitespace-separated token of a tweet.

    Tokens accepted by ``is_valid_word`` go through ``process_word``; all
    other tokens go through ``process_special_chars``. No token is dropped
    at this stage, although a token may be cleaned down to an empty string.
    The results are re-joined with single spaces.

    Args:
        tweet (str): Tweet text.

    Returns:
        str: The tweet rebuilt from the cleaned tokens.
    """
    cleaned_tokens = [
        process_word(token) if is_valid_word(token) else process_special_chars(token)
        for token in tweet.split()
    ]
    return ' '.join(cleaned_tokens)

In [36]:
def process_tweet(tweet):
    """Run a tweet through the full normalization pipeline.

    The stages are applied in the order listed in ``pipeline``; each one
    receives the output of the previous stage.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The normalized tweet.
    """
    # Lowercasing comes first, so the upper-case placeholders inserted by
    # later stages (URL, USER_MENTION, EMO_POS, EMO_NEG) keep their casing.
    # NOTE(review): because the text is already lowercase when process_emojis
    # runs, its patterns that require an uppercase 'D' cannot match here --
    # confirm whether that is intended.
    # NOTE(review): process_accented_chars is not part of this pipeline.
    pipeline = (
        to_lower_case,
        process_url,
        process_user_mention,
        process_hashtag,
        process_retweet,
        process_emojis,
        process_digits,
        process_dots,
        process_words,
        process_stopwords,
        process_punctuation,
        process_spaces,
        process_begin_and_end,
    )
    for stage in pipeline:
        tweet = stage(tweet)
    return tweet

In [43]:
# Load the raw training set from disk
df = pd.read_csv(filepath_or_buffer='../data/raw/train_set.csv')

In [44]:
import time

# Normalize every tweet and store the result in a new 'tweet_normalized'
# column. Series.map applies process_tweet to each element, which replaces
# the explicit row-by-row loop over positional indices.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
df['tweet_normalized'] = df['tweet'].map(process_tweet)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 7.520820379257202 seconds ---


In [45]:
# Preview the first rows, including the new 'tweet_normalized' column.
df.head()

Unnamed: 0,id,label,tweet,tweet_normalized
0,1,0,@user when a father is dysfunctional and is s...,USER_MENTION father dysfunctional selfish drag...
1,2,0,@user @user thanks for #lyft credit i can't us...,USER_MENTION USER_MENTION thanks lyft credit c...
2,3,0,bihday your majesty,bihday majesty
3,4,0,#model i love u take with u all the time in ...,model love u take u time ur
4,5,0,factsguide: society now #motivation,factsguide society motivation


In [46]:
# Keep only the identifier, the label and the normalized text
df_processed = df.loc[:, ['id', 'label', 'tweet_normalized']]

In [52]:
# Preview the reduced frame: only id, label and tweet_normalized remain.
df_processed.head()

Unnamed: 0,id,label,tweet_normalized
0,1,0,USER_MENTION father dysfunctional selfish drag...
1,2,0,USER_MENTION USER_MENTION thanks lyft credit c...
2,3,0,bihday majesty
3,4,0,model love u take u time ur
4,5,0,factsguide society motivation


In [55]:
# Write the processed training set to disk, without the index column
df_processed.to_csv(
    '../data/processed/train_set.csv',
    index=False,
    encoding='utf-8',
)