In [2]:
import pandas as pd
import re
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Functions

In [7]:
# Patterns are compiled once when the cell runs instead of being rebuilt on
# every call; preprocess_apply is invoked once per tweet.
_URL_RE = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)")
_USER_RE = re.compile(r"@[^\s]+")
_REPEAT_RE = re.compile(r"(.)\1\1+")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9<>]")

# Emoticon patterns: an "eyes" character, an optional "nose", then the mouth.
# They are applied in this order, after the tweet has been lower-cased.
# NOTE(review): the letter-mouthed patterns are not anchored to token
# boundaries, so ordinary text can match (e.g. "8pm" becomes "<lolface>m").
# Left as-is to keep existing output stable -- confirm whether that is wanted.
_EMOTICON_SUBS = (
    (re.compile(r"<3"), "<heart>"),
    (re.compile(r"[8:=;]['`\-]?[)d]+"), "<smile>"),
    (re.compile(r"[8:=;]['`\-]?\(+"), "<sadface>"),
    (re.compile(r"[8:=;]['`\-]?[\/|l*]"), "<neutralface>"),
    (re.compile(r"[8:=;]['`\-]?p+"), "<lolface>"),
)


def preprocess_apply(tweet, contractions_dict):
    """Normalise one raw tweet into lower-case text with placeholder tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace URLs with ``<url>`` and ``@mentions`` with ``<user>``;
      3. collapse any character repeated three or more times down to two
         (this applies to every character, so "1000" becomes "100");
      4. replace ``<3`` and text emoticons with ``<heart>``, ``<smile>``,
         ``<sadface>``, ``<neutralface>`` and ``<lolface>``;
      5. expand contractions using ``contractions_dict``;
      6. replace every character outside ``a-z``, ``0-9``, ``<`` and ``>``
         with a single space (this also separates words joined by ``/``).

    Args:
        tweet: The raw tweet text (a ``str``).
        contractions_dict: Mapping of contraction -> expansion. Keys are
            matched as plain substrings of the lower-cased tweet, so they
            should be lower-case.

    Returns:
        The cleaned tweet. Whitespace is not squeezed, so removed characters
        can leave runs of spaces.
    """
    tweet = tweet.lower()

    tweet = _URL_RE.sub('<url>', tweet)
    tweet = _USER_RE.sub('<user>', tweet)

    # Runs of 3+ identical characters are shortened to exactly two.
    tweet = _REPEAT_RE.sub(r"\1\1", tweet)

    for pattern, token in _EMOTICON_SUBS:
        tweet = pattern.sub(token, tweet)

    # Longest keys first: otherwise a short contraction ("can't") rewrites the
    # front of a longer one ("can't've") and the longer key never matches.
    # sorted() is stable, so equal-length keys keep their dict order.
    for contraction in sorted(contractions_dict, key=len, reverse=True):
        tweet = tweet.replace(contraction, contractions_dict[contraction])

    # Everything that is not a lower-case letter, digit or part of a <token>
    # becomes a space. '/' is covered here too, so no separate step is needed.
    return _NON_ALNUM_RE.sub(' ', tweet)

# Cleaning

## Read input

In [9]:
# Single source of truth for the raw input location. Previously this variable
# named a different file than the one actually loaded and was never used; it
# now points at the file that read_csv has always read.
data_path = "./data/input.csv"
column_names = ["sentiment", "ids", "date", "flag", "user", "text"]

# Column labels come from `names=`; with no `header=` given, pandas treats the
# first line of the file as data rather than as a header row.
df = pd.read_csv(data_path, encoding="ISO-8859-1", names=column_names)

In [10]:
# Normalize target values to 0/1: raw label 4 becomes 1.
# 1 is mapped to itself so that re-running this cell on already-normalized
# labels is a no-op; Series.map yields NaN for any value missing from the
# dict, which previously wiped every 1 on a second run. Any other unexpected
# label still becomes NaN, exactly as before.
df['sentiment'] = df['sentiment'].map({0: 0, 1: 1, 4: 1})

## Create contraction dict

In [13]:
# Load the contraction table, indexed by the contraction text itself.
contractions = pd.read_csv('./data/contractions.csv', index_col='Contraction')

# Lower-case both the keys and the expansions.
contractions.index = contractions.index.str.lower()
contractions['Meaning'] = contractions['Meaning'].str.lower()

# Flatten the 'Meaning' column into a plain {contraction: meaning} dict.
contractions_dict = contractions['Meaning'].to_dict()


## Cleaning

In [14]:
# Clean every tweet; contractions_dict is forwarded to preprocess_apply as its
# second positional argument.
df['processed_text'] = df['text'].apply(preprocess_apply, args=(contractions_dict,))

### Save cleaned

In [16]:
# Persist only the label and the cleaned text, without the index column.
df.to_csv(
    "data/cleaned_data.csv",
    columns=["sentiment", "processed_text"],
    index=False,
)

# Visualize word cloud

In [None]:
# Join every cleaned tweet into one space-separated corpus string.
all_words = ' '.join(df['processed_text'])

# Build the cloud from the whole corpus (random_state is fixed at 21).
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()