# Classifying Tweet Emotions

## Data Preprocessing

In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
import re
import fasttext

In [2]:
# Load the dataset, keeping only the CSV columns at positions 1, 2 and 3
df = pd.read_csv('data/text_emotion.csv', usecols=[1, 2, 3])

### Data Cleansing

In [3]:
def preprocess_tweet(tweet):
    '''Normalize a raw tweet string.

    Steps, in order: lowercase the text; remove HTML entities; replace each
    hyperlink with the token "http"; replace each @mention with "@user";
    remove characters outside the Basic Multilingual Plane; collapse every
    run of whitespace (including a lone newline or tab) into one space;
    strip leading spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    '''
    # To lowercase (not good for VADER)
    tweet = tweet.lower()

    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'&\w*;', '', tweet)

    # Replace each hyperlink with the token "http". The match is limited to
    # the URL itself (non-whitespace characters ending in a word character
    # or "/"), so text that follows the link and trailing punctuation are
    # kept. This runs before the mention step so an "@" inside a URL is not
    # mistaken for a mention.
    tweet = re.sub(r'https?://\S*[\w/]', 'http', tweet)

    # Convert @username to "@user"
    tweet = re.sub(r'@\S+', '@user', tweet)

    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # Done before whitespace collapsing so a removed character cannot leave
    # a double space behind.
    tweet = ''.join(ch for ch in tweet if ch <= '\uFFFF')

    # Collapse every whitespace run (including single newlines and tabs)
    # into one space
    tweet = re.sub(r'\s+', ' ', tweet)

    # Remove the single space that may remain at the front of the tweet
    tweet = tweet.lstrip(' ')

    # Disabled optional steps: removing "@user" tokens, tickers such as
    # USD ($), hashtags (not good for VADER), punctuation, and words with
    # 2 or fewer letters.

    return tweet

# Run the tweet cleaner over every entry of the "content" column
df['content'] = df['content'].map(preprocess_tweet)

In [4]:
# Keep rows whose sentiment is not "empty" and whose tweet has at least two
# characters. .copy() yields an independent frame so the assignment below
# does not write into a slice of the previous one.
has_label = df.sentiment != 'empty'
too_short = df.content.str.len() < 2
df = df[has_label & ~too_short].copy()

# Change sentiment of the tweets with only mentions to "neutral": a tweet
# qualifies when fewer than 3 characters remain after removing every
# @mention. regex=True is passed explicitly because pandas >= 2.0 treats the
# pattern as a literal string by default, which would match nothing here.
only_mentions = df.content.str.replace(r'@[^\s]+', '', regex=True).str.len() < 3
df.loc[only_mentions, 'sentiment'] = "neutral"

### Feature Engineering

In [5]:
# Lookup table from each EMOTION label to its coarse SENTIMENT.
# ('empty' has no entry.)
sentiment_dict = {
    **dict.fromkeys(['boredom', 'hate', 'sadness', 'anger', 'worry'], 'negative'),
    **dict.fromkeys(['relief', 'happiness', 'love', 'enthusiasm', 'surprise', 'fun'], 'positive'),
    'neutral': 'neutral',
}

# Derive the "polarity" column by looking up each row's emotion label
df['polarity'] = df['sentiment'].map(sentiment_dict)

In [6]:
def count_mentions(text):
    '''Count the whitespace-separated tokens of ``text`` that begin with "@".'''
    return sum(1 for token in text.split() if token.startswith("@"))

# Store the per-tweet number of mentions in "mention_count"
df['mention_count'] = df.content.map(count_mentions)

In [7]:
def count_hashtags(text):
    '''Count the whitespace-separated tokens of ``text`` that begin with "#".'''
    total = 0
    for token in text.split():
        if token.startswith("#"):
            total += 1
    return total

# Store the per-tweet number of hashtags in "hashtag_count"
df['hashtag_count'] = df.content.map(count_hashtags)

In [8]:
# Create a feature "char_count": number of characters in each tweet
df['char_count'] = df['content'].str.len()

# The columns below are 0/1 flags. Plain substrings are matched literally
# (regex=False); the only real pattern is a raw string so that its
# backslashes are not read as (invalid) Python string escapes.

# Create a new column "has_link": tweet contains the substring "http"
df['has_link'] = df['content'].str.contains("http", regex=False).astype('int64')

# Create a new column "ex_point": tweet contains an exclamation mark
df['ex_point'] = df['content'].str.contains("!", regex=False).astype('int64')

# Create a new column "qu_mark": tweet contains a question mark
df['qu_mark'] = df['content'].str.contains("?", regex=False).astype('int64')

# Create a new column "multi_period": tweet contains two or more consecutive periods
df['multi_period'] = df['content'].str.contains(r"\.\.+", regex=True).astype('int64')

In [9]:
# Load pretrained language model
language_model = fasttext.load_model('data/lid.176.bin')

def detect_fasttext(tweet):
    '''Return the language label fastText predicts for ``tweet``.

    Parameters
    ----------
    tweet : str
        Text to classify.

    Returns
    -------
    str
        The top predicted label with its "__label__" prefix removed
        (for example "en").
    '''
    # fastText's predict() processes one line at a time and raises
    # ValueError when the text contains a newline, so flatten the tweet to
    # a single line first.
    single_line = tweet.replace("\n", " ")

    # predict() returns the labels first; take the top one
    top_label = language_model.predict(single_line)[0][0]

    # Strip the "__label__" prefix
    return top_label.split("__label__")[1]

# Store the detected language of every tweet in "language"
df['language'] = df['content'].map(detect_fasttext)




In [10]:
# Keep only the tweets detected as English, then discard the helper
# "language" column
df = df[df['language'] == 'en'].drop(columns='language')

In [11]:
# Show the (rows, columns) size of the current DataFrame
df.shape

(38619, 11)

In [12]:
# Preview the first rows of the current DataFrame
df.head()

Unnamed: 0,sentiment,author,content,polarity,mention_count,hashtag_count,char_count,has_link,ex_point,qu_mark,multi_period
1,sadness,wannamama,layin n bed with a headache ughhhh...waitin on...,negative,0,0,59,0,0,0,1
2,sadness,coolfunky,funeral ceremony...gloomy friday...,negative,0,0,35,0,0,0,1
3,enthusiasm,czareaquino,wants to hang out with friends soon!,positive,0,0,36,0,1,0,0
4,neutral,xkilljoyx,@user we want to trade with someone who has ho...,neutral,1,0,77,0,0,0,0
5,worry,xxxPEACHESxxx,re-pinging @user why didn't you go to prom? bc...,negative,1,0,75,0,0,1,0


### Feature Selection

In [13]:
# Discard the "author" column
df = df.drop(columns='author')

## Save Engineered Data

In [14]:
# Write the preprocessed DataFrame to disk as a pickle file
df.to_pickle("data/emotions_preprocessed.pkl")