In [1]:
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib as plt

#Natural Language Toolkit
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [3]:
#Load in the DF of tweets
tweet_df = pd.read_csv('tweets1.csv', index_col=0, encoding='utf-8')

#Consider only tweets from the US
# .copy() gives an independent frame, so columns added to tweet_df later are
# not written onto a slice of the frame returned by read_csv.
tweet_df = tweet_df[tweet_df.country == 'United States'].copy()

#Load in the dataframe of word-happiness rankings
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with the same separator and no index column is its replacement.
sent_df = pd.read_csv('Data_Set_S1.txt', sep='\t', index_col=None)

#Load in stopwords
cachedStopWords = stopwords.words("english")

Let's prepare some regular expressions to handle abbreviations that people sometimes extend for emphasis like 'lollllll' or 'lmaoooo' or 'hahaha'. (Haven't finished this yet).

In [4]:
import re

#lollll -> lol. lmaoooooo -> lmao.
def emph_stem(s):
    """Collapse every run of 3+ identical word characters in `s` to one character."""
    return re.sub(r'((\w)\2{2,})', r'\2', s)

#this has its own happiness rating
def lolol(s):
    """Rewrite 'lolol' followed by any number of extra 'ol' as plain 'lolol'."""
    return re.sub(r'lolol(ol)*', 'lolol', s)

#(all three of these appear separately in the word positivity list)
# The laughter patterns are wrapped in \b...\b so they only fire on a
# standalone run of a/h letters. Without the boundaries, the a/h tail of an
# ordinary elongated word matched too ("yeahhh" -> "yehaha") and the word
# never reached emph_stem. The character class is [ah]; a '|' inside [...]
# is a literal pipe, not alternation.
def haha(s):
    """Rewrite a standalone run of 3-4 a/h letters as 'haha'."""
    return re.sub(r'\b[ah]{3,4}\b', 'haha', s)

def hahaha(s):
    """Rewrite a standalone run of 5-6 a/h letters as 'hahaha'."""
    return re.sub(r'\b[ah]{5,6}\b', 'hahaha', s)

def hahahaha(s):
    """Rewrite a standalone run of 7-140 a/h letters as 'hahahaha'."""
    return re.sub(r'\b[ah]{7,140}\b', 'hahahaha', s)

def apply_re(word):
    """Normalise an emphasised/elongated token.

    The rules are tried in a fixed order (lolol, haha, hahaha, hahahaha,
    emph_stem) and the result of the first rule that changes `word` is
    returned; later rules are not applied on top of it.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The rewritten token, or `word` itself when no rule changes it.
    """
    for rule in (lolol, haha, hahaha, hahahaha, emph_stem):
        # Evaluate each substitution once and reuse the result.
        rewritten = rule(word)
        if rewritten != word:
            return rewritten
    return word

#Let's create some functions to get simple happiness metrics for each tweet.

Here's one that takes in raw tweet text, removes stopwords and punctuation (this includes emoticons, but maybe it shouldn't), and returns the tweet as a list of words.

In [5]:
def tweet_to_list(tweet):
    """Turn raw tweet text into a list of normalised, non-stopword tokens.

    The text is tokenised with nltk, tokens that are not purely alphabetic
    are dropped (this removes punctuation, numbers and emoticons), the rest
    are lower-cased, stopwords and 'https' are removed, and each remaining
    token is passed through apply_re.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Build the exclusion set locally instead of appending 'https' to the
    # shared cachedStopWords list on every call: that list grew by one entry
    # per tweet, and each membership test scanned the whole list.
    excluded = set(cachedStopWords)
    excluded.add('https')

    tokens = nltk.tokenize.word_tokenize(tweet)
    lowered = [token.lower() for token in tokens if token.isalpha()]

    return [apply_re(token) for token in lowered if token not in excluded]

The function avg_happiness takes in the list of words in a tweet, obtains the frequency distribution of each word, then calculates a weighted average "happiness" score based on each word's happiness rating (from Data_Set_S1).

In [6]:
def avg_happiness(text_list, sentiments=None):
    """Frequency-weighted mean happiness score of the words in a tweet.

    Each word found in the happiness table contributes its
    `happiness_average` once per occurrence in `text_list`; words missing
    from the table are ignored.

    Parameters
    ----------
    text_list : list of str
        Tokens of one tweet (e.g. the output of tweet_to_list).
    sentiments : pandas.DataFrame, optional
        Table with `word` and `happiness_average` columns. Defaults to the
        notebook-level `sent_df`.

    Returns
    -------
    float or None
        The weighted average, or None when no token has a rating.
    """
    if sentiments is None:
        sentiments = sent_df

    if len(text_list) == 0:
        return None

    # Number of occurrences of each distinct token.
    word_counts = pd.Series(list(text_list)).value_counts()

    # A single vectorised pass over the table instead of one full scan per word.
    rated = sentiments[sentiments.word.isin(word_counts.index)]
    if rated.empty:
        return None

    weights = rated.word.map(word_counts)
    # Keep the scores as floats: truncating with int() discards the
    # fractional part of every rating before averaging.
    scores = rated.happiness_average.astype(float)

    return float((scores * weights).sum()) / float(weights.sum())

Now create a new column in tweet_df with happiness scores for each tweet. Note that with only around 35,000 tweets, the following takes ~30 minutes to run. This means that we probably shouldn't use all of the million+ tweets we're collecting — rather, we should probably try to sample a reasonable, equal number of tweets from each state in the hopes of obtaining balanced representation (i.e., excluding some tweets from the more Twitter-heavy states once we have enough overall).

In [None]:
%time

happiness_score = lambda tweet: avg_happiness(tweet_to_list(tweet))

happ_scores = tweet_df.text.apply(happiness_score)

In [None]:
#add column of happiness scores
tweet_df['happiness'] = happ_scores

#remove tweets with NaN happiness scores
# subset= limits the check to the happiness column; a bare dropna() would
# also discard tweets that are merely missing a value in some other column.
tweet_df = tweet_df.dropna(subset=['happiness'])

tweet_df.head()

In [None]:
city_list = ['New York', 'Detroit', 'Chicago']

# Overlay one kernel density estimate of tweet happiness per city.
for city in city_list:
    # Loop-local name, so the notebook-level `happ_scores` (the per-tweet
    # scores computed earlier) is not overwritten by a per-city subset.
    city_scores = tweet_df.happiness[tweet_df.city == city]
    ax = sns.kdeplot(city_scores, label=city)

ax.set(title='Tweet happiness by city', xlabel='Average happiness score', ylabel='Density')
ax.legend();

In [None]:
def norm(col):
    """Centre `col` on its mean and scale it by its range (max - min)."""
    centred = col - np.mean(col)
    value_range = max(col) - min(col)
    return centred / value_range

# Item assignment creates a real column. Attribute assignment
# (tweet_df.norm_happ = ...) only sets a Python attribute on this object, so
# the values would be missing from copies, filtered frames and to_csv output.
tweet_df['norm_happ'] = norm(tweet_df.happiness)

#add boolean tweet positivity indicator
tweet_df['bool_happ'] = tweet_df['norm_happ'] > 0

# Single-argument print(...) calls run under both Python 2 and Python 3.
print('Ratio positive tweets in New York: ' + str(np.mean(tweet_df[tweet_df.city=='New York'].bool_happ)))
# NOTE(review): 'Arizona' is a state name being matched against the city
# column - confirm this is the intended filter.
print('Ratio positive tweets in Arizona: ' + str(np.mean(tweet_df[tweet_df.city=='Arizona'].bool_happ)))

In [None]:
# Write the current tweet_df to 'tweets_w_sent.csv' as UTF-8 encoded CSV.
tweet_df.to_csv('tweets_w_sent.csv', encoding='utf-8')