In [1]:
import numpy as np
import pandas as pd
import csv
import operator 
from collections import Counter, defaultdict

In [2]:
tweets = pd.read_pickle('cleaned_non_retweets.pkl')
tweets.shape

(14283, 15)

In [5]:
sample_tweet = tweets['text'][0]
sample_tweet

'"@ntvuganda: Kizza Besigye has been arrested by Police in Naguru for unclear reasons #UgandaDecides https://t.co/n17GCGQHUc"'

In [6]:
#Importing beautiful soup to clean the tweet texts
from bs4 import BeautifulSoup
# b_classifier = BeautifulSoup(sample_tweet)
# print b_classifier.get_text()

In [13]:
import re

emoticons_str = r"""(?:
        [:=;] 
        [oO\-]? 
        [D\)\]\(\]/\\OpP] 
    )"""
# Eyes
# Nose (optional)
# Mouth
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        #tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
        tokens = [token if tokens_re.search(token) else token.lower() for token in tokens]
    return tokens

In [14]:
sample_tweet_ = preprocess(sample_tweet)

In [15]:
sample_tweet_

['"',
 '@ntvuganda',
 ':',
 'Kizza',
 'Besigye',
 'has',
 'been',
 'arrested',
 'by',
 'Police',
 'in',
 'Naguru',
 'for',
 'unclear',
 'reasons',
 '#UgandaDecides',
 'https://t.co/n17GCGQHUc',
 '"']

In [None]:
#Tokenization from NLTK

In [16]:
import nltk
from nltk.corpus import stopwords
stop = stopwords.words("english")

#### Creatinga function for re-usability

In [22]:
def tweets_to_words(tweets,stem):
    # 1. Remove HTML
    tweets_text = BeautifulSoup(tweets, 'lxml').get_text() 
    # 2. Remove non-letters        
    tweet_words = preprocess(tweets_text)
    # 3. Convert to lower case, split into individual words  
    tweets_lower = [tweet.lower() for tweet in tweet_words]
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words= [w for w in tweets_lower if not w in stops]
    meaningful_words = [x.decode("utf-8").encode('utf','ignore') for x in meaningful_words]
    #
    # 6. Doing Stemming or Lemmatization (Normalising the text)
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    if (stem=='S'):  # Choosing between Stemming ('S') and Lemmatization ('L')
        stemmer=PorterStemmer()
        final_words=[stemmer.stem(x) for x in meaningful_words]
    else: 
        lemma=WordNetLemmatizer()
        final_words=[lemma.lemmatize(x) for x in meaningful_words]
    # 7. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join(final_words))  

In [24]:
tweets_to_words(tweets['text'][0],"L")

u'" @ntvuganda : kizza besigye arrested police naguru unclear reason #ugandadecides https://t.co/n17gcgqhuc "'

In [None]:
num_tweets = tweets['text'].size

### Now let's loop through and clean all of the training set 
### at once (this might take a few minutes depending on your computer)

In [None]:
print "Cleaning and parsing the training set movie reviews...\n"
clean_tweets = []
for i in xrange(0, num_tweets):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print "Review %d of %d\n" % ( i+1, num_tweets)                                                                    
    clean_tweets.append(tweets_to_words(tweets['text'][i],"L"))

In [None]:
df = pd.DataFrame(clean_tweets, columns=['tweets'])

In [None]:
df.to_csv('final_cleaned_tweets_new.csv', encoding='utf-8')

In [3]:
search_word = 'besigye' # pass a term
search_museveni = 'museveni'
search_kiggundu = 'kiggundu'
count_search = Counter()
count_museveni = Counter()
count_kiggundu = Counter()
count_all = Counter()

In [4]:
# remember to include the other import from the previous post
com = defaultdict(lambda : defaultdict(int))
#We could also look for a specific term and extract its most frequent co-occurrences.

reader = csv.reader(open('final_cleaned_tweets_new.csv'), delimiter=",")
tweet_data = []
for row in reader:
    tweet_data.append(row[1])

from nltk.corpus import stopwords
import string
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via','\xe2','\x80', '\xa6','@nbstv','\xed', 'RT', '\xa0','\xbd','I'
                                                  ,'amp','We','says','\x82', '\xb8', '\x8a', '\xbc', '\xbe','\x8c', '\x9c', ':/'
                                                  ,'know','The','https','11','78','96','If','one','go','\xad','\xad', 
                                                   '\xc2','\xc2', '\xcb','\xc2', '\xc3','\xae', '\xc2','\xad', '\xcb',
                                                   '\xaa', '\xc2','\xad', '\xc3','\xa2', '\xc2','\xad', '\xae','\xc2', 
                                                   '\xc5','\xad', '\xc5','\xa4', '\xc2','\xaa', '\xad','\x98', '\xc2',
                                                   '\xc3', '\xc5','\xa2', '\xad', '\xc3', '\xcb', '\xb9', '\xc2', '\xae', '\xcb', 
                                                   '\xb6', '\xc2','\xad','\xc2','\xc3', '\xc5','\xbf','\x9e', '\xb0','\xab', '\xb0',
                                                   '\x8f', '\xb0','\x9d', '\xb0','\x8d', '\xb0','\x81', '\xb0','vote', '\xb0', '\xb0', '\xb0']

count_terms = Counter()
tweet_data[3]
toks = []
for i in range(len(tweet_data)):
#     tokens = preprocess(tweet_data[i])
#     toks.append(tokens)
#     terms_all = [term for term in preprocess(tweet_data[i])]
    terms_all = [term.lower() for term in preprocess(tweet_data[i]) if term not in stop and not term.startswith(('#', '@'))] 
    # Count terms only once, equivalent to Document Frequency
    terms_single = set(terms_all)
    # Count hashtags only
    terms_hash = [term.lower() for term in preprocess(tweet_data[i]) if term.startswith('#')]
    # Count terms only (no hashtags, no mentions)
    terms_only = [term.lower() for term in preprocess(tweet_data[i]) if term not in stop and not term.startswith(('#', '@'))] 
    # mind the ((double brackets))
    # startswith() takes a tuple (not a list) if 
    # we pass a list of inputs
        # Update the counter
    
    # Build co-occurrence matrix
    for i in range(len(terms_only)-1):            
        for j in range(i+1, len(terms_only)):
            w1, w2 = sorted([terms_only[i], terms_only[j]])                
            if w1 != w2:
                com[w1][w2] += 1
#We could also look for a specific term and extract its most frequent co-occurrences.
    if search_word in terms_only:
            count_search.update(terms_only)
    if search_museveni in terms_only:
        count_museveni.update(terms_only)
    if search_kiggundu in terms_only:
            count_kiggundu.update(terms_only)

    count_all.update(terms_all)
    count_terms.update(terms_hash)
#toks
# Print the first 5 most frequent words
print '\n Print the first 5 most frequent words'
print count_all.most_common(40)
print '\n Print the first 5 most frequent hashtags'
print count_terms.most_common(40)


com_max = []
# For each term, look for the most common co-occurrent terms
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
# Get the most frequent co-occurrences
terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)
print '\n Get the most frequent co-occurrences'
print(terms_max[:20])

print '\n\n'
print("Co-occurrence for %s:" % search_word)
print(count_search.most_common(40))
print '\n\n'
print("Co-occurrence for %s:" % search_museveni)
print(count_museveni.most_common(40))
print '\n\n'
print("Co-occurrence for %s:" % search_kiggundu)
print(count_kiggundu.most_common(40))


 Print the first 5 most frequent words
[('uganda', 1335), ('election', 1298), ('voting', 1246), ('polling', 1164), ('station', 1118), ('medium', 934), ('social', 885), ('besigye', 883), ('ugandan', 835), ('museveni', 748), ('ballot', 711), ('u', 701), ('result', 674), ('people', 668), ('time', 564), ('material', 557), ('day', 485), ('pm', 449), ('today', 438), ('voter', 430), ('ec', 422), ('arrested', 421), ('still', 420), ('2', 417), ('like', 412), ('police', 393), ('president', 374), ('m7', 368), ('say', 354), ('4', 350), ("it's", 343), ('paper', 322), ('candidate', 322), ('vpn', 320), ('twitter', 311), ('yet', 306), ('blocked', 297), ('get', 295), ('delay', 289), ("don't", 275)]

 Print the first 5 most frequent hashtags
[('#ugandadecides', 14282), ('#ugandaelections', 751), ('#uganda', 395), ('#musevenidecides', 368), ('#ivoted', 138), ('#wesigebesigye', 96), ('#socialmedia', 80), ('#besigye', 78), ('#besigyewon', 78), ('#ipledgepeaceug', 78), ('#ugdecides', 66), ('#museveni', 64)



### Secondly, our first plot. Using the list of most frequent terms (without hashtags) 
#### from our data set, we want to plot their frequencies

# Sentiment Analysis is one of the interesting applications of text analytics. 

In [5]:
count_top = count_all.most_common(20)
print count_top

[('uganda', 1335), ('election', 1298), ('voting', 1246), ('polling', 1164), ('station', 1118), ('medium', 934), ('social', 885), ('besigye', 883), ('ugandan', 835), ('museveni', 748), ('ballot', 711), ('u', 701), ('result', 674), ('people', 668), ('time', 564), ('material', 557), ('day', 485), ('pm', 449), ('today', 438), ('voter', 430)]


In [6]:
terms_top = terms_max[:100]
print terms_top

[(('polling', 'station'), 1012), (('medium', 'social'), 881), (('material', 'voting'), 366), (('ballot', 'paper'), 345), (('election', 'uganda'), 297), (('station', 'voting'), 281), (('polling', 'voting'), 261), (('arrested', 'besigye'), 256), (('besigye', 'kizza'), 239), (('pm', 'voting'), 230), (('ballot', 'box'), 215), (('7', 'pm'), 213), (('material', 'polling'), 212), (('besigye', 'museveni'), 184), (('provisional', 'result'), 183), (('election', 'peaceful'), 183), (('4', 'pm'), 180), (('7', 'voting'), 172), (('material', 'station'), 167), (('arrested', 'kizza'), 164), (('day', 'election'), 164), (('time', 'voting'), 161), (('medium', 'uganda'), 161), (('social', 'uganda'), 158), (('museveni', 'uganda'), 157), (('museveni', 'president'), 145), (('mobile', 'money'), 144), (('facebook', 'twitter'), 143), (('blocked', 'social'), 141), (('blocked', 'medium'), 140), (('delay', 'voting'), 132), (('election', 'ugandan'), 132), (('election', 'fair'), 131), (('voting', 'yet'), 126), (('ele

In [7]:
import math
from __future__ import division

In [8]:
n_docs = len(count_all)

In [9]:
n_docs

16350

In [10]:
# n_docs is the total n. of tweets
p_t = {}
p_t_com = defaultdict(lambda : defaultdict(int))

i = 0

if i < 20:
    for term, n in count_all.items():
        p_t[term] = n / n_docs
        for t2 in com[term]:
            p_t_com[term][t2] = com[term][t2] / n_docs
    i = i+1

In [11]:
def read_words(words_file):
    return [word for line in open(words_file, 'r') for word in line.split()]
positive_vocab = read_words('positive-words-edit.txt')
negative_vocab = read_words('negative-words-edit.txt')

In [None]:
i = 0
pmi = defaultdict(lambda : defaultdict(int))

if i < 20:
    for t1 in p_t:
        for t2 in com[t1]:
            denom = p_t[t1] * p_t[t2]
            pmi[t1][t2] = math.log((p_t_com[t1][t2] / denom),2)
    i = i+1
 
semantic_orientation = {}
p = 0
if p < 20:
    for term, n in p_t.items():
        positive_assoc = sum(pmi[term][tx] for tx in positive_vocab)
        negative_assoc = sum(pmi[term][tx] for tx in negative_vocab)
        semantic_orientation[term] = positive_assoc - negative_assoc
    p = p+1

# We can print out the semantic orientation for some terms

In [14]:
semantic_sorted = sorted(semantic_orientation.items(), 
                         key=operator.itemgetter(1), 
                         reverse=True)
top_pos = semantic_sorted[:100]
top_neg = semantic_sorted[-50:]

print 'The Top Positive Terms'
print(top_pos)
print '\nThe Top Negative Terms'
print(top_neg)

The Top Positive Terms
[('church', 40.73243619920966), ('friend', 37.81271437162385), ('article', 36.99788432309667), ('crossed', 36.773402613852475), ('enjoy', 36.60466873498684), ('gave', 35.19359840488293), ('economic', 34.787376579081254), ('ensuring', 33.258736638037135), ('heart', 31.91018642123106), ('god', 30.958631786321995), ('https://t.co/uj96xhqorp', 30.940075327285683), ('apwoyo', 30.940075327285683), ('humbling', 30.940075327285683), ('23', 30.45662736374392), ('bos', 28.85452801032922), ('col', 28.835085053412946), ('2021', 28.571096478517358), ('anything', 27.796474100244964), ('condition', 27.77408957855264), ('addition', 27.360080315589723), ('apac', 26.132720405228078), ('ba', 26.08376778801862), ('https://t.co/s2pc6cfml0', 25.294777163337056), ('https://t.co/vhfh11mhhf', 25.086453871536854), ('kisoro', 24.891616988752922), ('build', 24.625811012020034), ('https://t.co/7zmgp00848', 24.50336378514847), ('god-like', 24.398823799980583), ('kotido', 24.1164750750382), ('

In [13]:
print("besigye: %f" % semantic_orientation['besigye'])
print("museveni: %f" % semantic_orientation['museveni'])
print("kiggundu: %f" % semantic_orientation['kiggundu'])

besigye: -49.405135
museveni: -32.226432
kiggundu: -16.960529


In [24]:
word1 = 'museveni'
word2 = 'president'
print(semantic_orientation[word1], 
      semantic_orientation[word2]) 
semantic_orientation[word1] + \
semantic_orientation[word2]

(-32.226431875358955, -27.984023654857886)


-60.21045553021684

In [19]:
len(semantic_orientation)

10732