In [86]:
import re
from pymongo import MongoClient
from collections import Counter, defaultdict
import requests
import datetime

In [87]:
# Open the MongoDB connection shared by every cell below. MongoClient() is
# called with no arguments, so pymongo's default connection settings apply.
client = MongoClient()

In [88]:
# Twitter collection names that the text-gathering step skips
twitter_blacklist = {
    'BobWhitaker2016',
    'DrJillStein',
    'JoeBiden',
    'TheRealRoseanne',
    'eugenepuryear',
    'glorialariva',
    'system.indexes',
    'verminsupreme',
}

In [89]:
# Facebook collection names that the text-gathering step skips
fb_blacklist = {
    'Eugene4DC',
    'drjillstein',
    'VerminSupreme',
    'RobertWWhitaker',
    'system.indexes',
}

In [90]:
# Instagram collection names that the text-gathering step skips
instagram_blacklist = {
    'eugene4dc',
    'officialroseannebarr',
    'system.indexes',
    'vp',
}

In [91]:
# Download the handle mapping used to attribute collections to candidates.
# The cells below index it as handle_dict[slug]['twitter' | 'facebook' | 'instagram'].
# A timeout stops the notebook from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON or lookup failure further down.
handles_response = requests.get('http://api.electionscrape.com/handles', timeout=30)
handles_response.raise_for_status()
handle_dict = handles_response.json()

In [92]:
def get_twitter_text(db=None, handles=None, blacklist=None, year=2015):
    """Collect the text of tweets created in ``year``, grouped by candidate slug.

    Every collection in the database whose name is not blacklisted is matched
    against the 'twitter' entry of each candidate in ``handles``. Collections
    that match no candidate are skipped, so their tweets are never filed under
    another candidate's slug.

    Args:
        db: Database-like object exposing ``collection_names()`` and
            ``db[name].find()``. Defaults to ``client.twitter``.
        handles: Mapping of candidate slug -> dict with a 'twitter' entry.
            Defaults to the module-level ``handle_dict``.
        blacklist: Collection names to ignore. Defaults to the module-level
            ``twitter_blacklist``.
        year: Year to keep; compared with the last space-separated field of
            each document's 'created_at' string.

    Returns:
        defaultdict(list) mapping candidate slug -> list of tweet texts.
    """
    if db is None:
        db = client.twitter
    if handles is None:
        handles = handle_dict
    if blacklist is None:
        blacklist = twitter_blacklist
    year_str = str(year)

    tweet_dict = defaultdict(list)

    for coll_name in db.collection_names():
        if coll_name in blacklist:
            continue

        # Reset for every collection: without this, an unmatched collection
        # would reuse the previous candidate's slug (or raise NameError on
        # the first iteration).
        cand_slug = None
        for key in handles:
            if coll_name in handles[key]['twitter']:
                cand_slug = key
        if cand_slug is None:
            continue

        for doc in db[coll_name].find():
            # The year is the last space-separated field of 'created_at'.
            if doc['created_at'].split(' ').pop() == year_str:
                tweet_dict[cand_slug].append(doc['text'])

    return tweet_dict

In [93]:
def get_facebook_text(db=None, handles=None, blacklist=None, year=2015):
    """Collect the messages of Facebook posts created in ``year``, by candidate slug.

    Every collection in the database whose name is not blacklisted is matched
    against the 'facebook' entry of each candidate in ``handles``. Collections
    that match no candidate are skipped, so their posts are never filed under
    another candidate's slug. Posts without a 'message' field are ignored.

    Args:
        db: Database-like object exposing ``collection_names()`` and
            ``db[name].find()``. Defaults to ``client.facebook``.
        handles: Mapping of candidate slug -> dict with a 'facebook' entry.
            Defaults to the module-level ``handle_dict``.
        blacklist: Collection names to ignore. Defaults to the module-level
            ``fb_blacklist``.
        year: Year to keep; compared with the text before the first '-' in
            each document's 'created_time' string.

    Returns:
        defaultdict(list) mapping candidate slug -> list of post messages.
    """
    if db is None:
        db = client.facebook
    if handles is None:
        handles = handle_dict
    if blacklist is None:
        blacklist = fb_blacklist
    year_str = str(year)

    fb_dict = defaultdict(list)

    for coll_name in db.collection_names():
        if coll_name in blacklist:
            continue

        # Reset for every collection: without this, an unmatched collection
        # would reuse the previous candidate's slug (or raise NameError on
        # the first iteration).
        cand_slug = None
        for key in handles:
            if coll_name in handles[key]['facebook']:
                cand_slug = key
        if cand_slug is None:
            continue

        for doc in db[coll_name].find():
            # The year is whatever precedes the first '-' in 'created_time'.
            if doc['created_time'].split('-')[0] == year_str and 'message' in doc:
                fb_dict[cand_slug].append(doc['message'])

    return fb_dict

In [94]:
def get_instagram_text(db=None, handles=None, blacklist=None, year=2015):
    """Collect the captions of Instagram posts created in ``year``, by candidate slug.

    Every collection in the database whose name is not blacklisted is matched
    against the 'instagram' entry of each candidate in ``handles``. Collections
    that match no candidate are skipped, so their posts are never filed under
    another candidate's slug. Posts with a missing or empty caption are ignored.

    The 'created_time' field is parsed as seconds since the Unix epoch and the
    year is computed in UTC, so the result does not depend on the timezone of
    the machine running the notebook.

    Args:
        db: Database-like object exposing ``collection_names()`` and
            ``db[name].find()``. Defaults to ``client.instagram``.
        handles: Mapping of candidate slug -> dict with an 'instagram' entry.
            Defaults to the module-level ``handle_dict``.
        blacklist: Collection names to ignore. Defaults to the module-level
            ``instagram_blacklist``.
        year: Year (UTC) to keep.

    Returns:
        defaultdict(list) mapping candidate slug -> list of caption texts.
    """
    if db is None:
        db = client.instagram
    if handles is None:
        handles = handle_dict
    if blacklist is None:
        blacklist = instagram_blacklist
    target_year = int(year)
    epoch = datetime.datetime(1970, 1, 1)

    insta_dict = defaultdict(list)

    for coll_name in db.collection_names():
        if coll_name in blacklist:
            continue

        # Reset for every collection: without this, an unmatched collection
        # would reuse the previous candidate's slug (or raise NameError on
        # the first iteration).
        cand_slug = None
        for key in handles:
            if coll_name in handles[key]['instagram']:
                cand_slug = key
        if cand_slug is None:
            continue

        for doc in db[coll_name].find():
            utc_secs = float(doc['created_time'])
            # Epoch + timedelta yields a UTC datetime; fromtimestamp() would
            # convert to local time and shift posts near New Year into the
            # wrong year on machines that are not set to UTC.
            posted = epoch + datetime.timedelta(seconds=utc_secs)
            if posted.year == target_year and doc.get('caption'):
                insta_dict[cand_slug].append(doc['caption']['text'])

    return insta_dict

In [95]:
# Gather the text of 2015 tweets, grouped by candidate slug.
twitter_dict = get_twitter_text()

In [96]:
# Gather the messages of 2015 Facebook posts, grouped by candidate slug.
fb_dict = get_facebook_text()

In [97]:
# Gather the captions of 2015 Instagram posts, grouped by candidate slug.
insta_dict = get_instagram_text()

In [98]:
# Merge the three per-platform results into one slug -> list-of-texts mapping
list_of_dicts = [twitter_dict, fb_dict, insta_dict]
text_dict = defaultdict(list)

for platform_texts in list_of_dicts:
    for slug, texts in platform_texts.items():
        text_dict[slug].extend(texts)

In [141]:
# Get stopwords
# The file is read inside a `with` block so the handle is closed as soon as
# the words are extracted. The digits '0'-'9' are then added so that lone
# digits are filtered out as well. Building the digits with set.update()
# avoids concatenating a list with map(), which only works on Python 2.
with open('stopwords.txt', 'r') as stopwords_file:
    stopwords = set(re.findall(r'\w+', stopwords_file.read()))
stopwords.update(str(digit) for digit in range(10))

In [142]:
# Flatten every candidate's texts into a single lowercase string
all_text_str = ' '.join(' '.join(texts) for texts in text_dict.values()).lower()

# Strip anything that looks like a URL (from 'http' up to the next whitespace)
all_text_str = re.sub(r'http[^\s]+', '', all_text_str)

# Split into word tokens and drop the stopwords
all_tokens = [token for token in re.findall(r'\w+', all_text_str) if token not in stopwords]

# Frequency of every remaining token
all_word_count = Counter(all_tokens)


# Terrorism vs. Economy

In [143]:
# Look up the count of each terrorism-related term, then total them
terror_terms = ('terrorism', 'terrorist', 'terrorists', 'attack', 'security', 'threat')
terrorism, terrorist, terrorists, attack, security, threat = (
    all_word_count[term] for term in terror_terms
)

terror_tot = sum((terrorism, terrorist, attack, terrorists, security, threat))
print(terror_tot)

3341


In [144]:
# Look up the count of each economy-related term, then total them
econ_terms = ('economy', 'job', 'jobs', 'wage', 'wages', 'labor')
economy, job, jobs, wage, wages, labor = (
    all_word_count[term] for term in econ_terms
)

econ_tot = sum((economy, jobs, wage, wages, job, labor))
print(econ_tot)

3418


# Guys vs. Gals 

In [145]:
# Look up the count of each female-referring term, then total them.
# NOTE(review): some of these words (e.g. 'she', 'her') are likely in the
# stopword list, in which case they were filtered out before counting and
# contribute 0 here - confirm against stopwords.txt.
gal_terms = ('woman', 'she', 'her', 'girl', 'gal')
woman, she, her, girl, gal = (all_word_count[term] for term in gal_terms)

gals_tot = sum((woman, she, her, girl, gal))
print(gals_tot)





1927


In [146]:
# Look up the count of each male-referring term, then total them.
# NOTE(review): some of these words (e.g. 'he', 'him') are likely in the
# stopword list, in which case they were filtered out before counting and
# contribute 0 here - confirm against stopwords.txt.
guy_terms = ('man', 'he', 'him', 'boy', 'guy')
man, he, him, boy, guy = (all_word_count[term] for term in guy_terms)

guys_tot = sum((man, he, him, boy, guy))
print(guys_tot)

5614


# Family vs. Corporation 

In [147]:
# Look up the count of each family-related term, then total them
family_terms = ('family', 'families', 'child', 'kid', 'home')
family, families, child, kid, home = (
    all_word_count[term] for term in family_terms
)

family_tot = sum((family, child, kid, home, families))
print(family_tot)

2649


In [148]:
# Look up the count of each business-related term, then total them
business_terms = ('corporation', 'corporate', 'company', 'business', 'ceo', 'executive')
corporation, corporate, company, business, ceo, executive = (
    all_word_count[term] for term in business_terms
)

bus_tot = sum((corporation, corporate, company, business, ceo, executive))
print(bus_tot)

1310
