### Set up the Tweet DataFrame

In [1]:
# Import necessary libraries/packages
import json
import pprint
import pandas as pd

In [2]:
# Read tweeets.json file and parse into a Python dictionary
tweet_list = []
with open('tax_tweets.json') as f:
    for line in f:
        tweet_list.append(json.loads(line))

In [3]:
# Check structure of Tweet JSON file
pprint.pprint(tweet_list[:1])

[{'contributors': None,
  'coordinates': None,
  'created_at': 'Tue Apr 19 03:58:52 +0000 2016',
  'entities': {'hashtags': [],
               'symbols': [],
               'urls': [],
               'user_mentions': [{'id': 110606529,
                                  'id_str': '110606529',
                                  'indices': [3, 15],
                                  'name': 'Jack Graham',
                                  'screen_name': 'jackngraham'}]},
  'favorite_count': 0,
  'favorited': False,
  'filter_level': 'low',
  'geo': None,
  'id': 722273234150789120,
  'id_str': '722273234150789120',
  'in_reply_to_screen_name': None,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'is_quote_status': False,
  'lang': 'en',
  'place': None,
  'retweet_count': 0,
  'retweeted': False,
  'retweeted_status': {'contributors': None,
                       'coordinates': None,
                 

In [4]:
# Create a tweet class to store attributes for each tweet

class TaxTweets(object):
    
    def __init__(self, screen_name="", profile_description="", 
                 text="", user_mentions=""):
        self.screen_name = screen_name
        self.profile_description = profile_description
        self.text = text
        self.user_mentions = user_mentions
        
    def get_screen_name(self):
        """return screen name of twitter user"""
        return self.screen_name
    
    def get_profile_description(self):
        """return profile description of user"""
        return self.profile_description
    
    def get_text(self):
        """return text of twitter user"""
        return self.text
    
    def get_user_mentions(self):
        """return screen names of users mentioned in tweet"""
        return self.user_mentions

In [5]:
# Create an empty list to store each tweet set of attributes
tweet_list2 = []

# Iterate over each tweet set and get username, text, users mentioned, and 
# profile description in the tweet
for tweet in tweet_list:
    
    user_mentions_list = [tweet['entities']['user_mentions'][i]['screen_name']
                          for i in range(0, len(tweet['entities']['user_mentions']))]
        
    tweet_list2.append(TaxTweets(tweet['user']['screen_name'], 
                                 tweet['user']['description'],
                                 tweet['text'],
                                 user_mentions_list)) 

In [6]:
# Create tweets DataFrame

# Set column names
column_names = ('ScreenName', 'Profile_Description', 'Text', 'User_Mentions')
# Create an empty dictionary to hold tweet sets
dict_list = []

# Iterate over each tweet set in list and zip together tweet attributes along
# with column name to create dataframe
for tweet in tweet_list2:
    dict_list.append(dict(zip(column_names,
                                  [tweet.get_screen_name(), 
                                   tweet.get_profile_description(),
                                   tweet.get_text(), 
                                   tweet.get_user_mentions()
                                  ]
                             )))

# Create tweets dataframe
tweet_df = pd.DataFrame(dict_list)

In [7]:
# Look at first 10 rows of tweets dataframe
tweet_df.head(10)
tweet_df.tail(10)

Unnamed: 0,Profile_Description,ScreenName,Text,User_Mentions
4970,Award winning Designer/ Architect/ Design Dire...,RKMustafa,Stop Trying To File Your Tax Returns From Coac...,[]
4971,Jus a Single laid bac kat who likes 2hav a goo...,DurdyGP,RT @CBSLA: Some Coachella goers tried to mail ...,[CBSLA]
4972,we will see,queensusana24,Press TV Interview: G-20 Crackdown on Tax Have...,[1lefthook]
4973,,chiquisholla,RT @InmigrantNacion: @JimPressOffice\n#AINF\n#...,"[InmigrantNacion, JimPressOffice, GOP]"
4974,@SpaceX once responded to a mission idea I had...,HAL9000and1,"Hi, we're NASA and we want tax payers money......",[]
4975,"North Broward Preparatory School June 26, 1993",tam1i,6 Good Reasons to File a Tax Extension #TaxDea...,[]
4976,DissidentRight publishes original work through...,adissidentright,I *almost* pity the people who think of indivi...,[]
4977,,nanner_lp,RT @micnews: This Michigan lawmaker explains w...,[micnews]
4978,,Marchant9876,RT @margokingston1: Albo: Leaks show Govt adop...,[margokingston1]
4979,‏صِلَتَُ بـ ربگ ، هيَّ بوابتك للحاة ،،\nوَ على...,victoriasuthe17,RT @Adel__Almalki: #tech #news ( #TaxDay )Netf...,[Adel__Almalki]


### Clean up the Tweets

In [8]:
# Import necessary libraries to clean up tweets
import re
import string

In [9]:
# Create a list of stopwords to remove from tweets
# List of stopwords taken from http://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
stopword_list = []
with open('stopwords.txt') as f:

    for line in f:
        line = line.split('\n')
        stopword_list.append(line[0])
        
# Add some Twitter specific words to the list
stopword_list.append('rt')
stopword_list.append('tax')
stopword_list.append('taxday')
stopword_list.append('taxes')

# Create a string of punctuations and numbers to remove from the tweets
punct_num = string.punctuation + string.digits

In [10]:
def textProcessing(wordlist):
    """
    Cleans up string by removing urls, numbers, and punctuations.
    Strips white space.  
    Makes the words lower case and splits up words.
    Removes stop words.
    """
    # Remove urls
    # Reference: http://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
    wordlist = re.sub(r"http\S+", "", wordlist)
    
    # Remove punctionations and numbers
    for aChar in wordlist:
        if aChar in punct_num:
            wordlist = wordlist.replace(aChar, '')

    # Strip white space, make lower case, and split into words
    wordlist = wordlist.strip().lower().split()
    
    # Filter out stop words
    wordlist2 = []
    for aWord in wordlist:
        if aWord not in stopword_list:
            wordlist2.append(aWord)
            
    
            
    return wordlist2

In [11]:
# Apply the textProcessing function to tweet_df
tweet_df['Text_Words'] = tweet_df['Text'].map(lambda x: textProcessing(x)).map(lambda x: ", ".join(x))

In [12]:
# Converting the User_Mentions column from a list to string
tweet_df['User_Mentions'] = tweet_df['User_Mentions'].map(lambda x: ", ".join(x))

In [13]:
# Check how the cleaned texts look like
tweet_df['Text_Words'].head(50)

0     jackngraham, days, week, someday, day, lord, c...
1     turbotax, sound, taxesdone, music, ears, 🎶, fo...
2     worldforbernie, corrupt, oligarchs, pay, fair,...
3     thelibertymove, remember, things, kid, hear, t...
4     adelalmalki, tech, news, netflix, shares, plun...
5     realalexjones, flashback, irs, insider, expose...
6     moveon, day, pa, pays, pattoomey, amp, senateg...
7               deadpoolmovie, daddy’s, saving, unicorn
8     drjillstein, join, cut, military, budget, amp,...
9               deadpoolmovie, daddy’s, saving, unicorn
10    gop, day, democrats, eager, hands, americans’,...
11    rumsfeldoffice, half, century, filing, correct...
12                              natshupe, abolishtheirs
13    rogerkver, pay, roads, schools, hospitals, tax...
14    drjillstein, best, reign, wasteful, federal, s...
15    adelalmalki, tech, news, netflix, shares, plun...
16    adelalmalki, tech, news, netflix, shares, plun...
17    “government, money, government, finds, mon

In [14]:
# Look at first few rows of tweet_df
tweet_df.head(10)

Unnamed: 0,Profile_Description,ScreenName,Text,User_Mentions,Text_Words
0,,LaTrelleSmart,RT @jackngraham: There are 7 days in a week an...,jackngraham,"jackngraham, days, week, someday, day, lord, c..."
1,"Animal lover, foodie extraordinaire, vintage c...",Calliope116,RT @turbotax: Because the sound of #TaxesDone ...,turbotax,"turbotax, sound, taxesdone, music, ears, 🎶, fo..."
2,,mrbacombits,RT @WorldForBernie: Corrupt oligarchs can't ha...,"WorldForBernie, BernieSanders, generalelectric","worldforbernie, corrupt, oligarchs, pay, fair,..."
3,"Peace, Liberty, Freedom. Politically Uncorrect...",Origanalist,RT @TheLibertyMove: On this #TaxDay let's reme...,TheLibertyMove,"thelibertymove, remember, things, kid, hear, t..."
4,‏‏‏‏اللهم إني أعوذ بك من شر ا عملت، ومن شر ما ...,ruthsimpson8641,RT @Adel__Almalki: #tech #news ( #TaxDay )Netf...,Adel__Almalki,"adelalmalki, tech, news, netflix, shares, plun..."
5,,diazed85,RT @RealAlexJones: FLASHBACK: IRS Insider Expo...,RealAlexJones,"realalexjones, flashback, irs, insider, expose..."
6,,imzayde,"RT @MoveOn: $52,369.35/DAY: Amount PA pays for...","MoveOn, PatToomey, SenateGOP","moveon, day, pa, pays, pattoomey, amp, senateg..."
7,,princykaundal,RT @deadpoolmovie: Daddy’s still saving up for...,deadpoolmovie,"deadpoolmovie, daddy’s, saving, unicorn"
8,Being human is being here.,lynn_mistie,RT @DrJillStein: Join my #TaxDay call to cut m...,DrJillStein,"drjillstein, join, cut, military, budget, amp,..."
9,Be a slut do whatever you want,PhilipInTheDark,RT @deadpoolmovie: Daddy’s still saving up for...,deadpoolmovie,"deadpoolmovie, daddy’s, saving, unicorn"


### Classify Political Affiliation of Twitter Users

In [15]:
# Import necessary library to assign political affiliation
import pickle

In [16]:
# Open the list of trump followers
with open("trump2.pickle", "rb") as f:
    trump_list = pickle.load(f)

In [17]:
# Open the list of bernie followers
bernie_list = []
with open("bernie.pickle", "rb") as f:
    dump = pickle.load(f)
    
    # Get screen name of bernie followers
    for user in dump:
        bernie_list.append(user.screen_name)

In [18]:
# Open the list of hilary followers
hilary_list = []
with open("hilary.pickle", "rb") as f:
    dump = pickle.load(f)
    
    # Get screen name of bernie followers
    for user in dump:
        hilary_list.append(user.screen_name)

In [19]:
# Open the list of ted cruz followers
cruz_list = []
with open("cruz.pickle", "rb") as f:
    dump = pickle.load(f)
    
    # Get screen name of bernie followers
    for user in dump:
        cruz_list.append(user.screen_name)

In [20]:
# Check that the lists loaded correctly
len(cruz_list)
cruz_list[:5]

len(hilary_list)
hilary_list[:5]

len(trump_list)
trump_list[:5]

len(bernie_list)
bernie_list[:5]

['jkline_TGN', 'PiekarzKinga', 'DCDanielCollazo', 'AlkhaldiFadel', 'jmock89']

In [21]:
def classifyPoliticalAffiliation(user, profile_desc, user_mentions):
    """This function returns the label 'dem' or 'rep'
    depending on certain criteria.
    """
    # Check to see if the user is a follower of any of the 4 presidential candidates
    if user in trump_list:
        return "Rep"
    if user in bernie_list:
        return "Dem"
    if user in hilary_list:
        return "Dem"
    if user in cruz_list:
        return "Rep"

    # Check to see if the user's profile description contain any of the following words
    # that are commonly used to describe the two political parties
    dem = ["progressive", "democrat", "liberal", "socialist", "egalitarian", 
           "bleeding heart", "left-wing", "pro-choice", "obama", "hilary", 
           "clinton", "bernie", "sanders"]
    gop = ["conservative", "teaparty", "tea party", "republican", "gop", 
           "right-wing", "nra", "pro-life", "trump", "cruz", "rubio"]

    if profile_desc != None:
        profile_list = profile_desc.split()

        for word in profile_list:
            word = word.lower()
            if any(word in profile_list for word in dem):
                return "Dem"
            if any(word in profile_list for word in gop):
                return "Rep"

    # Check if tweet mentioned one of the presidential candidates
    mentions_dem_list = ["WorldForBernie", "BernieSanders", "HilaryClinton"]
    mentions_rep_list = ["realDonaldTrump", "tedcruz"]
    
    if user_mentions != None:
        for user in user_mentions:
            if any(word in user_mentions for word in mentions_dem_list):
                return "Dem"
            if any(word in user_mentions for word in mentions_rep_list):
                return "Rep"

In [22]:
# Apply classifyPoliticalAffiliation function to tweet_df
tweet_df['Political_Affiliation'] = list(map(classifyPoliticalAffiliation, tweet_df["ScreenName"], 
                                             tweet_df["Profile_Description"], tweet_df["User_Mentions"]))

### Label the Tweets as Positive or Negative

In [23]:
# I tried using the list of positive and negative words as labeled by Finn Årup Nielsen, 
# but the result was not very good (none of the tweets were labeled as pos or neg) so I had to scrap this idea
# Reference: Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
#            sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903

# afinn_dict = {}

# with open("AFINN-96.txt") as f:
#     for line in f:
#         line_list = line.split('\t')
#         key = line_list[0]
#         value = line_list[1]
#         afinn_dict[key] = int(value)
            
# def classifyTaxTweet(word_list):
#     """Classifies a tweet as 'positive' or 'negative' according to the sum
#     of positive and negative words in the tweet.  
#     """
#     score = sum(map(lambda word: afinn_dict.get(word, 0), word_list))
#     if score > 0:
#         return "neg"
#     elif score < 0:
#         return "pos"
#     else:
#         return "neutral"
    
# # Apply analysis to twitter texts
# tweet_df['Text_Class'] = list(map(classifyTaxTweet, tweet_df["Text_Words"]))

In [24]:
# Import required libraries for labeling tweets
from nltk.corpus import movie_reviews
from featx import label_feats_from_corpus, split_label_feats
from featx import bag_of_words
from nltk.classify import NaiveBayesClassifier

In [25]:
# Train Naive Bayes Classifer on 'Movie Reviews' corpus in NLTK

# These codes are taken from the book, Natural language processing with python,
# which has been referenced in the bibliography section of my executive summary

lfeats = label_feats_from_corpus(movie_reviews)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
nb_classifier = NaiveBayesClassifier.train(train_feats)

In [26]:
# Use the bag-of-words approach to analyze the tweets
def classifyTaxTweet(word_list):
    text = bag_of_words(word_list)
    
    return nb_classifier.classify(text) 

In [27]:
# Apply analysis to twitter texts
tweet_df['Text_Label'] = list(map(classifyTaxTweet, tweet_df["Text_Words"]))

In [28]:
# Look at positive & negative tweets by political affiliation
tweet_df.groupby([tweet_df['Text_Label'], tweet_df['Political_Affiliation']]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Profile_Description,ScreenName,Text,User_Mentions,Text_Words,Political_Affiliation,Text_Label
Text_Label,Political_Affiliation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
neg,Dem,133,178,178,178,178,178,178
neg,Rep,163,211,211,211,211,211,211
pos,Dem,14,14,14,14,14,14,14
pos,Rep,6,9,9,9,9,9,9


In [29]:
# Save tweet_df to a csv file to import for further analysis in R
tweet_df.to_csv('tweet_df.csv', encoding='utf-8')