In [194]:
# import libraries
import pandas as pd
import numpy as np

Perform lift and sentiment analysis on each candidate and issue:
![title](taskC.png)

To do the above analysis, you will have to write a script to parse each tweet and extract parts that pertain to a candidate and attribute. Provide details of how you accomplished this task in a script. 

In [195]:
# read the csv into a dataframe
tweets = pd.read_csv("tot_tweets.csv")
tweets = tweets.drop(columns=['id'])

In [196]:
tweets.head()

Unnamed: 0,location,text
0,"Tennessee, USA","b""RT @AliAdair22: \xf0\x9f\x90\xa6Next, Beto O'Rourke @BetoORourke, running for U.S. Senate in #Texas. Please follow, tweet, contribute, volunteer, anything\xe2\x80\xa6"""
1,,"b""RT @AliAdair22: \xf0\x9f\x90\xa6Next, Beto O'Rourke @BetoORourke, running for U.S. Senate in #Texas. Please follow, tweet, contribute, volunteer, anything\xe2\x80\xa6"""
2,,"b""Ted Cruz, Beto O'Rourke try to rally Latino voters in Texas Senate race https://t.co/EHvO0Zz7yh #FoxNews @MRiveraFoxNews"""
3,America,b'RT @RonNehring: Third poll now showing Cruz with 8 or 9 point solid lead over Beto O\xe2\x80\x99Rourke (D-Hollywood). Follows Quinnipiac and NY Times\xe2\x80\xa6'
4,West Texas,"b'Beto O\xe2\x80\x99Rourke, the Democratic congressman from El Paso, has made the Texas race for US Senate one of the most widel\xe2\x80\xa6 https://t.co/f9i3QqeJ6U'"


In [197]:
# Let's write a function to take care of the names
def replace_names(text):
    
    text = text.lower()
    
    beto_words = ['@betoorourke' , "beto o'rourke", "o'rourke", 'rourke', 'beto']
    cruz_words = ['@tedcruz', 'ted cruz', 'ted', 'cruz']
    
    for w in beto_words:
        try:
            text = text.replace(w, ' beto ')
        except:
            pass
    for w in cruz_words:
        try:
            text = text.replace(w, ' cruz ')
        except:
            pass
    
    return text

In [198]:
# apply replace_names() on each tweet
text_column = []
for t in tweets.itertuples():
    text = t.text
    new_text = replace_names(text)
    text_column.append(new_text)

In [199]:
# replace old text with new text
tweets.text = text_column

In [200]:
for i in tweets.itertuples():
    text = i.text
    print(text)
    break

b"rt @aliadair22: \xf0\x9f\x90\xa6next,   beto     beto  , running for u.s. senate in #texas. please follow, tweet, contribute, volunteer, anything\xe2\x80\xa6"


In [201]:
# word frequency
# grab word frequency using nltk library
import nltk
# stop words
from nltk.corpus import stopwords

In [202]:
# put all tweets together
text=tweets["text"].values.tolist()
text=" ".join(text)
text = text.lower()

In [203]:
# tokenize the text 
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)

In [204]:
# let's do a word count
from collections import Counter

# counter object
words = Counter()

# update counter with new words
words.update(tokens)

In [205]:
# first order check
words.most_common()[:10]

[('xe2', 6577),
 ('x80', 6452),
 ('b', 5743),
 ('rt', 4791),
 ('beto', 4561),
 ('cruz', 4463),
 ('the', 4411),
 ('xa6', 3879),
 ('texas', 3135),
 ('in', 3112)]

In [206]:
# store the stop words
stopwords = stopwords.words('english')[:]

In [207]:
# lets do this again: remove stop words, single letter variable names
filter_words = Counter(x for x in tokens if x not in stopwords and x.isdigit() == False and len(x) != 1 and x.isalpha() == True)

In [208]:
filter_words.most_common()[:1000]

[('rt', 4791),
 ('beto', 4561),
 ('cruz', 4463),
 ('texas', 3135),
 ('senate', 3003),
 ('https', 2088),
 ('co', 2007),
 ('race', 1956),
 ('vote', 950),
 ('single', 785),
 ('msnbc', 735),
 ('word', 722),
 ('win', 703),
 ('every', 662),
 ('care', 656),
 ('repeal', 652),
 ('act', 652),
 ('pre', 652),
 ('insis', 651),
 ('affordable', 651),
 ('including', 651),
 ('protections', 651),
 ('existing', 650),
 ('condi', 650),
 ('debate', 502),
 ('nplease', 491),
 ('believe', 487),
 ('retweet', 483),
 ('sample', 483),
 ('bigger', 479),
 ('size', 478),
 ('guy', 401),
 ('leads', 352),
 ('show', 345),
 ('skateboard', 343),
 ('still', 337),
 ('side', 336),
 ('election', 334),
 ('polling', 331),
 ('skates', 327),
 ('across', 327),
 ('stage', 327),
 ('hoping', 327),
 ('dont', 327),
 ('history', 319),
 ('texassenate', 301),
 ('million', 289),
 ('want', 256),
 ('looking', 241),
 ('cbsnews', 238),
 ('ahead', 237),
 ('problem', 237),
 ('integrity', 235),
 ('serious', 232),
 ('becaus', 232),
 ('campaign', 22

In [209]:
pd.set_option('display.max_colwidth', -1)

In [210]:
# lets search for issues
tweets[tweets['text'].str.contains("")]

Unnamed: 0,location,text
0,"Tennessee, USA","b""rt @aliadair22: \xf0\x9f\x90\xa6next, beto beto , running for u.s. senate in #texas. please follow, tweet, contribute, volunteer, anything\xe2\x80\xa6"""
1,,"b""rt @aliadair22: \xf0\x9f\x90\xa6next, beto beto , running for u.s. senate in #texas. please follow, tweet, contribute, volunteer, anything\xe2\x80\xa6"""
2,,"b"" cruz , beto try to rally latino voters in texas senate race https://t.co/ehvo0zz7yh #foxnews @mriverafoxnews"""
3,America,b'rt @ronnehring: third poll now showing cruz with 8 or 9 point solid lead over beto o\xe2\x80\x99 beto (d-hollywood). follows quinnipiac and ny times\xe2\x80\xa6'
4,West Texas,"b' beto o\xe2\x80\x99 beto , the democratic congressman from el paso, has made the texas race for us senate one of the most widel\xe2\x80\xa6 https://t.co/f9i3qqej6u'"
5,"Dallas, TX",b'rt @wfaa: does texas\xe2\x80\x99 senate race belong to texans? https://t.co/ccbbwh5pmt https://t.co/gxnrzdjhid'
6,,"b""rt @lizlogan76: beto could win this, because of aca alone. this is the winning issue, get the word out, vote for beto if you valu\xe2\x80\xa6"""
7,"Atlanta, GA","b""rt @foxnews: cruz , beto try to rally latino voters in the texas senate race. https://t.co/ynxbe6uvpb"""
8,Oregon,"b""rt @patriot_musket: lots of really good news for house and senate republicans in this weekend's and today's polls. \n\nand lol that cook stil\xe2\x80\xa6"""
9,Australia,"b""rt @amhotflash: beto 's campaign has raised more money in a single quarter, than any senate race in the history of this country. # beto is ma\xe2\x80\xa6"""


In [211]:
# Let's write a function to take care of the ISSUES
def replace_issues(text):
    
    text = text.lower()
    
    healthcare = ['affordable care act' , "healthcare", "aca"]
    taxes = ['tax', 'taxes']
    climatechange = ['climate', 'climatechange']
    bordercontrol = ['border', 'immigration', 'bordersecurity', 'borders']
    character = ['lyin', 'leader']
    
    for w in healthcare:
        try:
            text = text.replace(w, ' healthcare ')
        except:
            pass
    for w in taxes:
        try:
            text = text.replace(w, ' taxes ')
        except:
            pass
    for w in climatechange:
        try:
            text = text.replace(w, ' climatechange ')
        except:
            pass
    for w in bordercontrol:
        try:
            text = text.replace(w, ' bordercontrol ')
        except:
            pass

    for w in character:
        try:
            text = text.replace(w, ' character ')
        except:
            pass
    
    return text

In [212]:
# apply replace_issues() on each tweet
text_column = []
for t in tweets.itertuples():
    text = t.text
    new_text = replace_issues(text)
    text_column.append(new_text)

In [213]:
# replace old text with new text
tweets.text = text_column

In [214]:
tweets[tweets['text'].str.contains("taxes")]

Unnamed: 0,location,text
3118,"New York, USA","b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3154,"Tyler,Texas","b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3189,"St Louis, MO","b'rt @charlesppierce: i watched the texas senate debate tonight. i wish republicans would stop using the ""jfk ran on taxes cuts"" dodge to excus\xe2\x80\xa6'"
3253,Republic of Texas,"b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3287,"Arizona, USA","b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3292,Texas,"b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3366,"San Antonio, TX","b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3493,"Richmond, VA","b""rt @apcentralregion: an #apfactcheck on the cruz and beto senate debate in texas examines claims on oil taxes es, impeachment and\xe2\x80\xa6"""
3496,"San Antonio, TX","b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"
3561,Texas,"b'rt @rjc: \xe2\x80\x9ci want to cut your taxes es; congressman o\xe2\x80\x99 beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to keep\xe2\x80\xa6'"


In [215]:
# do another word count
# put all tweets together
text=tweets["text"].values.tolist()
text=" ".join(text)
text = text.lower()

# tokenize the text 
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)

# counter object
words = Counter()

# update counter with new words
words.update(tokens)

In [216]:
# issues
issues = ['healthcare', 'taxes', 'climatechange', 'bordercontrol', 'character']

In [217]:
# lets do this again: remove stop words, single letter variable names
filter_words = Counter(x for x in tokens if x not in stopwords and x.isdigit() == False and len(x) != 1 and x.isalpha() == True and x in issues)

In [218]:
filter_words.most_common()[:]

[('healthcare', 730),
 ('character', 289),
 ('taxes', 196),
 ('bordercontrol', 64),
 ('climatechange', 36)]

Issues:
* healthcare
* character
* taxes
* bordercontrol

#### Calculate Lift for Candidate / Issue

Get count of tweets mentioning Beto / Cruz, and count of all tweets with Beto and/or Cruz in it

In [219]:
beto_count = 0
cruz_count = 0
n_tweets = 0
for t in tweets.itertuples():
    if 'beto' in t.text and 'cruz' in t.text:
        beto_count += 1
        cruz_count += 1
        n_tweets += 1
    elif 'beto' in t.text:
        beto_count += 1
        n_tweets += 1
    elif 'cruz' in t.text:
        cruz_count += 1        
        n_tweets += 1

Get count of tweets for each issue!

In [220]:
healthcare_count = 0
character_count = 0
taxes_count = 0
bordercontrol_count = 0
for t in tweets.itertuples():
    if 'healthcare' in t.text:
        healthcare_count += 1
    elif 'character' in t.text:
        character_count += 1
    elif 'taxes' in t.text:
        taxes_count += 1
    elif 'bordercontrol' in t.text:
        bordercontrol_count += 1

Get count of tweets with 

Beto/healthcare, Beto/character, Beto/taxes, Beto/bordercontrol


Cruz/healthcare, Cruz/character, Cruz/taxes, Cruz/bordercontrol

In [221]:
# beto and issues
bh_count = 0
bc_count = 0
bt_count = 0
bb_count = 0

# cruz and issues
ch_count = 0
cc_count = 0
ct_count = 0
cb_count = 0

for t in tweets.itertuples():
    # beto
    if 'beto' in t.text and 'healthcare' in t.text:
        bh_count += 1
    if 'beto' in t.text and 'character' in t.text:
        bc_count += 1
    if 'beto' in t.text and 'taxes' in t.text:
        bt_count += 1
    if 'beto' in t.text and 'bordercontrol' in t.text:
        bb_count += 1
    
    # cruz
    if 'cruz' in t.text and 'healthcare' in t.text:
        ch_count += 1
    if 'cruz' in t.text and 'character' in t.text:
        cc_count += 1
    if 'cruz' in t.text and 'taxes' in t.text:
        ct_count += 1
    if 'cruz' in t.text and 'bordercontrol' in t.text:
        cb_count += 1

In [222]:
candidate_counts = [beto_count, cruz_count]
issue_counts = [healthcare_count, character_count, taxes_count, bordercontrol_count]
combo_counts = [bh_count, bc_count, bt_count, bb_count, ch_count, cc_count, ct_count, cb_count]
lifts = [] # order: [0] beto vs healthcare, [1] beto vs character, [2] beto vs taxes, [3] beto vs bordercontrol
           #        [4] cruz vs healthcare, [5] cruz vs character, [6] cruz vs taxes, [7] cruz vs bordercontrol
index = 0
for cand_count in candidate_counts:
    for issue_count in issue_counts:
        lifts.append( (len(tweets) * combo_counts[index]) / (cand_count * issue_count) )
        index +=1 
lifts_copy = lifts[:]

In [223]:
lifts = np.reshape(lifts, (2, 4)).T # reshape for similarities matrix

In [224]:
similarities = pd.DataFrame(lifts, columns=['beto', 'cruz'], index=['healthcare', 'character', 'taxes', 'bordercontrol'])

In [225]:
similarities

Unnamed: 0,beto,cruz
healthcare,0.171859,1.546513
character,1.674005,0.524548
taxes,1.275317,1.225257
bordercontrol,0.412301,1.088287


## Sentiment Analysis

In [226]:
# ask if key word in the string
# split into list
# identify the index of the key word
# get list of +/- 3 indexes from key word
# concated list back to string
# return string
def get_substring(key_word, s):
    """
        1. ask if key word in the string
        2. split into list
        3. identify the index of the key word
        4. get list of +/- radius indexes from key word
        5. concated list back to string
        6. return string
    """

    # get rid of "b'" at beginning of tweet
    s = s[2:]

    # get rid of rt stuff if there
    if 'rt @' in s:
        end_of_rt = s.index(':')+2
        s = s[end_of_rt:]

    s_list = s.split()

    radius = 8 # set the radius

    # using this ugly mess to try to get beto or cruz if they are next to commas or exclamation
    try:
        kw_index = s_list.index(key_word)
    except:
        try:
            kw_index = s_list.index(key_word+',')
        except:
            try:
                kw_index = s_list.index(key_word+'!')
            except:
                try:
                    kw_index = s_list.index(key_word+'.')
                except:
                    return

    # get up until key word
    sub_s_beg = []
    if radius > kw_index:
        sub_s_beg = s_list[:kw_index]
    else:
        sub_s_beg = s_list[kw_index-radius:kw_index]

    # get key word and after
    len_kw_index_to_end = len(s_list[kw_index+1:])
    sub_s_end = []
    if radius < len_kw_index_to_end:
        sub_s_end = s_list[kw_index:kw_index+radius+1]
    else:
        sub_s_end = s_list[kw_index:]

    sub_s = (' ').join(sub_s_beg + sub_s_end)
    return sub_s

In [227]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [228]:
# sentiment analyzer object
analyser = SentimentIntensityAnalyzer()

In [229]:
# lyft values > 1
ch_sent = []
ct_sent = []
cb_sent = []
bc_sent = []
bt_sent = []

for t in tweets.itertuples():
    
    # Beto and taxes
    if 'beto' in t.text and 'taxes' in t.text:
        sub_text = get_substring('beto', t.text)
        if sub_text:
            snt = analyser.polarity_scores(sub_text)
            bt_sent.append(snt['compound'])
        
    # Beto and character
    if 'beto' in t.text and 'character' in t.text:
        sub_text = get_substring('beto', t.text)
        
        if sub_text:
            snt = analyser.polarity_scores(sub_text)
            bc_sent.append(snt['compound'])

    # cruz and taxes
    if 'cruz' in t.text and 'taxes' in t.text:
        sub_text = get_substring('cruz', t.text)
        print(sub_text)
        if sub_text:
            snt = analyser.polarity_scores(sub_text)
            print(snt['compound'])
            ct_sent.append(snt['compound'])
            
    # cruz and bordercontrol
    if 'cruz' in t.text and 'bordercontrol' in t.text:
        sub_text = get_substring('cruz', t.text)
        if sub_text:
            snt = analyser.polarity_scores(sub_text)
            cb_sent.append(snt['compound'])

    # cruz and healthcare
    if 'cruz' in t.text and 'healthcare' in t.text:
        sub_text = get_substring('cruz', t.text)
        if sub_text:
            snt = analyser.polarity_scores(sub_text)
            ch_sent.append(snt['compound'])

beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
an #apfactcheck on the cruz and beto senate debate in texas examines claims
0.0
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wants to raise your taxes es,\xe2\x80\x9d mr. cruz said in his closing remarks. \xe2\x80\x9ci want to
0.0772
beto wan

In [230]:
# get average of list
def avg(lst): 
    return sum(lst) / len(lst)

In [231]:
# order [0] beto/big [1] cruz/big [2] beto/small [3] cruz/small
sentiments = [avg(ch_sent), avg(ct_sent), avg(cb_sent), avg(bc_sent), avg(bt_sent)]

In [232]:
sentiments

[0.0020593607305936074,
 0.07625853658536597,
 0.799931578947369,
 0.5177222222222215,
 -0.19130481927710813]

In [233]:
final_table = pd.DataFrame(None, columns=['lift', 'sentiment score'], index=['healthcare & beto', 'character& beto', 'taxes & beto', 'bordercontrol & beto',
                                                                             'healthcare & cruz','character& cruz','taxes & cruz','bordercontrol & cruz'])

In [234]:
final_table['lift']=lifts_copy

In [235]:
final_table

Unnamed: 0,lift,sentiment score
healthcare & beto,0.171859,
character& beto,1.674005,
taxes & beto,1.275317,
bordercontrol & beto,0.412301,
healthcare & cruz,1.546513,
character& cruz,0.524548,
taxes & cruz,1.225257,
bordercontrol & cruz,1.088287,


In [236]:
final_table['sentiment score']['healthcare & cruz']= sentiments[0]
final_table['sentiment score']['taxes & cruz']= sentiments[1]
final_table['sentiment score']['bordercontrol & cruz']= sentiments[2]
final_table['sentiment score']['character& beto']= sentiments[3]
final_table['sentiment score']['taxes & beto']= sentiments[4]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [237]:
final_table

Unnamed: 0,lift,sentiment score
healthcare & beto,0.171859,
character& beto,1.674005,0.517722
taxes & beto,1.275317,-0.191305
bordercontrol & beto,0.412301,
healthcare & cruz,1.546513,0.00205936
character& cruz,0.524548,
taxes & cruz,1.225257,0.0762585
bordercontrol & cruz,1.088287,0.799932


The lift between taxes and beto is at 1.275317 and the sentiment score is -0.191305. Beto should focus addressing why he supports tax increases and illustrate the pros of such policy to address the current negative sentiment.

The lift between character and beto is at 1.674005 and the sentiment score is 0.517722. Beto should continue to demonstate his image as a candidate with high integrity. One distinctive feature seen in the tweets concerning his character involve words such as "leader" and "proud to support". 

The lift between bordercontrol and cruz is at 1.088287 and the sentiment score is 0.799932. Cruz should continue to strongly support his strict policies on immigration. 

The lift between healthcare and cruz is at 1.546513 and the sentiment score is 0.00205936. Although there is a clear association between cruz and healthcare the sentiment score is neutral and so cruz has an opportunity to leverage this buzz and gain more supporters. Our data tells us that people are aware of healthcare as an issue, knowing that the candidates can use this as a talking point in each of their favor. 

The lift between taxes and cruz is at 1.225257 and the sentiment score is 0.0762585. Cruz should continue to emphasize his support for lower taxes. This should be a core differentiator for Cruz as Beto has a negative sentiment on the same issue with a lift value. 
