# Task E
_Use the location data collected in step A to find out lifts and sentiments regarding the candidates in large versus small cities/towns in Texas._

In [1]:
import numpy as np
import pandas as pd

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

Read in all tweets into dataframs.

In [2]:
tweets = pd.read_csv('tot_tweets.csv')

In [3]:
tweets = tweets.drop(columns=['id'])

In [4]:
tweets.shape

(5741, 2)

In [5]:
tweets.head()

Unnamed: 0,location,text
0,"Tennessee, USA","b""RT @AliAdair22: \xf0\x9f\x90\xa6Next, Beto O..."
1,,"b""RT @AliAdair22: \xf0\x9f\x90\xa6Next, Beto O..."
2,,"b""Ted Cruz, Beto O'Rourke try to rally Latino ..."
3,America,b'RT @RonNehring: Third poll now showing Cruz ...
4,West Texas,"b'Beto O\xe2\x80\x99Rourke, the Democratic con..."


#### Name replacement

In [6]:
# Let's write a function to take care of the names
def replace_names(text):
    
    text = text.lower()
    
    beto_words = ['@betoorourke' , '#betoorourke', '#betonbeto', '#betoforsenate', "beto o'rourke", "o'rourke", 'rourke', '#vetobeto', '#vetobetofortexas']
    cruz_words = ['@tedcruz', '#tedcruz', '#cruzcrew', '#choosecruz' 'ted cruz', 'ted']
    
    for w in beto_words:
        try:
            text = text.replace(w, 'beto')
        except:
            pass
    for w in cruz_words:
        try:
            text = text.replace(w, 'cruz')
        except:
            pass
    
    return text

In [7]:
# apply replace_names() on each tweet
text_column = []
for t in tweets.itertuples():
    text = t.text
    new_text = replace_names(text)
    text_column.append(new_text)

In [8]:
tweets.text = text_column

In [9]:
tweets.shape

(5741, 2)

#### City stuff

In [10]:
# Let's get a list of what we'll consider big cities/towns in Texas.
large_cities = open('large_cities.txt', 'r')
cities = []
for line in large_cities:
    l = line.strip()
    l_list = l.split()
    if l_list[2].isalpha():
        city = l_list[1] + ' ' + l_list[2]
    else:
        city = l_list[1]
    cities.append(city)
cities = cities[0:11]
print (cities)

['Houston', 'San Antonio', 'Dallas', 'Austin', 'Fort Worth', 'El Paso', 'Arlington', 'Corpus Christi', 'Plano', 'Laredo', 'Lubbock']


In [11]:
# get column for location that is either Big, Small, or none
location_column = []
for t in tweets.itertuples():
    if type(t.location) == str:
        for c in cities:
            if c in t.location:
                location_column.append('Big')
                break
            elif c == cities[-1] and (', TX' in t.location or ', Texas' in t.location):
                location_column.append('Small')
                break
            elif c == cities[-1]:
                location_column.append('none')
                break
    else:
        location_column.append('none')

In [12]:
# replace old location column with new
tweets.location = location_column

In [13]:
tweets.shape

(5741, 2)

In [14]:
tweets.head(6)

Unnamed: 0,location,text
0,none,"b""rt @aliadair22: \xf0\x9f\x90\xa6next, beto b..."
1,none,"b""rt @aliadair22: \xf0\x9f\x90\xa6next, beto b..."
2,none,"b""cruz cruz, beto try to rally latino voters i..."
3,none,b'rt @ronnehring: third poll now showing cruz ...
4,none,"b'beto o\xe2\x80\x99beto, the democratic congr..."
5,Big,b'rt @wfaa: does texas\xe2\x80\x99 senate race...


#### Calculate Lift for Candidate / City size

Get count of tweets mentioning Beto / Cruz, and count of all tweets with Beto and/or Cruz in it

In [15]:
beto_count = 0
cruz_count = 0
n_tweets = 0
for t in tweets.itertuples():
    if t.location != 'none':
        if 'beto' in t.text and 'cruz' in t.text:
            beto_count += 1
            cruz_count += 1
            n_tweets += 1
        elif 'beto' in t.text:
            beto_count += 1
            n_tweets += 1
        elif 'cruz' in t.text:
            cruz_count += 1        
            n_tweets += 1

Get count of tweets from large / small cities in Texas

In [16]:
big_city_count = 0
small_city_count = 0
for t in tweets.itertuples():
    if t.location == 'Big':
        big_city_count += 1
    elif t.location == 'Small':
        small_city_count += 1

Get count of tweets with Beto/Big, Beto/Small, Cruz/Big, Cruz/Big

In [17]:
bb_count = 0
bs_count = 0
cb_count = 0
cs_count = 0
for t in tweets.itertuples():
    if 'beto' in t.text and t.location == 'Big':
        bb_count += 1
    if 'beto' in t.text and t.location == 'Small':
        bs_count += 1
    if 'cruz' in t.text and t.location == 'Big':
        cb_count += 1
    if 'cruz' in t.text and t.location == 'Small':
        cs_count += 1

Calculate lift Beto/Big, Beto/Small, Cruz/Big, Cruz/Big

In [18]:
candidate_counts = [beto_count, cruz_count]
city_counts = [big_city_count, small_city_count]
combo_counts = [bb_count, bs_count, cb_count, cs_count]
lifts = [] # order: [0] beto vs big, [1] beto vs small, [2] cruz vs big, [3] cruz vs small
i = 0
for cand_count in candidate_counts:
    for city_count in city_counts:
        lifts.append( (n_tweets * combo_counts[i]) / (cand_count * city_count) )
        i+=1

In [19]:
lifts = np.reshape(lifts, (2, 2)).T # reshape for similarities matrix

In [20]:
similarities = pd.DataFrame(lifts, columns=['beto', 'cruz'], index=['big city', 'small city'])

#### <i>Lift Matrix<i>

In [21]:
similarities

Unnamed: 0,beto,cruz
big city,0.779859,0.796016
small city,0.773519,0.733456


#### Start sentimental stuff :)

Define <i>get_substring(key_word, s)<i>

In [22]:
# ask if key word in the string
# split into list
# identify the index of the key word
# get list of +/- 3 indexes from key word
# concated list back to string
# return string
def get_substring(key_word, s):
    """
        1. ask if key word in the string
        2. split into list
        3. identify the index of the key word
        4. get list of +/- radius indexes from key word
        5. concated list back to string
        6. return string
    """

    # get rid of "b'" at beginning of tweet
    s = s[2:]

    # get rid of rt stuff if there
    if 'rt @' in s:
        end_of_rt = s.index(':')+2
        s = s[end_of_rt:]

    s_list = s.split()

    radius = 5 # set the radius

    # using this ugly mess to try to get beto or cruz if they are next to commas or exclamation
    try:
        kw_index = s_list.index(key_word)
    except:
        try:
            kw_index = s_list.index(key_word+',')
        except:
            try:
                kw_index = s_list.index(key_word+'!')
            except:
                try:
                    kw_index = s_list.index(key_word+'.')
                except:
                    return

    # get up until key word
    sub_s_beg = []
    if radius > kw_index:
        sub_s_beg = s_list[:kw_index]
    else:
        sub_s_beg = s_list[kw_index-radius:kw_index]

    # get key word and after
    len_kw_index_to_end = len(s_list[kw_index+1:])
    sub_s_end = []
    if radius < len_kw_index_to_end:
        sub_s_end = s_list[kw_index:kw_index+radius+1]
    else:
        sub_s_end = s_list[kw_index:]

    sub_s = (' ').join(sub_s_beg + sub_s_end)
    return sub_s

In [23]:
analyser = SentimentIntensityAnalyzer()

In [24]:
bb_sent = []
bs_sent = []
cb_sent = []
cs_sent = []
for t in tweets.itertuples():

    if t.location == 'Big':
        if 'beto' in t.text:
            sub_text = get_substring('beto', t.text)
            if sub_text:
                snt = analyser.polarity_scores(sub_text)
                bb_sent.append(snt['compound'])
            
        if 'cruz' in t.text:
            sub_text = get_substring('cruz', t.text)
            if sub_text:
                snt = analyser.polarity_scores(sub_text)
                cb_sent.append(snt['compound'])
            
    elif t.location == 'Small':
        if 'beto' in t.text:
            sub_text = get_substring('beto', t.text)
            if sub_text:
                snt = analyser.polarity_scores(sub_text)
                bs_sent.append(snt['compound'])

        if 'cruz' in t.text:
            sub_text = get_substring('cruz', t.text)
            if sub_text:
                snt = analyser.polarity_scores(sub_text)
                cs_sent.append(snt['compound'])

In [25]:
# get average of list
def avg(lst): 
    return sum(lst) / len(lst) 

In [26]:
# order [0] beto/big [1] cruz/big [2] beto/small [3] cruz/small
sentiments = [avg(bb_sent), avg(cb_sent), avg(bs_sent), avg(cs_sent)]

In [27]:
sentiments = np.reshape(sentiments, (2, 2)) # reshape for similarities matrix

In [28]:
sentiments = pd.DataFrame(sentiments, columns=['beto', 'cruz'], index=['big city', 'small city'])

#### <i>Sentiment matrix<i>

In [29]:
sentiments

Unnamed: 0,beto,cruz
big city,0.063009,-0.033636
small city,0.080568,-0.026323


# Task F

#### Task E

_Note: It needs to be addressed that our results would be better if ran on more data. From our original dataset of 4000+ tweets, we only found just over 400 with locations equal to a legitiment locations in Texas. This is very limiting, and will effect the legitimacy of our results. In our coming analysis, we assume that our data is legitement for the purpose of this assignment._

Our lift matrix between the candidates and big/small cities in Texas tells us that Beto is talked about more by people from small cities and Cruz in big cities. This differences are very small, only changes to the 100th's decimal place. Because of the results given from this data it is hard to give any advice from lift values only.

For our sentiment analysis, it was clear that sentiment was negative for Cruz and positive for Beto. 

For advice, we need to let Ted Cruz's campaign know that on Twitter, sentiment towards their candidate is generally negative. This should be addressed by their campain.