# "US Presidential Candidate Tweets"
> "A visualization of recent tweets from candidates in the 2020 US Presidential Election."

- toc: false
- branch: master
- badges: true
- comments: true
- categories: [Data Visualization]
- hide: false
- search_exclude: false
- image: images/blog_posts/candidate_tweets/candidate_tweets_cover.png

In [142]:
#hide
from nltk.corpus import stopwords 
from textblob import TextBlob
import altair as alt
import pandas as pd
import collections
import itertools
import datetime
import re

ThemeRegistry.enable('vox_theme')

In [178]:
#hide
def generate_date_list(num_days):
    """Return the most recent `num_days` dates as 'YYYY-MM-DD' strings.

    The first entry is today's date (as reported by `datetime.date.today()`),
    and each following entry is one day earlier than the previous one.
    """
    today = datetime.date.today()
    formatted_days = []
    for days_back in range(num_days):
        day = today - datetime.timedelta(days=days_back)
        formatted_days.append(day.strftime('%Y-%m-%d'))
    return formatted_days


def add_columns_to_df(df):
    """Return a copy of `df` enriched with the columns the charts rely on.

    The input must provide `created_at`, `username` and `tweet_text`.
    The returned frame additionally contains:

    - `created_at`: parsed with `pd.to_datetime`
    - `username_label`: the username prefixed with '@'
    - `created_at_est`: `created_at` treated as UTC and converted to US/Eastern
    - `sentiment` / `subjectivity`: TextBlob polarity and subjectivity per tweet
    - `color`: 'steelblue' for JoeBiden, 'red' for any other username
    - `created_day`: the US/Eastern date formatted as 'MM-DD'

    The caller's frame is not modified; all new columns live on the returned copy.
    """
    # Work on a copy so the caller's frame is never left half-updated.
    df = df.copy()

    df['created_at'] = pd.to_datetime(df['created_at'])
    df['username_label'] = '@' + df['username']

    # utc=True localizes naive timestamps to UTC and converts tz-aware ones,
    # so this works whether or not the source data carries an offset.
    df['created_at_est'] = (
        pd.to_datetime(df['created_at'], utc=True).dt.tz_convert('US/Eastern')
    )

    # Analyse every tweet once and reuse the result for both scores.
    sentiments = [TextBlob(tweet).sentiment for tweet in df['tweet_text']]
    df['sentiment'] = [score.polarity for score in sentiments]
    df['subjectivity'] = [score.subjectivity for score in sentiments]

    df['color'] = ['steelblue' if user == 'JoeBiden' else 'red' for user in df['username']]
    df['created_day'] = df['created_at_est'].dt.strftime('%m-%d')
    return df


def generate_df(usernames, number_of_days, base_url=None):
    """Download the daily tweet CSVs for `usernames` and stack them into one frame.

    For every username and each of the last `number_of_days` dates (see
    `generate_date_list`) the file `<base_url><username>/<YYYY-MM-DD>.csv` is read.
    Files that cannot be opened or that are empty are skipped, since not every
    user/day combination is expected to exist.

    Parameters
    ----------
    usernames : iterable of str
        Account names to fetch.
    number_of_days : int
        How many days back from today to look.
    base_url : str, optional
        Location prefix of the data set; defaults to the
        joshkraft/daily-candidate-tweets repository on GitHub.

    Returns
    -------
    pandas.DataFrame
        All rows that could be loaded, or an empty frame when nothing was found.
    """
    if base_url is None:
        base_url = "https://raw.githubusercontent.com/joshkraft/daily-candidate-tweets/main/data/"

    dates = generate_date_list(number_of_days)
    frames = []

    for username in usernames:
        for date in dates:
            file_location = base_url + username + "/" + str(date) + ".csv"
            try:
                frames.append(pd.read_csv(file_location))
            except (OSError, pd.errors.EmptyDataError):
                # OSError covers HTTP 404s / network failures (urllib errors
                # derive from it) and missing local files. Anything else, such
                # as a malformed CSV, is a real problem and is allowed to surface.
                continue

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)


def extract_n_most_frequent_words(tweets, num_words):
    """Count the most common non-stopword terms in the `tweet_text` column.

    Each tweet has URLs and every character other than ASCII letters, digits,
    spaces and tabs removed, is lower-cased and split on whitespace. English
    NLTK stopwords are discarded before counting. Rows with missing text are
    ignored.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a `tweet_text` column.
    num_words : int
        Number of top words to return.

    Returns
    -------
    pandas.DataFrame
        Columns `word` and `times_used`, ordered from most to least frequent.
    """
    stop_words = set(stopwords.words('english'))
    # Raw string: same pattern as before, without invalid string escapes.
    noise = re.compile(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    # An empty frame may not even have the column; treat it as "no tweets".
    texts = tweets['tweet_text'].dropna().astype(str) if len(tweets) else []

    counts = collections.Counter()
    for text in texts:
        cleaned = noise.sub("", text)
        counts.update(
            word for word in cleaned.lower().split() if word not in stop_words
        )

    return pd.DataFrame(counts.most_common(num_words), columns=['word', 'times_used'])

In [184]:
#hide
# Load the last 7 days of tweets for both accounts and derive the chart columns.
df = add_columns_to_df(generate_df(["realDonaldTrump", "JoeBiden"], 7))

# Twelve most used words per account, consumed by the word-frequency charts.
trump_words = extract_n_most_frequent_words(df.loc[df['username'] == 'realDonaldTrump'], 12)
biden_words = extract_n_most_frequent_words(df.loc[df['username'] == 'JoeBiden'], 12)

In [199]:
#hide 
# SOURCE: https://github.com/chekos/altair_themes_blog/blob/master/notebooks/vox_theme.py
def vox_theme():
    """Build the Vox-inspired Altair theme as a plain dictionary.

    The returned dict has a single 'config' entry holding the background,
    a default colour for every mark type, axis and legend styling, and the
    categorical colour range.
    """
    mark_color = '#3e5c69'

    # Which colour channel each mark type receives its default colour on.
    mark_channels = [
        ('arc', 'fill'),
        ('area', 'fill'),
        ('line', 'stroke'),
        ('path', 'stroke'),
        ('rect', 'fill'),
        ('shape', 'stroke'),
        ('symbol', 'fill'),
    ]

    config = {'background': '#fff'}
    for mark, channel in mark_channels:
        config[mark] = {channel: mark_color}

    config['axis'] = {
        'domainWidth': 0.5,
        'grid': True,
        'labelPadding': 2,
        'tickSize': 5,
        'tickWidth': 0.5,
        'titleFontWeight': 'normal',
    }
    config['axisBand'] = {'grid': False}
    config['axisX'] = {'gridWidth': 0.2}
    config['axisY'] = {'gridDash': [3], 'gridWidth': 0.4}
    config['legend'] = {
        'labelFontSize': 14,
        'padding': 1,
        'symbolType': 'square',
    }
    config['range'] = {
        'category': [
            '#3e5c69',
            '#6793a6',
            '#182429',
            '#0570b0',
            '#3690c0',
            '#74a9cf',
            '#a6bddb',
            '#e2ddf2',
        ],
    }

    return {'config': config}


# Make the theme function available to Altair under the name 'vox_theme'.
alt.themes.register('vox_theme', vox_theme)

# Switch the active Altair theme to the one registered above.
alt.themes.enable('vox_theme')

ThemeRegistry.enable('vox_theme')

In [206]:
#hide
def threshold_artifacts():
    """Build the background layers that annotate the sentiment axis.

    The result is a layered Altair chart made of three shaded bands
    (positive 0.5 to 1, neutral -0.2 to 0.2, negative -0.5 to -1), horizontal
    rules at -0.5, 0 and 0.5, and a text label for each rule.
    """
    band_bounds = pd.DataFrame({
        'positive_min': [.5],
        'positive_max': [1],
        'negative_min': [-.5],
        'negative_max': [-1],
        'neutral_min': [-.2],
        'neutral_max': [.2],
    })

    def shaded_band(start_field, end_field, fill, alpha):
        # One translucent rectangle spanning the two y fields.
        return alt.Chart(band_bounds).mark_rect(color=fill, opacity=alpha).encode(
            y=start_field,
            y2=end_field,
        )

    positive_band = shaded_band('positive_min:Q', 'positive_max:Q', '#488f31', 0.06)
    negative_band = shaded_band('negative_min:Q', 'negative_max:Q', '#f59b56', 0.06)
    neutral_band = shaded_band('neutral_min:Q', 'neutral_max:Q', 'gray', 0.1)

    labelled_levels = pd.DataFrame({
        "th_value": [-0.5, 0, 0.5],
        "th": ["Negative", "Neutral", "Positive"],
    })
    levels_chart = alt.Chart(labelled_levels)

    level_rules = levels_chart.mark_rule().encode(
        y='th_value:Q',
        color=alt.value('#224455'),
        opacity=alt.value(0.5),
    )

    # dx/dy shift each label left of and slightly above its rule.
    level_labels = levels_chart.mark_text(align='left', dx=-340, dy=-5).encode(
        y=alt.Y('th_value:Q'),
        text='th',
        opacity=alt.value(0.8),
    )

    return positive_band + neutral_band + negative_band + level_rules + level_labels

In [212]:
#hide_input
def plot_main_chart(df):
    """Assemble the interactive sentiment-over-time dashboard for `df`.

    Layout: a large scatter of sentiment against tweet time (with tooltips and
    the threshold layers from `threshold_artifacts`) next to a clickable user
    legend, and below them a short overview strip. Dragging an interval on the
    strip sets the x-domain of the large scatter; clicking legend entries
    filters the points by colour.
    """
    user_click = alt.selection_multi(encodings=['color'])
    time_brush = alt.selection(type='interval', encodings=['x'])

    # Shared scatter definition; axis settings here are the overview strip's.
    strip_x = alt.X(
        'created_at_est:T',
        axis=alt.Axis(title='Drag to Select Interval', tickCount=0),
    )
    strip_y = alt.Y(
        'sentiment:Q',
        axis=alt.Axis(title='', tickCount=0, grid=False),
        scale=alt.Scale(domain=(-1.1, 1.1)),
    )
    scatter = alt.Chart(df).mark_circle(opacity=0.5, size=120)
    base = scatter.encode(
        x=strip_x,
        y=strip_y,
        color=alt.Color('color', scale=None),
    )
    base = base.transform_filter(user_click)
    base = base.properties(width=700, height=450)

    # Detail view: x-domain follows the brush, with proper axes and tooltips.
    detail_x = alt.X(
        'created_at_est:T',
        axis=alt.Axis(title='Tweeted At', labels=True),
        scale=alt.Scale(domain=time_brush),
    )
    detail_y = alt.Y(
        'sentiment:Q',
        axis=alt.Axis(title='Sentiment', tickCount=4),
        scale=alt.Scale(domain=(-1, 1)),
    )
    upper = base.encode(
        x=detail_x,
        y=detail_y,
        tooltip=['username', 'tweet_text'],
    )
    upper = upper.add_selection(alt.selection_single())

    # Overview strip: same points, reduced height, owns the brush.
    lower = base.properties(height=40).add_selection(time_brush)

    # Hand-built legend whose rectangles double as click targets.
    legend_color = alt.condition(
        user_click, 'color', alt.value('lightgray'), legend=None, scale=None
    )
    legend = alt.Chart(df).mark_rect().encode(
        y=alt.Y('username_label:N', axis=alt.Axis(title='')),
        color=legend_color,
        size=alt.value(150),
    ).properties(
        selection=user_click,
        title="User",
    )

    detail_with_thresholds = upper + threshold_artifacts()
    top_row = detail_with_thresholds | legend
    return top_row & lower

def plot_word_frequency(words, color, title):
    """Draw a horizontal bar chart of word counts with the count next to each bar.

    `words` supplies the `word` and `times_used` fields, `color` is the bar
    colour and `title` the chart title. Bars are sorted by descending count.
    """
    count_axis = alt.X('times_used:Q', axis=alt.Axis(title='Times Used'))
    word_axis = alt.Y('word:O', sort='-x', axis=alt.Axis(title='Word'))

    bars = alt.Chart(words).mark_bar(color=color).encode(x=count_axis, y=word_axis)
    bars = bars.properties(title=title)

    # Reuse the bar encodings so each label sits at the end of its bar.
    count_labels = bars.mark_text(align='left', baseline='middle', dx=3)
    count_labels = count_labels.encode(text='times_used:Q').properties(width=300)

    return bars + count_labels

<h4 align="center">Recent Tweets from @JoeBiden and @realDonaldTrump</h4>

In [213]:
#hide_input
# Render the interactive sentiment dashboard for the tweets held in `df`.
plot_main_chart(df)

<h4 align="center">Most Frequent Words in Recent Tweets</h4>

In [214]:
#hide_input
# Side-by-side word-frequency charts, coloured to match each account's points in the main chart.
plot_word_frequency(biden_words, 'steelblue', '@JoeBiden') | plot_word_frequency(trump_words, 'red', '@realDonaldTrump')