In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import csv

In [4]:
import datetime as dt

In [5]:
# Load visualization tools
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [6]:
# Load csv as dataframe
leak = pd.read_csv("trump_schedule_leak.csv")

In [7]:
leak=leak[leak['top_category']!='no_data']

In [8]:
# Prepare start/end times for datetime conversion
for col in ['time_start','time_end']:
    leak[col]=leak['date']+" "+leak[col]

In [9]:
# Convert datetime fields to correct datetime format
for col in ['date','time_start','time_end']:
    leak[col]=pd.to_datetime(leak[col])

In [10]:
# Pull back out time only
for col in ['time_start','time_end']:
    leak[col]=leak[col].dt.time

In [11]:
# Generate list of 15-minute chunks
from datetime import datetime, timedelta

def datetime_range(start, end, delta):
    current = start
    while current < end:
        yield current
        current += delta

dts = [dt.strftime('%H:%M:%S') for dt in 
       datetime_range(datetime(2019, 1, 1, 0), datetime(2019, 1, 2, 0), 
       timedelta(minutes=15))]

In [12]:
# Create list of all categories
cats = ['no_data']+list(leak['top_category'].unique())

In [13]:
# Encode categories as a number
leak['top_category_encoding']=[cats.index(cat) for cat in leak['top_category']]

In [14]:
# Save numeric encoding information for use later
encodings = {}
for cat in cats:
    encodings[cats.index(cat)]=cat

In [15]:
# Create schedule flag for each 15-minute chunk
for time in dts:
    dttime = dt.datetime.strptime(time, "%H:%M:%S").time()
    leak[time]=((leak['time_start'] <= dttime) & (dttime < leak['time_end']))
    # Use giant number as a flag for empty time blocks, because '0' will be used for indexing
    leak[time] = [leak.index if x else 100000 for x in leak[time]]

In [16]:
schedule = pd.DataFrame(leak.filter(items=(['date']+dts))).groupby(by='date').sum()

In [17]:
for col in schedule.columns:
    # Convert giant number flag for empty time blocks back to 'None'
    schedule[col]=[None if int(x)>10000 else x for x in schedule[col]]

In [18]:
schedule_category=pd.DataFrame(columns=schedule.columns)

In [19]:
nullset = {'top_category':'no_data'}

In [20]:
nullset['top_category']

'no_data'

In [21]:
def lookup_event(value):
    df = pd.DataFrame(columns=schedule.columns)
    # Replace schedule item type numeric encoding with string value for activity
    for col in schedule.columns:
        df[col] = [leak[value][v] if v else nullset[value] for v in schedule[col]]
    return df

In [26]:
top_cat = lookup_event('top_category')

In [29]:
dts[30]

'07:30:00'

In [30]:
top_cat[dts[40]][12]

KeyError: '10:00:00'

# Import Tweet Data

In [419]:
# Import tweets from text file and rename columns
tweets = pd.read_csv("trump_tweets_id_dt.txt",sep=" ", index_col=0).rename(columns={"DateTime":'datetime',"ID":'id'})

In [420]:
# Convert to datetime
tweets['datetime']=pd.to_datetime(tweets['datetime'])

In [421]:
# Sort tweets by datetime
tweets=tweets.sort_values(by="datetime")

In [422]:
# Localize time zone
tweets['datetime']=pd.DatetimeIndex(pd.to_datetime(tweets['datetime'],unit='ms')).tz_convert('US/Eastern').tz_localize(None)

In [423]:
# Separate out date and time for axes
tweets['date']=tweets['datetime'].dt.date
tweets['time']=tweets['datetime'].dt.time

In [424]:
# Clip tweet dataframe to overlap with schedule data
tweets = tweets[tweets['datetime']>df['date'].min()]

In [426]:
# Sort tweets by timestamp
tweets=tweets.sort_values(by='time')

In [427]:
# Remove spurious index
tweets=tweets.reset_index()
tweets.drop(columns=['index'], inplace=True)

# Import Tweet content

In [174]:
import json

In [176]:
# Load tweet content into array of dictionaries
tweetcontent = []
for line in open('all_trump_tweets.json', 'r'):
    tweetcontent.append(json.loads(line))

In [354]:
#list(filter(lambda tweetcontent: tweetcontent['id'] == 1042830534513229825, tweetcontent))[0];

In [343]:
# Pull out tweet text
def find_tweettext(id):
    x = list(filter(lambda tweetcontent: tweetcontent['id'] == id, tweetcontent))[0]
    """
    # Find all onegrams
    onegrams = x['text'].split(' ')
    # Find the number of five-gram sets
    fivegramcount = len(onegrams)%5
    c = 1
    # After every 5th word, add a break character
    while c <= fivegramcount+1:
        onegrams.insert(5*c,"<br />")
        c = c+1
    text = " "
    text = text.join(onegrams)
    text = "\""+text+"\""
    """
    text = x['text']
    return text

In [336]:
# Pull out tweet URL for reference
def find_tweeturl(id):
    x = list(filter(lambda tweetcontent: tweetcontent['id'] == id, tweetcontent))[0]
    try:
        link = x['entities']['urls'][0]['url']
        url = "<a href=\""+link+"\">"+link+"</a>"
    # If there's no URL provided, return an empty string
    except:
        url=''
    return url

In [337]:
# Find the tweet source
# by pulling out the last word in the "Twitter for Android/iPhone" link
# and removing the '</a>' characters
def find_tweetsrc(id):
    x = list(filter(lambda tweetcontent: tweetcontent['id'] == id, tweetcontent))[0]
    src = str.lower(x['source'].split(' ')[-1][:-4])
    return src

In [363]:
# Find the creation date
def find_tweetdate(id):
    x = list(filter(lambda tweetcontent: tweetcontent['id'] == id, tweetcontent))[0]
    date = x['created_at']
    return date

In [364]:
def build_annotation(id):
    text = find_tweettext(id)
    src = find_tweetsrc(id)
    date = find_tweetdate(id)
    annotation = date+"<br />"+src+"<br />\""+text+"\"<br />"
    return annotation

In [433]:
tweets['text']=[find_tweettext(id) for id in tweets['id']]

In [434]:
tweets['url']=[find_tweeturl(id) for id in tweets['id']]

In [435]:
tweets['src']=[find_tweetsrc(id) for id in tweets['id']]

In [436]:
tweets['annotation']=[build_annotation(id) for id in tweets['id']]

In [341]:
tweets.head(10)

Unnamed: 0,datetime,id,date,time,fakedate,faketime,text,url,src,annotation
0,2018-11-11 00:40:50,1061554334276747264,2018-11-11,00:40:50,1900-01-01,1900-01-01 00:40:50,"With proper Forest Management, we <br /> can s...",,iphone,"With proper Forest Management, we <br /> can s..."
1,2018-11-30 00:52:04,1068442531887632384,2018-11-30,00:52:04,1900-01-01,1900-01-01 00:52:04,"Oh, I get it! I <br /> am a very good develope...","<a href=""https://t.co/hujMqC8veV"">https://t.co...",iphone,"Oh, I get it! I <br /> am a very good develope..."
2,2018-11-30 00:59:03,1068444287380025344,2018-11-30,00:59:03,1900-01-01,1900-01-01 00:59:03,....Lightly looked at doing a <br /> building ...,,iphone,....Lightly looked at doing a <br /> building ...
3,2018-12-01 02:16:08,1068826073775964160,2018-12-01,02:16:08,1900-01-01,1900-01-01 02:16:08,"President George H.W. Bush led <br /> a long, ...","<a href=""https://t.co/NV8tJkugEj"">https://t.co...",iphone,"President George H.W. Bush led <br /> a long, ..."
4,2018-12-07 02:18:59,1071001119697108992,2018-12-07,02:18:59,1900-01-01,1900-01-01 02:18:59,Robert Mueller and Leakin’ Lyin’ <br /> James ...,"<a href=""https://t.co/Rp0tVvuIk9"">https://t.co...",iphone,Robert Mueller and Leakin’ Lyin’ <br /> James ...
5,2018-11-26 02:19:41,1067015026995879937,2018-11-26,02:19:41,1900-01-01,1900-01-01 02:19:41,Mexico should move the flag <br /> waving Migr...,"<a href=""https://t.co/IqgnrBTQFl"">https://t.co...",iphone,Mexico should move the flag <br /> waving Migr...
6,2018-11-07 02:21:51,1060130202418864129,2018-11-07,02:21:51,1900-01-01,1900-01-01 02:21:51,Received so many Congratulations from <br /> s...,"<a href=""https://t.co/7lZw3mapFD"">https://t.co...",iphone,Received so many Congratulations from <br /> s...
7,2018-12-20 02:25:51,1075713886874148865,2018-12-20,02:25:51,1900-01-01,1900-01-01 02:25:51,“I’m proud of the President <br /> today to he...,"<a href=""https://t.co/4CZ8WDJLDE"">https://t.co...",iphone,“I’m proud of the President <br /> today to he...
8,2018-12-18 02:28:39,1074989817061093378,2018-12-18,02:28:39,1900-01-01,1900-01-01 02:28:39,"Biggest outrage yet in the <br /> long, windin...","<a href=""https://t.co/9EWfa43jdC"">https://t.co...",iphone,"Biggest outrage yet in the <br /> long, windin..."
9,2018-12-07 02:28:50,1071003598023221248,2018-12-07,02:28:50,1900-01-01,1900-01-01 02:28:50,....Will Robert Mueller’s big time <br /> conf...,"<a href=""https://t.co/2nyd2eYEse"">https://t.co...",iphone,....Will Robert Mueller’s big time <br /> conf...


# Heatmap (1-Trace)

In [277]:
schedule.columns = [datetime.strptime(t, '%H:%M:%S') for t in schedule.columns]

ValueError: time data '1900-01-01 00:00:00' does not match format '%H:%M:%S'

In [278]:
# Convert schedule time to a string
schedule.columns = [str(t) for t in schedule.columns]

In [428]:
# Add a placeholder date to fix datetime alignment problem
tweets['fakedate']='1900-01-01'

In [429]:
# Convert time to a string for merging
tweets['time'] = [str(t) for t in tweets['time']]

In [430]:
# Merge placeholder date and actual timestamp to fix axes alignment problem
tweets['faketime']=tweets['fakedate']+" "+tweets['time']

In [431]:
# Convert new placeholder time to datetime for axes
tweets['faketime']=pd.to_datetime(tweets['faketime'])

In [437]:
heatmap = go.Heatmap(
    z=schedule.values,
    x=schedule.columns,
    y=schedule.index,
    text=schedule_encoded.values,
    hoverinfo='text',
    name='Schedule',
    colorscale='Viridis',
    xgap=0.5,
    ygap=0.5
)

scatterplot = go.Scatter(
    x = tweets['faketime'],
    y = tweets['date'],
    text = tweets['annotation'],
    hoverinfo='text',
    mode = 'markers',
    name='Tweets',
    marker = dict(
        size = 5,
        color = 'red',
        line = dict(
            color = 'white',
            width = 1,
        )
    )
)

data = [heatmap,scatterplot]
#data = [heatmap]

layout = go.Layout(
    title = "Schedule",
    hovermode='closest',
    xaxis=dict(
        title='Time of Day',
        range=[schedule.columns[28],schedule.columns.max()]
    ),
    height=800
)

fig = go.Figure(data=data,layout=layout)
py.offline.iplot(fig)
#py.offline.plot(fig,'trump_schedule_tweet_overlay.html')




In [49]:
z = schedule.values

label = schedule_encoded.values
    
hover=label

fig = ff.create_annotated_heatmap(z,colorscale='Portland')
py.offline.iplot(fig, filename='annotated_heatmap')

# Heatmap using multiple traces

In [50]:
# Create an array of data frame schedules; one for each category
dfs = []
for cat in cats:
    # Create a new dataframe for the selected category with date as index and times as columns
    catdf = pd.DataFrame(df[df['top_category']==cat],columns=(['date']+dts))
    catdf = catdf.groupby(by='date').sum()
    dfs.append(catdf)
    # print(catdf.shape)

In [51]:
for cat in cats:
    # Add missing dates with 0's
    for d in df['date'].unique():
        # If the date isn't already listed in this dataframe
        if d not in dfs[cats.index(cat)].index:
            #Add it and fill all columns with zeroes
            dfs[cats.index(cat)].loc[d]=[0] * len(dfs[cats.index(cat)].columns)
        # If the date is already in the dataframe for this category, don't do anything
        else:
            pass
        # IMPORTANT: Replace all zero's with NULL - sets the transparency in the heatmap
        dfs[cats.index(cat)] = dfs[cats.index(cat)].replace(0, np.nan)
    # print(dfs[cats.index(cat)].shape)

In [52]:
# Demo for querying schedule dataframes:
# dfs[cats.index('executive_time')]['08:00']['2018-11-07']

In [53]:
# Set colors for visualization
colors = ['red','orange','yellow','green','blue','purple','brown','black','pink']

In [54]:
cats

['no_data', 'executive_time', 'meeting', 'lunch', 'travel', 'event']

In [55]:
graphs = []

# Thanks etpinard for the trace hack: https://codepen.io/etpinard/pen/GmbVZq?editors=0010

for cat in cats:
    num = cats.index(cat)
    color = colors[num]
    graph = go.Heatmap(
        name=str(cat),
        z=dfs[cats.index(cat)].values,
        x=list(dfs[cats.index(cat)].columns),
        y=list(dfs[cats.index(cat)].index),
        colorscale=[[0, color], [1, color]],
        colorbar=dict(
            lenmode='pixels',
            len=50,
            y=1-(0.1*num),
            tickvals=[0],
            ticktext=[''],
            title=str(cat)
        )
    )
    graphs.append(graph)

In [56]:
layout = go.Layout(
    title = "Trump Schedule",
    hovermode='closest'
)

fig = go.Figure(data=graphs,layout=layout)
py.offline.iplot(fig)