In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import csv

In [4]:
import datetime as dt

In [217]:
# Load visualization tools
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [218]:
# Load csv as dataframe
df = pd.read_csv("trump_schedule_leak.csv")

In [219]:
df=df[df['top_category']!='no_data']

In [220]:
# Prepare start/end times for datetime conversion
for col in ['time_start','time_end']:
    df[col]=df['date']+" "+df[col]

In [221]:
# Convert datetime fields to correct datetime format
for col in ['date','time_start','time_end']:
    df[col]=pd.to_datetime(df[col])

In [222]:
# Pull back out time only
for col in ['time_start','time_end']:
    df[col]=df[col].dt.time

In [223]:
# Generate list of all days in range
from datetime import datetime, timedelta

def datetime_range(start, end, delta):
    current = start
    while current < end:
        yield current
        current += delta

days = [dt.strftime('%Y-%m-%d') for dt in 
       datetime_range(df['date'].min(), df['date'].max(), 
       timedelta(days=1))]

In [224]:
# Generate list of 15-minute chunks
from datetime import datetime, timedelta

def datetime_range(start, end, delta):
    current = start
    while current < end:
        yield current
        current += delta

dts = [dt.strftime('%H:%M') for dt in 
       datetime_range(datetime(2019, 1, 1, 0), datetime(2019, 1, 2, 0), 
       timedelta(minutes=15))]

In [225]:
# Create list of all categories
cats = ['no_data']+list(df['top_category'].unique())

In [226]:
# Encode categories as a number
df['top_category_encoding']=[cats.index(cat) for cat in df['top_category']]

In [227]:
# Save numeric encoding information for use later
encodings = {}
for cat in cats:
    encodings[cats.index(cat)]=cat

In [228]:
# Create schedule flag for each 15-minute chunk
for time in dts:
    dttime = dt.datetime.strptime(time, "%H:%M").time()
    df[time]=((df['time_start'] <= dttime) & (dttime < df['time_end']))*1
    df[time]=df[time]*df['top_category_encoding']

In [229]:
schedule = pd.DataFrame(df.filter(items=(['date']+dts))).groupby(by='date').sum()

In [230]:
schedule_encoded = schedule.replace(encodings)

# Import Tweet Data

In [231]:
tweets = pd.read_csv("trump_tweets_id_dt.txt",sep=" ", index_col=0).rename(columns={"DateTime":'datetime',"ID":'id'})

In [232]:
# Convert to datetime
tweets['datetime']=pd.to_datetime(tweets['datetime'])

In [233]:
tweets=tweets.sort_values(by="datetime")

In [234]:
# Localize time zone
tweets['datetime']=pd.DatetimeIndex(pd.to_datetime(tweets['datetime'],unit='ms')).tz_convert('US/Eastern').tz_localize(None)

In [235]:
# Separate out date and time for axes
tweets['date']=tweets['datetime'].dt.date
tweets['time']=tweets['datetime'].dt.time

In [236]:
# Clip tweet dataframe to overlap with schedule data
tweets = tweets[tweets['datetime']>df['date'].min()]

In [237]:
#tweets['time']=[t.strftime('%H:%M') for t in tweets['time']]

In [241]:
tweets=tweets.sort_values(by='time')

# Heatmap (1-Trace)

In [242]:
schedule.columns = [datetime.strptime(t, '%H:%M') for t in schedule.columns]

TypeError: strptime() argument 1 must be str, not Timestamp

In [243]:
heatmap = go.Heatmap(
    z=schedule.values,
    x=schedule.columns,
    y=schedule.index,
    text=schedule_encoded.values
)

scatterplot = go.Scatter(
    x = tweets['time'],
    y = tweets['date'],
    mode = 'markers'
)

data = [scatterplot,heatmap]

layout = go.Layout(
    title = "Trump Schedule",
    hovermode='closest'
)

fig = go.Figure(data=data,layout=layout)
py.offline.plot(fig)

'file:///Users/janeadmin/Documents/trump_tweet_time/temp-plot.html'

In [None]:
z = schedule.values

label = schedule_encoded.values
    
hover=label

fig = ff.create_annotated_heatmap(z,colorscale='Portland')
py.offline.plot(fig, filename='annotated_heatmap')

# Heatmap using multiple traces

In [None]:
# Create an array of data frame schedules; one for each category
dfs = []
for cat in cats:
    # Create a new dataframe for the selected category with date as index and times as columns
    catdf = pd.DataFrame(df[df['top_category']==cat],columns=(['date']+dts))
    catdf = catdf.groupby(by='date').sum()
    dfs.append(catdf)
    # print(catdf.shape)

In [None]:
for cat in cats:
    # Add missing dates with 0's
    for d in df['date'].unique():
        # If the date isn't already listed in this dataframe
        if d not in dfs[cats.index(cat)].index:
            #Add it and fill all columns with zeroes
            dfs[cats.index(cat)].loc[d]=[0] * len(dfs[cats.index(cat)].columns)
        # If the date is already in the dataframe for this category, don't do anything
        else:
            pass
        # IMPORTANT: Replace all zero's with NULL - sets the transparency in the heatmap
        dfs[cats.index(cat)] = dfs[cats.index(cat)].replace(0, np.nan)
    # print(dfs[cats.index(cat)].shape)

In [None]:
# Demo for querying schedule dataframes:
# dfs[cats.index('executive_time')]['08:00']['2018-11-07']

In [None]:
# Set colors for visualization
colors = ['red','orange','yellow','green','blue','purple','brown','black','pink']

In [None]:
cats

In [None]:
graphs = []

# Thanks etpinard for the trace hack: https://codepen.io/etpinard/pen/GmbVZq?editors=0010

for cat in cats:
    num = cats.index(cat)
    color = colors[num]
    graph = go.Heatmap(
        name=str(cat),
        z=dfs[cats.index(cat)].values,
        x=list(dfs[cats.index(cat)].columns),
        y=list(dfs[cats.index(cat)].index),
        colorscale=[[0, color], [1, color]],
        colorbar=dict(
            lenmode='pixels',
            len=50,
            y=1-(0.1*num),
            tickvals=[0],
            ticktext=[''],
            title=str(cat)
        )
    )
    graphs.append(graph)

In [None]:
layout = go.Layout(
    title = "Trump Schedule",
    hovermode='closest'
)

fig = go.Figure(data=graphs,layout=layout)
py.offline.plot(fig)