In [626]:
import pandas as pd

In [627]:
import numpy as np

In [628]:
import csv

In [629]:
import datetime as dt

In [630]:
# Load visualization tools
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [693]:
# Read the leaked schedule CSV into a DataFrame
leak = pd.read_csv(
    "trump_schedule_leak.csv",
)

In [694]:
# Prefix each start/end time with the row's date string so that both columns
# can later be parsed as full datetimes
for bound in ('time_start', 'time_end'):
    leak[bound] = (leak['date'] + " ") + leak[bound]

In [695]:
import pytz
from pytz import timezone

In [696]:
# Parse the date and start/end columns into datetimes, then attach the
# US/Eastern zone to every parsed value
for field in ('date', 'time_start', 'time_end'):
    leak[field] = pd.to_datetime(leak[field])
    leak[field] = list(map(timezone('US/Eastern').localize, leak[field]))

In [697]:
# Reduce the start/end datetimes to their time-of-day component
for bound in ('time_start', 'time_end'):
    time_of_day = leak[bound].dt.time
    leak[bound] = time_of_day

In [698]:
# Generate list of 15-minute chunks
from datetime import datetime, timedelta

def datetime_range(start, end, delta):
    """Yield evenly spaced values from ``start`` up to, but excluding, ``end``.

    Parameters
    ----------
    start : first value yielded (when it is below ``end``).
    end : exclusive upper bound.
    delta : step added after each yielded value.

    Yields
    ------
    ``start``, ``start + delta``, ... for as long as the value is below ``end``.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if there is something to yield but
        ``delta`` is not positive (the loop could never reach ``end``).
    """
    current = start
    # `delta - delta` is the zero of delta's own type (timedelta(0), 0, ...).
    if current < end and not delta > delta - delta:
        raise ValueError("delta must be positive, got %r" % (delta,))
    while current < end:
        yield current
        current += delta

# One 'HH:MM:SS' label for every 15-minute step of a single day
dts = [
    stamp.strftime('%H:%M:%S')
    for stamp in datetime_range(
        datetime(2019, 1, 1, 0),
        datetime(2019, 1, 2, 0),
        timedelta(minutes=15),
    )
]

In [699]:
# Category list: the 'no_data' placeholder first, then each distinct
# top_category value in order of first appearance
cats = ['no_data']
cats.extend(leak['top_category'].unique())

In [700]:
# Numeric code for each row: the position of its category within `cats`
leak['top_category_encoding'] = list(map(cats.index, leak['top_category']))

In [701]:
# Reverse lookup (numeric code -> category name), kept for decoding later
encodings = {cats.index(label): label for label in cats}

In [702]:
# Create one column per 15-minute chunk. Each cell holds the row's category
# code when the row's [time_start, time_end) interval covers that chunk, and
# 0 (the position of 'no_data' in `cats`) otherwise.
for time in dts:
    dttime = dt.datetime.strptime(time, "%H:%M:%S").time()
    in_block = (leak['time_start'] <= dttime) & (dttime < leak['time_end'])
    # Boolean mask * code -> the code inside the event, 0 outside it.
    # NaN never compares equal to anything (itself included), so missing
    # values are replaced with fillna rather than an `== np.nan` test.
    leak[time] = (in_block * leak['top_category_encoding']).fillna(0)

In [703]:
# Sum the per-chunk codes over all rows sharing a date:
# one row per date, one column per 15-minute chunk
schedule = leak.filter(items=['date', *dts]).groupby('date').sum()

# Import Tweet Data

In [704]:
# Read the space-separated tweet id/timestamp file, using its first column as
# the index, then switch the two column names to lowercase
tweets = pd.read_csv(
    "trump_tweets_id_dt.txt",
    sep=" ",
    index_col=0,
)
tweets = tweets.rename(columns={"DateTime": 'datetime', "ID": 'id'})

In [705]:
# Parse the timestamp column into pandas datetimes
tweets['datetime'] = tweets['datetime'].pipe(pd.to_datetime)

In [706]:
# Order the tweets chronologically
tweets = tweets.sort_values("datetime")

In [707]:
# Spot-check one parsed timestamp before the time-zone conversion
tweets['datetime'][1]

Timestamp('2018-09-20 13:39:08+0000', tz='UTC')

In [708]:
# Convert the tz-aware tweet timestamps to US/Eastern in one vectorized call
tweets['datetime'] = tweets['datetime'].dt.tz_convert('US/Eastern')

In [709]:
# Spot-check the same timestamp after the time-zone conversion
tweets['datetime'][1]

Timestamp('2018-09-20 09:39:08-0400', tz='US/Eastern')

In [710]:
# Split each timestamp into a calendar date and a time of day for the plot axes
tweets['date'], tweets['time'] = (
    tweets['datetime'].dt.date,
    tweets['datetime'].dt.time,
)

In [650]:
# Keep only the tweets posted after the earliest date in the schedule data
tweets = tweets.loc[tweets['datetime'].gt(leak['date'].min())]

In [651]:
# Order the tweets by time of day (the `time` column), regardless of date
tweets = tweets.sort_values('time')

In [652]:
# Replace the index with a fresh 0..n-1 range and discard the old labels
# (drop=True avoids materialising them as an 'index' column first)
tweets = tweets.reset_index(drop=True)

# Import Tweet Content

In [653]:
import json

In [654]:
# Load tweet content into a list of dictionaries, one JSON document per line.
# The `with` block closes the file handle once every line has been parsed.
with open('all_trump_tweets.json', 'r') as tweet_file:
    tweetcontent = [json.loads(line) for line in tweet_file]

In [655]:
#list(filter(lambda tweetcontent: tweetcontent['id'] == 1042830534513229825, tweetcontent))[0];

In [656]:
# Pull out tweet text
def find_tweettext(id):
    """Return the ``'text'`` field of the tweet whose ``'id'`` equals ``id``.

    Scans the module-level ``tweetcontent`` list and uses the first match.
    The text is returned exactly as stored; no line breaks or quotes are
    inserted.

    Raises
    ------
    IndexError
        If no entry in ``tweetcontent`` has that id.
    """
    matches = [tweet for tweet in tweetcontent if tweet['id'] == id]
    return matches[0]['text']

In [657]:
# Pull out tweet URL for reference
def find_tweeturl(id):
    """Return an HTML anchor for the first URL entity of tweet ``id``.

    Looks the tweet up in the module-level ``tweetcontent`` list and reads
    ``entities -> urls -> [0] -> url``.

    Returns
    -------
    str
        ``<a href="URL">URL</a>``, or ``''`` when that path is missing,
        empty, or not usable as a string.

    Raises
    ------
    IndexError
        If no entry in ``tweetcontent`` has that id.
    """
    x = [tweet for tweet in tweetcontent if tweet['id'] == id][0]
    try:
        link = x['entities']['urls'][0]['url']
        url = "<a href=\""+link+"\">"+link+"</a>"
    # Only the "no URL available" failures are treated as an empty result;
    # anything else (e.g. KeyboardInterrupt) propagates.
    except (KeyError, IndexError, TypeError):
        url = ''
    return url

In [658]:
# Derive a short client label from the tweet's 'source' field:
# take the last space-separated word, strip its trailing four characters
# (the closing '</a>'), and lowercase what is left
def find_tweetsrc(id):
    """Return the lowercased last word of the ``'source'`` field of tweet ``id``.

    The tweet is the first entry of the module-level ``tweetcontent`` list
    whose ``'id'`` equals ``id``; IndexError is raised when there is none.
    """
    record = [tweet for tweet in tweetcontent if tweet['id'] == id][0]
    last_word = record['source'].split(' ')[-1]
    return last_word[:-4].lower()

In [659]:
# Look up when a tweet was created
def find_tweetdate(id):
    """Return the ``'created_at'`` value of tweet ``id``.

    The tweet is the first entry of the module-level ``tweetcontent`` list
    whose ``'id'`` equals ``id``; IndexError is raised when there is none.
    """
    matching = [tweet for tweet in tweetcontent if tweet['id'] == id]
    return matching[0]['created_at']

In [660]:
def build_annotation(id):
    """Build the hover annotation for tweet ``id``.

    Returns the creation date, an HTML line break, the tweet text wrapped in
    double quotes, and a final line break. The text and date come from
    ``find_tweettext`` and ``find_tweetdate``.
    """
    text = find_tweettext(id)
    date = find_tweetdate(id)
    # Open the quotation as well as close it, so the quotes are balanced.
    annotation = date+"<br />\""+text+"\"<br />"
    return annotation

In [661]:
# Look up the text of every tweet by id
tweets['text'] = list(map(find_tweettext, tweets['id']))

In [662]:
# Look up the link anchor (or '') of every tweet by id
tweets['url'] = list(map(find_tweeturl, tweets['id']))

In [663]:
# Look up the client label of every tweet by id
tweets['src'] = list(map(find_tweetsrc, tweets['id']))

In [664]:
# Build the hover annotation of every tweet by id
tweets['annotation'] = list(map(build_annotation, tweets['id']))

# Heatmap (1-Trace)

In [711]:
# Parse the 'HH:MM:SS' column labels into datetime objects
schedule.columns = [
    datetime.strptime(label, '%H:%M:%S')
    for label in schedule.columns
]

In [712]:
# Turn the column labels back into their string form
schedule.columns = list(map(str, schedule.columns))

In [713]:
# Give every tweet the same placeholder date so that only the time of day
# differs between rows (works around the datetime alignment problem)
tweets['fakedate'] = "1900-01-01"

In [668]:
# Stringify the time-of-day values so they can be joined to the placeholder date
tweets['time'] = list(map(str, tweets['time']))

In [669]:
# Join placeholder date and real time of day into one 'date time' string
# (part of the axes alignment workaround)
tweets['faketime'] = (tweets['fakedate'] + " ") + tweets['time']

In [670]:
# Parse the placeholder 'date time' strings into datetimes for the x axis
tweets['faketime'] = tweets['faketime'].pipe(pd.to_datetime)

In [687]:
# Spot-check one placeholder timestamp
tweets['faketime'][1]

Timestamp('1900-01-01 00:52:04')

In [714]:
# Convert schedule time to datetime for axes
schedule.columns = pd.to_datetime(schedule.columns)
# `schedule_desc` is deliberately not converted here: on a top-to-bottom run it
# does not exist yet, and it is later built from `schedule`, whose columns are
# already datetimes by then.

In [672]:
# Split the tweets by client, using the `src` label of each row
iphone_tweets = tweets.loc[tweets['src'] == "iphone"]
# NOTE(review): named "android" but selects src == "studio" — confirm intent
android_tweets = tweets.loc[tweets['src'] == "studio"]
ipad_tweets = tweets.loc[tweets['src'] == "ipad"]

In [673]:
# Check how a chunk label (index 32 of `dts`) parses with the placeholder date
pd.to_datetime('1900-01-01 '+dts[32])

Timestamp('1900-01-01 08:00:00')

In [715]:
# Copy of the schedule in which values matching a key of `encodings`
# are replaced by the corresponding category name
schedule_desc = schedule.replace(to_replace=encodings)

In [680]:
def queryevent(time,day):
    """Print the activity recorded at a given hour on a given schedule day.

    Parameters
    ----------
    time : int
        Hour of the day. ``dts`` holds four 15-minute labels per hour, so the
        label at index ``4*time`` is the start of that hour.
    day : int
        Zero-based row position within ``schedule``.

    Reads the module-level ``schedule``, ``dts`` and ``encodings``; returns
    nothing and prints a single line.
    """
    codetime = pd.to_datetime('1900-01-01 '+dts[4*time])
    # .iloc makes `day` an explicit row position; plain [] with an integer on
    # the date-indexed column relies on a deprecated positional fallback.
    code = schedule[codetime].iloc[day]
    activity = encodings[code]
    date = schedule.index[day].strftime('%m-%d-%y')
    print("At ",time," on ",date," trump was doing ",activity)

In [681]:
# Example lookup: hour 10 on the schedule row at position 1
queryevent(10,1)

At  10  on  11-08-18  trump was doing  event


In [688]:
# Attach the US/Eastern zone to every placeholder tweet timestamp
tweets['faketime'] = list(map(timezone('US/Eastern').localize, tweets['faketime']))

In [716]:
# Attach the US/Eastern zone to every schedule column label
schedule.columns = list(map(timezone('US/Eastern').localize, schedule.columns))

In [717]:
# Spot-check one localized column label
schedule.columns[1]

Timestamp('1900-01-01 00:15:00-0456', tz='US/Eastern')

In [720]:
# Schedule heatmap: x = chunk columns, y = dates, z = the values of `schedule`
heatmap = go.Heatmap(
    name='Schedule',
    x=schedule.columns,
    y=schedule.index,
    z=schedule.values,
    text=schedule.columns,
    hoverinfo='text',
    colorscale='Viridis',
    xgap=0.5,
    ygap=0.5,
)


def _tweet_trace(frame, color, name):
    """Build a marker-only scatter trace for one group of tweets.

    Parameters
    ----------
    frame : DataFrame providing the 'faketime' (x), 'date' (y) and
        'annotation' (hover text) columns.
    color : marker fill colour.
    name : legend label for the trace.
    """
    return go.Scatter(
        x=frame['faketime'],
        y=frame['date'],
        text=frame['annotation'],
        hoverinfo='text',
        mode='markers',
        name=name,
        marker=dict(
            size=5,
            color=color,
            line=dict(
                color='white',
                width=1,
            ),
        ),
    )


# One trace per client. Each legend label carries the `src` value the frame
# was filtered on, so the three series can be told apart.
iphone = _tweet_trace(iphone_tweets, 'red', 'Tweets (iphone)')
android = _tweet_trace(android_tweets, 'orange', 'Tweets (studio)')
ipad = _tweet_trace(ipad_tweets, 'yellow', 'Tweets (ipad)')

# Trace list for the figure: the heatmap first, then the three tweet traces
data = [heatmap] + [iphone, android, ipad]

# Figure layout: title, x-axis label, fixed height, hover on the closest point
layout = go.Layout(
    height=800,
    title="Schedule",
    xaxis={'title': 'Time of Day'},
    hovermode='closest',
)

# Assemble the figure, show it inline, then write it to a standalone HTML file
fig = go.Figure(layout=layout, data=data)
py.offline.iplot(fig)
py.offline.plot(fig, filename='trump_schedule_tweet_overlay.html')



'file:///Users/janeadmin/Documents/trump_tweet_time/trump_schedule_tweet_overlay.html'