In [2]:
import pystache, plotly, json, random, sys, yaml, glob, os
import pandas as pd
import plotly.graph_objects as go

from google.cloud import bigquery
from google.oauth2 import service_account
from datetime import datetime


# Create Interactive Diffusion Graph for Retweets of the CDC that Trump also retweeted

1. Reads yaml configuration file from `/home/jupyter/data/www/covid19-static-pages/configs/cdc-trump.special`
2. Queries BigQuery for retweets, including extra user information.
3. Processes and produces simplified JSON output of retweets.
4. Reads the simplified JSON into Plotly and writes the JSON configurations for the Plotly graphs and the HTML files.

In [3]:
# Authenticate against BigQuery with the service-account key stored on the
# notebook host. `client` is the module-level handle the query function uses.
project_id = 'crypto-eon-164220'
credentials = service_account.Credentials.from_service_account_file(
    '/home/jupyter/covid-19-data/.credentials/google-connector.json'
)
client = bigquery.Client(project=project_id, credentials=credentials)

## 1. Get top retweets from BigQuery

In [4]:
def get_top_N_retweeted_tweets(table, N=25):  # default caps the query at 25 tweets for safety
    """Fetch every retweet of the N most-retweeted tweets in a BigQuery table.

    The inner sub-select groups ``crypto-eon-164220.tweets.<table>`` by
    ``retweeted_status.id``, counts the distinct retweets of each original
    tweet and keeps the ``N`` largest. The outer query joins those originals
    back to the table and returns one row per retweet id, carrying the
    original tweet's fields alongside the retweeting user's details.

    The query runs through the module-level ``client``; progress messages are
    written to stderr.

    Args:
        table: Table name inside the ``crypto-eon-164220.tweets`` dataset.
        N: How many original tweets to keep.

    Returns:
        The query result converted to a pandas DataFrame, ordered by
        ``orig_id``.
    """
    sql_template = """
SELECT
  id AS retweet_id,
  min(original_tweet_id) AS orig_id,
  min(tweet_text) AS orig_text,
  min(source) AS source,
  min(times_retweeted_) AS times_retweeted,
  min(original_author) AS orig_author,
  min(original_followers) as orig_followers_count,
  min(original_posted) AS orig_posted,
  min(user.screen_name) AS retweeter,
  min(user.id_str) AS retweeter_id,
  min(user.created_at) AS retweeter_join_date,
  min(user.followers_count) AS retweeter_followers_count,
  min(PARSE_TIMESTAMP('%a %b %d %T %z %Y', created_at)) AS retweet_timestamp
FROM
  `crypto-eon-164220.tweets.{TABLE}` tweets,
  (
  SELECT
    MIN(retweeted_status.id) AS original_tweet_id,
    MIN(retweeted_status.text) AS tweet_text,
    MIN(retweeted_status.user.screen_name) AS original_author,
    MIN(retweeted_status.user.followers_count) AS original_followers,
    MIN(PARSE_TIMESTAMP('%a %b %d %T %z %Y', retweeted_status.created_at)) AS original_posted,
    COUNT(DISTINCT(id)) AS times_retweeted_
  FROM
    `crypto-eon-164220.tweets.{TABLE}`
  WHERE
    retweeted_status IS NOT NULL
    AND retweeted_status.id >= (SELECT MIN(id) FROM `crypto-eon-164220.tweets.{TABLE}`)
  GROUP BY
    retweeted_status.id
  ORDER BY times_retweeted_ DESC
  LIMIT {N}
  ) topRetweets
WHERE
  topRetweets.original_tweet_id = tweets.retweeted_status.id
GROUP by id
order by orig_id
"""

    sys.stderr.write("Querying for top {} retweets in {}...".format(N, table))

    # Table name and limit are spliced into the SQL text, so both must come
    # from a trusted configuration.
    sql = sql_template.format(TABLE=table, N=N)
    query_job = client.query(sql)

    sys.stderr.write("done; creating dataframe\n")
    result_rows = query_job.result()
    return result_rows.to_dataframe()


In [5]:
def create_dataframe_for_plotly(df, fileName):
    """Build per-tweet diffusion series and dump them to a dated JSON file.

    For every original tweet in ``df`` (grouped by ``orig_id``) this creates a
    frame whose first row is the original tweet and whose remaining rows are
    its retweets ordered by ``retweet_id``. It then adds the running total of
    follower counts (``followers_count_cumsum``), the tweet text and the
    tweet's rank (``top_N``, 1 = most retweet rows in ``df``). Only tweets
    retweeted by ``realDonaldTrump`` or ``idontdonald`` are kept.

    The kept rows are written as a list of records to
    ``<fileName>_<YYYY-MM-DD>.json``, ordered by ``top_N``. If no tweet
    qualifies, an empty list is written (previously this raised ``KeyError``
    while sorting an empty frame).

    Args:
        df: One row per retweet with the columns produced by
            ``get_top_N_retweeted_tweets`` (``orig_id``, ``retweet_id``,
            ``orig_posted``, ``retweet_timestamp``, ...). The two timestamp
            columns must hold objects with an ``isoformat()`` method.
        fileName: Path prefix of the output file (date and ``.json`` are appended).

    Returns:
        None. The result is written to disk.
    """
    trump_accounts = {'idontdonald', 'realDonaldTrump'}

    # Query column -> simplified output column, in output order.
    column_names = {
        'retweet_id': 'id',
        'retweet_timestamp': 'created_at',
        'retweeter': 'username',
        'retweeter_followers_count': 'followers_count',
        'source': 'source',
        'retweeter_id': 'user_id',
        'retweeter_join_date': 'user_join_date',
    }

    # Rank lookup built once instead of a list.index() scan per tweet.
    rank_by_id = {
        orig_id: rank
        for rank, orig_id in enumerate(
            df.orig_id.value_counts(ascending=False).keys(), start=1)
    }

    kept = []
    for count, (original_id, retweets) in enumerate(df.groupby('orig_id'), start=1):
        topN = rank_by_id[original_id]  # The Top N tweet...
        print("Processing Tweet# {} - TopN [{}] - ID: {})".format(count, topN, original_id))

        sorted_retweets = retweets.sort_values(by='retweet_id')
        first = sorted_retweets.iloc[0]

        # Synthetic first row standing for the original tweet itself.
        original_tweet = pd.DataFrame([{
            'id': first.orig_id,
            'created_at': first.orig_posted,
            'username': first.orig_author,
            'followers_count': first.orig_followers_count
        }])

        # rename() returns a new frame, so nothing is assigned on a slice.
        retweet_rows = sorted_retweets[list(column_names)].rename(columns=column_names)

        series = pd.concat([original_tweet, retweet_rows]).reset_index(drop=True)
        series['followers_count_cumsum'] = series.followers_count.cumsum()
        series['text'] = first.orig_text
        series['top_N'] = topN
        series['created_at'] = series.created_at.apply(lambda ts: ts.isoformat())

        # Keep the tweet only if Trump or the Trump-retweeting bot is present.
        if trump_accounts & set(series.username):
            kept.append(series)

    if kept:
        # Stable sort keeps each tweet's rows in id order within its rank.
        records = (pd.concat(kept[::-1])
                   .sort_values(by='top_N', kind='mergesort')
                   .to_dict('records'))
    else:
        records = []

    current_datestamp = datetime.today().strftime('%Y-%m-%d')
    with open(fileName +"_"+current_datestamp+'.json','w') as f:
        json.dump(records, f)

In [6]:
def get_data(config):
    """Query BigQuery for the configured table and write the JSON dump.

    Reads ``config['table']``, ``config['data']`` and the optional
    ``config['topN']`` (10 is used when it is missing or falsy).
    """
    table_name = config['table']
    top_n = config.get("topN") or 10

    retweets = get_top_N_retweeted_tweets(table_name, N=top_n)
    create_dataframe_for_plotly(retweets, config['data'])


## 2. Create Plots

In [7]:
# Qualitative palette used to colour each tweet's trace; it is indexed with
# `top_N % len(COLORS)`, so colours repeat once the palette is exhausted.
COLORS = plotly.colors.qualitative.Alphabet

In [8]:
def view_colors():
    """Display the ``COLORS`` palette as a strip of swatches.

    The earlier version called ``sns.palplot``, but seaborn is never imported
    in this notebook, so it always raised ``NameError``. The preview is now
    drawn with plotly, which the notebook already imports. Each bar is one
    palette entry, labelled with its index, with the colour value on hover.
    """
    palette = list(COLORS)
    swatches = go.Figure(go.Bar(
        x=[str(i) for i in range(len(palette))],
        y=[1] * len(palette),
        marker=dict(color=palette),
        hovertext=palette,
        hoverinfo='text',
    ))
    swatches.update_layout(
        height=150,
        bargap=0,
        showlegend=False,
        margin=dict(t=10, r=10, l=10, b=30),
        yaxis=dict(visible=False),
        xaxis=dict(title='index into COLORS'),
    )
    swatches.show()

def normalize(values, desired_bounds):
    """Linearly rescale ``values`` into ``desired_bounds`` (2-decimal rounding).

    The smallest value maps to ``desired_bounds[0]`` and the largest to
    ``desired_bounds[1]``.

    Args:
        values: Iterable of numbers.
        desired_bounds: ``(low, high)`` pair for the output range.

    Returns:
        A list with one rescaled, rounded number per input value. An empty
        input gives ``[]``. If every value is identical there is no spread to
        scale, so each entry maps to the lower bound (this previously raised
        ``ZeroDivisionError``).
    """
    values = list(values)
    if not values:
        return []

    low, high = desired_bounds[0], desired_bounds[1]
    smallest, largest = min(values), max(values)
    spread = largest - smallest

    if spread == 0:
        return [round(low, 2) for _ in values]

    return [round(low + (v - smallest) * (high - low) / spread, 2) for v in values]

In [9]:
# Read the latest diffusion data...
def read_dataframe_from_file(CONFIG):
    """Load the most recent diffusion JSON dump into a DataFrame.

    Files matching ``CONFIG['data'] + "*.json"`` are sorted by name and the
    last one is read. Because the dumps carry a ``YYYY-MM-DD`` suffix, the
    last name is the newest dump. A ``timestamp`` column is added by parsing
    ``created_at`` and converting it to EST (Trump's timezone).

    Args:
        CONFIG: Mapping with a ``'data'`` key holding the dump path prefix.

    Returns:
        DataFrame of the dumped records plus the ``timestamp`` column.

    Raises:
        FileNotFoundError: No dump matches the prefix (was a bare ``IndexError``).
        ValueError: The newest dump holds no records (was an ``AttributeError``).
    """
    pattern = CONFIG['data'] + "*.json"
    candidates = sorted(glob.glob(pattern))
    if not candidates:
        raise FileNotFoundError("No diffusion data found matching " + pattern)

    #Get the latest file
    latest_file = candidates[-1]

    sys.stderr.write("Loading "+latest_file+"...")
    # Context manager so the handle is closed even if parsing fails.
    with open(latest_file, 'r') as f:
        to_plot = json.load(f)

    if not to_plot:
        raise ValueError("No retweets found in " + latest_file)

    df = pd.DataFrame(to_plot)

    #Convert all timestamps to EST (Trump's timezone)
    df['timestamp'] = df.created_at.apply(lambda t: pd.Timestamp(t).tz_convert('EST'))

    sys.stderr.write(("Read {:,} retweets\n".format(len(to_plot))))

    return df

In [10]:
def makeDateStringForName(d):
    """Format a timestamp as a short label such as ``Apr 25 08:01:56 (EST)``.

    ``d`` must provide ``month_name()`` plus ``day``, ``hour``, ``minute`` and
    ``second`` (a pandas ``Timestamp`` does). No timezone conversion happens
    here: the ``(EST)`` suffix is a fixed label, so callers are expected to
    pass a value already expressed in EST.
    """
    month_abbreviation = d.month_name()[0:3]
    clock_parts = (d.hour, d.minute, d.second)
    return "{} {} {:02d}:{:02d}:{:02d} (EST)".format(month_abbreviation, d.day, *clock_parts)

def calculate_self_retweets(input_df):
    """Discount self-retweets from the cumulative audience and collect tweet metadata.

    Works on a deep copy of ``input_df``. For each ``top_N`` value the rows
    are walked in ``id`` order and the first row is treated as the original
    tweet. When the original author retweets their own tweet, the smaller of
    the two follower counts is recorded as a discount and subtracted from
    ``followers_count_cumsum`` on that row and on every later row of the tweet.

    Each tweet also gets a metadata dict (text, author, rank, colour, id,
    self-retweet list, original time, plus ``'rank'``). ``'rank'`` is the
    formatted time at which ``realDonaldTrump`` retweeted it, or 5 seconds
    before the ``idontdonald`` retweet when Trump's own retweet is absent.

    Args:
        input_df: Frame with ``top_N``, ``id``, ``username``,
            ``followers_count``, ``followers_count_cumsum``, ``text`` and
            ``timestamp`` columns.

    Returns:
        ``(df, tweet_data)``: the adjusted copy and a dict of metadata keyed
        by the formatted Trump-retweet time.

    Raises:
        ValueError: A tweet has no row from ``realDonaldTrump`` or
            ``idontdonald`` (this used to surface as a bare ``IndexError``).
    """
    sys.stderr.write("Discounting self-retweets: [")
    df = input_df.copy(deep=True)

    tweet_data = {}

    for topN in df.top_N.unique():
        subValue = 0

        trump_retweet_time = None
        this_tweet_data = None

        tweet_rows = df[df.top_N == topN].sort_values(by='id')
        for position, (idx, row) in enumerate(tweet_rows.iterrows()):

            if position == 0:
                # Lowest id in the group: the original tweet.
                thisUser = row.username
                origFollowerCount = row.followers_count
                origTweetID = str(row.id)
                this_tweet_data = {
                    'text' : row.text,
                    'user' : thisUser,
                    'topN' : int(topN),
                    'color': COLORS[topN%len(COLORS)],
                    'id'   : origTweetID,
                    'self-rt' : [],
                    'time' : row.timestamp.isoformat()
                }

            elif row.username == thisUser:
                # Self-retweet: the author's audience was already counted once.
                # NOTE(review): subValue is overwritten rather than accumulated,
                # so after several self-retweets only the latest discount is
                # applied to later rows. Confirm this is intended.
                if row.followers_count > origFollowerCount:
                    subValue = origFollowerCount
                else:
                    subValue = row.followers_count
                this_tweet_data['self-rt'].append(
                    {'x':row.timestamp.isoformat(), 'subValue':subValue}
                )

            #Add Trump timestamp to the tweet data for the ranking...
            if row.username == 'realDonaldTrump':
                trump_retweet_time = makeDateStringForName( row.timestamp )

            # Once set, the discount stays in force for every later row.
            if subValue > 0:
                df.loc[idx,'followers_count_cumsum'] = row.followers_count_cumsum - subValue

        if trump_retweet_time is None:
            #Use 5 seconds prior to idontdonald
            bot_rows = df[df.top_N == topN].query('username=="idontdonald"')
            if bot_rows.empty:
                raise ValueError(
                    "Tweet with top_N={} has no retweet by realDonaldTrump or idontdonald".format(topN))
            print(bot_rows.timestamp)
            estimated = bot_rows.timestamp - pd.Timedelta(seconds=5)
            trump_retweet_time = makeDateStringForName( estimated.iloc[0] )
            print(trump_retweet_time)

        this_tweet_data['rank'] = trump_retweet_time

        # NOTE(review): keyed by a label with one-second resolution and no
        # year, so two tweets retweeted in the same second would overwrite
        # each other. Verify that cannot happen in the data.
        tweet_data[trump_retweet_time] = this_tweet_data

        sys.stderr.write("{},".format(topN))

    # Plain literal: the old `.format(topN)` raised NameError for an empty frame.
    sys.stderr.write("] done\n")

    return (df, tweet_data)

In [11]:
def custom_label(row):
    """Return the hover text for one row: ``"<username>: <followers_count> followers"``."""
    label_fields = (row.username, row.followers_count)
    return "{}: {} followers".format(*label_fields)
    

def buildPlotlyGraph(df, tweet_data):
    """Build the diffusion figure: one trace per tweet, audience over time.

    Each trace plots ``followers_count_cumsum`` against ``timestamp`` for one
    tweet, with markers sized by follower count scaled into 10-100 across
    the whole frame. Traces are named by the keys of ``tweet_data`` (the
    formatted time of Trump's retweet) and added in chronological order of
    those labels.

    Side effect: a ``globalScaledMarker`` column is added to ``df``.

    Args:
        df: Frame with ``top_N``, ``id``, ``text``, ``timestamp``,
            ``username``, ``followers_count`` and ``followers_count_cumsum``.
        tweet_data: Mapping of label -> metadata dict carrying a ``'topN'`` entry.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.

    Raises:
        ValueError: The first row of a tweet has no text. (The old code
            raised a plain string, which is itself a ``TypeError`` in Python 3.)
    """
    def chronological(label):
        # Labels look like "Apr 25 08:01:56 (EST)". Sorted as plain text,
        # "Apr 25" comes before "Apr 5", so parse them instead. The label has
        # no year; a leap year is supplied so "Feb 29" still parses. Labels
        # that do not parse sort after the parsed ones, by text.
        try:
            return (0, datetime.strptime("2020 " + label, "%Y %b %d %H:%M:%S (EST)"), label)
        except (TypeError, ValueError):
            return (1, datetime.min, label)

    sys.stderr.write("Plotting...")

    df['globalScaledMarker'] = normalize(list(df.followers_count), (10,100))

    fig = go.Figure()

    for tweetTimeStamp in sorted(tweet_data.keys(), key=chronological):
        topNidx = tweet_data[tweetTimeStamp].get('topN')

        plot_df = df[df.top_N==topNidx].sort_values(by='id')
        if len(plot_df) == 0:
            continue

        tweetText = plot_df.head(1).text.values[0]
        if pd.isna(tweetText):
            raise ValueError("No Tweet Text on First Entry")

        color = COLORS[topNidx%len(COLORS)]

        fig.add_trace(go.Scattergl(
            name = tweetTimeStamp, #rank
            x    = plot_df.timestamp,
            y    = plot_df.followers_count_cumsum,
            mode = 'markers+lines',
            marker = dict(
                size  = plot_df.globalScaledMarker,
                color = color,
                opacity = 0.5,
                line=dict(
                    color='white',
                    width=0.4
                ),
            ),
            line=dict(
                color=color,
                width=0.75,
            ),
            hovertemplate ='%{x} - %{text}',
            text = list(plot_df.apply(custom_label, axis=1)),
            meta={'u':plot_df.username,
                  'f':plot_df.followers_count },
            showlegend = True
        ))
        sys.stderr.write(".")

    fig.update_layout(
        autosize=False,
        width=1400,
        height=600,
        margin=dict(
            t=1,r=50,l=1,b=1
        ),
        legend=dict(
            x=1,
            y=1,
            traceorder="normal",
            font=dict(
                family="sans-serif",
                size=12,
                color="black"
            ),
        ),
        yaxis_title="Potential Audience Exposure",)

    sys.stderr.write("] Done\n")
    return fig

In [12]:
def buildSingleStaticPlotlyPage(CONFIG):
    """Render the Plotly page template with ``CONFIG`` and write the HTML page.

    The template is read from
    ``STATIC_PAGES_ROOT/templates/plotly_js_template.html``, rendered with
    pystache using ``CONFIG`` as the context, and written to
    ``STATIC_PAGES_ROOT/<CONFIG['output']>``.

    Args:
        CONFIG: Template context; must contain an ``'output'`` key holding
            the page path relative to ``STATIC_PAGES_ROOT``.
    """
    sys.stderr.write("Writing HTML... ")

    # Context manager so the template handle is closed (it used to leak).
    with open(STATIC_PAGES_ROOT + '/templates/plotly_js_template.html') as templateFile:
        main_template = templateFile.read()

    # Render before opening the destination, so a rendering failure does not
    # leave an already-truncated page behind.
    rendered_page = pystache.render(main_template, CONFIG)

    with open(STATIC_PAGES_ROOT + "/" + CONFIG['output'],'w') as outFile:
        outFile.write(rendered_page)
    sys.stderr.write(" view at: http://epic.tweetsonamap.com/covid19-static-pages/"+CONFIG['output']+"\n")

<br><br><hr><br><br>

# Runtime


In [13]:
# Root directory of the static site. The page template is read from
# <root>/templates, figure JSON is written under <root>/docs, and rendered
# pages go to <root>/<config 'output'>.
STATIC_PAGES_ROOT = '/home/jupyter/data/www/covid19-static-pages'

In [14]:
def full_run(yaml_config, query=True, plot=True, write_html=True):
    """Run the pipeline for one YAML page configuration.

    Args:
        yaml_config: Path to the YAML configuration file.
        query: When true, query BigQuery and write a fresh JSON dump.
        plot: When true, load the newest dump, discount self-retweets, build
            the figure and write its JSON (with the tweet metadata under
            ``'tweets'``) to ``STATIC_PAGES_ROOT/docs/<config['JSON']>``.
        write_html: When true, render the static HTML page.

    Returns:
        The configuration dict. When ``plot`` is true it also holds ``'df'``,
        ``'no_self_retweets'``, ``'tweets'`` and ``'fig'``.
    """
    print("Building page for: {}".format(yaml_config))

    # NOTE: yaml.FullLoader can construct more than plain data types; only
    # point this at configuration files you trust.
    with open(yaml_config,'r') as configFile:
        config = yaml.load(configFile, Loader=yaml.FullLoader)

    if query:
        get_data(config)

    if plot:
        config['df'] = read_dataframe_from_file(config)

        config['no_self_retweets'], config['tweets'] = calculate_self_retweets(config['df'] )

        config['fig'] = buildPlotlyGraph(config['no_self_retweets'], config['tweets'])

        # Round-trip through plotly's serializer to get plain JSON data, then
        # attach the tweet metadata alongside the figure.
        figJSON = json.loads(plotly.io.to_json(config['fig']))
        figJSON['tweets'] = config['tweets']

        with open(STATIC_PAGES_ROOT +"/docs/"+ config['JSON'],'w') as outFile:
            json.dump(figJSON, outFile)

    if write_html:
        buildSingleStaticPlotlyPage(config)

    return config

In [15]:
# Do some testing?
# full_run('/home/jupyter/data/www/covid19-static-pages/configs/covid-maps.yaml')

# query=False skips the BigQuery step and re-plots from the newest JSON dump
# already on disk. The returned config dict is kept as `x` for the export cells.
x = full_run('/home/jupyter/data/www/covid19-static-pages/configs/cdc-trump.special', query=False, plot=True)


Building page for: /home/jupyter/data/www/covid19-static-pages/configs/cdc-trump.special


Querying for top 25 retweets in cdc_userstreams...done; creating dataframe


Processing Tweet# 1 - TopN [3] - ID: 1252632573798473731)
Processing Tweet# 2 - TopN [11] - ID: 1252993080174796800)
Processing Tweet# 3 - TopN [8] - ID: 1253085285161803777)
Processing Tweet# 4 - TopN [6] - ID: 1253341609443250176)
Processing Tweet# 5 - TopN [15] - ID: 1253374836317044736)
Processing Tweet# 6 - TopN [2] - ID: 1253422154256756738)
Processing Tweet# 7 - TopN [13] - ID: 1253468948068290560)
Processing Tweet# 8 - TopN [14] - ID: 1253705772569067520)
Processing Tweet# 9 - TopN [1] - ID: 1253742258853199872)
Processing Tweet# 10 - TopN [7] - ID: 1253793094841090056)
Processing Tweet# 11 - TopN [10] - ID: 1253818998942314496)
Processing Tweet# 12 - TopN [18] - ID: 1254129055219298308)
Processing Tweet# 13 - TopN [17] - ID: 1254857606859898881)
Processing Tweet# 14 - TopN [22] - ID: 1255971941019762694)
Processing Tweet# 15 - TopN [5] - ID: 1256309675269615616)
Processing Tweet# 16 - TopN [24] - ID: 1256655451195715585)
Processing Tweet# 17 - TopN [16] - ID: 12610443726211891

Loading /home/jupyter/data/diffusion/cdc_userstreams_with_trump_2020-06-17.json...Read 44,256 retweets
Discounting self-retweets: [

4315   2020-04-25 08:02:01-05:00
Name: timestamp, dtype: datetime64[ns, EST]
Apr 25 08:01:56 (EST)


1,2,3,4,5,6,7,8,10,11,12,13,14,15,] done
Plotting.................] Done
Writing HTML...  view at: http://epic.tweetsonamap.com/covid19-static-pages/docs/cdc-trump.html


In [48]:
# Order the self-retweet-adjusted rows by tweet rank and then tweet id, and
# index them by tweet id.
y = x['no_self_retweets'].sort_values(by=['top_N','id']).set_index('id')

In [49]:
# Preview the first rows of the re-indexed frame.
y.head()

Unnamed: 0_level_0,created_at,username,followers_count,source,user_id,user_join_date,followers_count_cumsum,text,top_N,timestamp,globalScaledMarker,time_trump_retweeted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1253742258853199872,2020-04-24T17:47:02+00:00,CDCgov,2615841,,,,2615841,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:02-05:00,12.93,Apr 25 08:01:56 (EST)
1253742282479673345,2020-04-24T17:47:08+00:00,monitor_PH,1057,"<a href=""https://twitter.com/pubhealth_watch"" ...",8.699671e+17,Wed May 31 17:21:27 +0000 2017,2616898,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:08-05:00,10.0,Apr 25 08:01:56 (EST)
1253742322464022528,2020-04-24T17:47:17+00:00,WarnckeMelissa,17,"<a href=""http://twitter.com/download/iphone"" r...",1.144984e+18,Sat Jun 29 14:59:23 +0000 2019,2616915,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:17-05:00,10.0,Apr 25 08:01:56 (EST)
1253742324145889281,2020-04-24T17:47:18+00:00,tedbrunson,291,"<a href=""http://twitter.com/download/iphone"" r...",22101220.0,Fri Feb 27 03:19:03 +0000 2009,2617206,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:18-05:00,10.0,Apr 25 08:01:56 (EST)
1253742325215395841,2020-04-24T17:47:18+00:00,HBMS_Warriors,619,"<a href=""https://mobile.twitter.com"" rel=""nofo...",703612800.0,Wed Jul 18 19:05:22 +0000 2012,2617825,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:18-05:00,10.0,Apr 25 08:01:56 (EST)


In [55]:
def int_or_none(val):
    """Return ``val`` as an integer string, or ``None`` if it is not integer-like.

    Only the errors ``int()`` raises for bad input are caught: ``TypeError``
    (e.g. ``None``), ``ValueError`` (NaN, non-numeric text) and
    ``OverflowError`` (infinity). The earlier bare ``except:`` also swallowed
    ``KeyboardInterrupt`` and ``SystemExit``.

    NOTE(review): a value that arrives as a float has already lost precision
    above 2**53, so very large ids may not round-trip exactly. Confirm against
    the source data.
    """
    try:
        return str(int(val))
    except (TypeError, ValueError, OverflowError):
        return None

def set_type(val):
    """Classify a row from its user id: missing means ``'original tweet'``, otherwise ``'retweet'``."""
    return 'original tweet' if pd.isna(val) else 'retweet'

# Tweet rank (topN) -> formatted time at which Trump retweeted that tweet.
lookup = {tweet['topN']: tweet['rank'] for tweet in x['tweets'].values()}

y['time_trump_retweeted'] = y['top_N'].apply(lambda rank: lookup[rank])
# Derive the row type before user_id is rewritten: only the original-tweet
# row has a missing user_id.
y['type'] = y['user_id'].apply(set_type)
y['user_id'] = y['user_id'].apply(int_or_none)

y.head(2)

Unnamed: 0_level_0,created_at,username,followers_count,source,user_id,user_join_date,followers_count_cumsum,text,top_N,timestamp,globalScaledMarker,time_trump_retweeted,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1253742258853199872,2020-04-24T17:47:02+00:00,CDCgov,2615841,,,,2615841,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:02-05:00,12.93,Apr 25 08:01:56 (EST),original tweet
1253742282479673345,2020-04-24T17:47:08+00:00,monitor_PH,1057,"<a href=""https://twitter.com/pubhealth_watch"" ...",8.699670709848513e+17,Wed May 31 17:21:27 +0000 2017,2616898,Household cleaners and disinfectants can cause...,1,2020-04-24 12:47:08-05:00,10.0,Apr 25 08:01:56 (EST),retweet


In [57]:
# Okay, ready to save some data...
# Columns kept for the export, in output order.
export_columns = [
    'created_at',
    'username',
    'followers_count',
    'source',
    'user_id',
    'user_join_date',
    'text',
    'top_N',
    'time_trump_retweeted',
    'type',
]
complete_dataframe = pd.DataFrame(y[export_columns])
complete_dataframe.head(2)

Unnamed: 0_level_0,created_at,username,followers_count,source,user_id,user_join_date,text,top_N,time_trump_retweeted,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1253742258853199872,2020-04-24T17:47:02+00:00,CDCgov,2615841,,,,Household cleaners and disinfectants can cause...,1,Apr 25 08:01:56 (EST),original tweet
1253742282479673345,2020-04-24T17:47:08+00:00,monitor_PH,1057,"<a href=""https://twitter.com/pubhealth_watch"" ...",8.699670709848513e+17,Wed May 31 17:21:27 +0000 2017,Household cleaners and disinfectants can cause...,1,Apr 25 08:01:56 (EST),retweet


In [58]:
# Write through a context manager so the handle is flushed and closed; the
# inline open() left that to garbage collection.
with open('cdc_retweeted_by_trump.csv','w') as csvFile:
    complete_dataframe.to_csv(csvFile)