In [1]:
### Functionalize code from experimentation notebook
### WORKING: functions for API call to collect tweets, create full tweet df (100 tweets for all )

### Date: Feb 9, 2023
### Author: Jonathan Chan

#to do - MAIN FN, get twitter df w 100 per user, make network graph using filtered rows for each user, 

### Notes: 
### TEAM TWITTER HANDLES FROM: https://ourworldindata.org/team

In [2]:
#General
import re
import json
import numpy as np
import pandas as pd

#for data collection - webscraping
import requests
from bs4 import BeautifulSoup
import re

#handling plots
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, text

#for network creation
import networkx as nx

#for data collection - tweepy API 
import tweepy
import twitter_credentials #Twitter credentials stored in a file: twitter_credentials.py

consumer_key = twitter_credentials.CONSUMER_KEY
consumer_secret = twitter_credentials.CONSUMER_SECRET 
access_token = twitter_credentials.ACCESS_TOKEN
access_token_secret = twitter_credentials.ACCESS_SECRET

Goal: to scrape x number of tweets from each OWID team member and create a) a network map, b) a bigrams 

The following code will perform the following steps with one or more functions for each

### Steps:

1. Collect OWID team Twitter handles from the website - 
    output file: owid_team_info.csv (dataframe with columns: name, title, team, twitter_username, other_links
    
2. Iterate through Twitter handles, collect 250 tweets
    Sub function: collect x number of tweets when provided x and twitter handle
    output: store as full_team_tweets.csv
    assume twitter developer credentials are set up already - imported as twitter_credentials.py
    
3. Create Network graph images
    Subfunction: create a network graph image from a list of users in full_team_tweets.csv
       (should work for single user when passing list w single iteM)
    Output: image files for each of the twitter users 

In [3]:
## STEP 1 FUNCTION DECLARATION - scrape https://ourworldindata.org/team for OWID team info

def get_team_info(in_url="https://ourworldindata.org/team"):
    """
    Returns dataframe containing the info for each OWID team member. 
        columns:
            name: name of team member (str)
            title: job position name of team member (str)
            twitter_username: twitter username (str - no @ symbol) OR None 
            other_links: other links found at bottom of each member's section (list of str)
            
    input: 
        in_url: string containing the url for the OWID team page
    Notes:
        - Assume format of in_url matches webpage format of "https://ourworldindata.org/team"
        as of Feb 6, 2023.
        - Unless otherwise indicated, String values are scraped
        as they appear on the OWID team site - includes honorifics and capitalization 
    """
    response=requests.get(in_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    team_info= []
    
    #iterate through div element that contain info for each teammate (class name: wp-block-media-text__content)
    for tm_div in soup.find_all(class_="wp-block-media-text__content"):

        tm_name = tm_div.find("h4").text #name in h4 tag
        tm_title = tm_div.find("h5").text #title in h5 tag
        tm_twitter=None #variable to store twitter handle as string if it exists
        tm_other = [] #list to write non-twitter links to
        
        for p in tm_div.find_all("p"):
            links_list = p.find_all("a", href=True) 
            for link in links_list:
                if 'twitter' in link['href']:
                    tm_twitter = link['href'].split("/")[-1] #split by '/' and take last item to get twitter handle
                    tm_twitter = tm_twitter.replace("@", "") #remove @ symbol - easier to pass to Tweepy module
                else:
                    tm_other.append(link['href']) #add non-twitter links as 'other_links' column

        tm_dict = {
            "name": tm_name,
            "title": tm_title,
            "twitter_username": tm_twitter,
            "other_links": tm_other
        }
        team_info.append(tm_dict) 
    
    #MANUAL CHECKS - spot check 
    assert len(team_info) == 26
    assert "Max Roser" in team_info[0]['name']
    assert "Jason Crawford" in team_info[-1]['name']
    
    return pd.DataFrame(team_info)





In [4]:
## STEP 2: return dataframe containing all tweets for users with a twitter account
#follows format returned in get_team_info(), get_tweet_info()
#SUBFUNCTION - input twtter handle, number of tweets, export raw twitter_data

def get_user_tweets(curr_user="OurWorldInData", num_tweets=100):
    """Returns a list of raw tweet JSON items based on a username
    
    
    Note: 
        Only tested on num_tweets below 100
        REFACTOR: handle queries of tweets more than tweepy limit for a specific
        Assume consumer_key, consumer secret are defined before calling function
        
        please consult developer docs to ensure that your Twitter Developer account
        can access the number of tweets you are interested in per month
        https://developer.twitter.com/en/support/twitter-api/developer-account
    """
    #authenticate twitter credentials
    auth = tweepy.OAuth2AppHandler(consumer_key,
                                   consumer_secret
                                  )
    #add parser to avoid error of non-serializable data: 
    #https://github.com/tweepy/tweepy/issues/1102
    api = tweepy.API(auth, 
                     parser=tweepy.parsers.JSONParser() 
                    )
    #collect num_tweets
    all_tweets = []
    tweets_extended = api.user_timeline(screen_name=curr_user, #replaced id=curr_user to remove warning in jupyter notebook
                                        tweet_mode='extended', 
                                        count=num_tweets
                                       )
    all_tweets = all_tweets + tweets_extended

    return all_tweets


def get_tweets_df(team_list, tweets_per_tm=5):
    """Return a dataframe where each row is a tweet from usernames in team_list. Number of tweets collected is defined
    as tweet_per_tm (default=100). Final dataframe includes tweet date, id, full text, rt/fav count, hashtag list, and list of 
    accounts that the tweet interacted with (mentions, replies, retweets) stored as tuples (tweet poster, interacted account)
    
    input:
        team_list: list of twitter handles of team members to collect tweets from(list of str)
        tweets_per_tm: number of tweets to collect per team member (int)
    
    Notes:
        -Interactions are stored in raw tweet data collected from get_user_tweets() - in tweet['entities']['user_mentions']
        -Retweet and favourite counts only consider the tweet from the team member - does not include retweets/likes on the original tweet
        Refactor: nested for loops
    """
    tweet_list = []
    all_ints = []
    for team_member in team_list:
        curr_tweet_count = 0 #counts the number of tweets actually collected
        tm_tweets = get_user_tweets(team_member, tweets_per_tm)
        for tweet in tm_tweets:
            tweet_ints=[]   
            user_id = tweet['user']['id_str']
            user_name = tweet['user']['screen_name']
            interacted_names = [x['screen_name']for x in tweet['entities']['user_mentions']]
            interacted_ids = [x['id_str'] for x in tweet['entities']['user_mentions']]
            #loop through interacted_names to get tuple of each team member and who they interacted with from this tweet
            for interacted in interacted_names:
                int_tuple = tuple((user_name, interacted))
                tweet_ints.append(int_tuple)
                all_ints.append(int_tuple)
            #Write raw values from tweet data, and write interaction tuple list in interected_tuples
            tweet_dict = {
                "tweet_date": tweet['created_at'],
                "tweet_id": tweet['id_str'],
                "full_text": tweet['full_text'],
                "retweet_count": tweet['retweet_count'],
                "fav_count": tweet['favorite_count'],
                "hashtags": tweet['entities']['hashtags'],
                "user_id": tweet['user']['id_str'], 
                "user_name": tweet['user']['screen_name'],
                "interacted_tuples": tweet_ints
            }
            tweet_list.append(tweet_dict)
            curr_tweet_count += 1
        print("TWEETS COLLECTED FROM {}: {}".format(team_member, str(curr_tweet_count)))    
    return pd.DataFrame(tweet_list)

In [6]:
### STEP 3 FUNCTION - function to create network graphs in matplotlib 
def create_nx_graph(in_df):
    """Returns a network graph showing what accounds that users in in_df['user_name'] interacted with on Twitter
    input:
        in_df: dataframe that includes any users in in_df['user_name'] to make graph of interactions for
    Notes:
    - Orange will be used for any nodes (twitter accounts) that are part of the OWID team, even if they are an interacted account
    instead of a use that get_user_tweets was called on
    
    """
    graph = nx.Graph()
    plot_title = "Twitter Interactions for username: " + ", ".join(in_df['user_name'].unique().tolist())
    all_interactions = []
    for i, row, in in_df.iterrows():
        if len(row['interacted_tuples']) > 0:
            for interaction in row['interacted_tuples']:
                all_interactions.append(interaction)
                graph.add_edge(interaction[0], interaction[1])
    #get the degree for each node, then get the values
    deg = nx.degree(graph)
    degree_values = [v for k, v in deg]
    node_size = [v * 100 for v in degree_values] #multiply by 100 to show properly 
    position = nx.fruchterman_reingold_layout(graph)
    color_map = []
    for node in graph:
        if node in tweets_df['user_name'].unique():
            color_map.append('#dda63a')
        else:
            color_map.append('#26bde2')

    fig = plt.figure(3,figsize=(30,30))
    plt.title(plot_title,fontdict={'fontsize':29})
    nx.draw_networkx(graph, pos=position, node_color=color_map,
                     with_labels=True, node_size=node_size,
            font_weight='bold')
    plt.close()
    return fig


In [7]:
#STEP 1 - scrape team info from team page using get_team_info()
#declare lists to show team members with/without twitter account 

team_df_filename = "output/team_info.csv"
team_df = get_team_info()

#Get list of accounts that have twitter, do not have twitter
team_twittered = [x for x in team_df['twitter_username'] if x]
team_twitterless = [x['name'] for i,x in team_df.iterrows() if not x['twitter_username']]

team_df.to_csv(team_df_filename)
print("Saved team_df to filename:", team_df_filename)

Saved team_df to filename: output/team_info.csv


In [8]:
#STEP 2 - get tweets from Twitter API, save to tweets dataframe
tweets_df_filename = "output/full_team_tweets.csv"
tweets_df = get_tweets_df(team_twittered, 100)
tweets_df.to_csv(tweets_df_filename)
print("Saved tweets_df to filename:", team_df_filename)

TWEETS COLLECTED FROM MaxCRoser: 100
TWEETS COLLECTED FROM eortizospina: 100
TWEETS COLLECTED FROM _HannahRitchie: 99
TWEETS COLLECTED FROM redouad: 100
TWEETS COLLECTED FROM larsyencken: 100
TWEETS COLLECTED FROM mathisonian: 100
TWEETS COLLECTED FROM nat_ahuja: 100
TWEETS COLLECTED FROM parriagadap: 100
TWEETS COLLECTED FROM danyx23: 100
TWEETS COLLECTED FROM salonium: 65
TWEETS COLLECTED FROM MarcelGerber9: 100
TWEETS COLLECTED FROM charliegiattino: 99
TWEETS COLLECTED FROM JoeHasell: 100
TWEETS COLLECTED FROM bbherre: 100
TWEETS COLLECTED FROM bnjmacdonald: 100
TWEETS COLLECTED FROM lucasrodesg: 100
TWEETS COLLECTED FROM mallika_snyder: 65
TWEETS COLLECTED FROM f_spooner: 98
TWEETS COLLECTED FROM jasoncrawford: 100
Saved tweets_df to filename: output/team_info.csv


In [9]:
##STEP 3 - rite network graph images - loop through each name in tweets_df, 
#create a network graph image for that users interactions, 
#save to outputs/images

fig_list = []
export_folder = "output/images/"
for i, team_member in enumerate(tweets_df['user_name'].unique()):
    filtered_df = tweets_df.loc[tweets_df['user_name']==team_member]
    tm_fig = create_nx_graph(filtered_df)
    fig_list.append(tm_fig)
    
for i, team_member in enumerate(tweets_df['user_name'].unique()):
    fig = fig_list[i]
    if len(str(i)) < 2:
        out_name = export_folder + "plot_0" + str(i) + "_" + team_member + ".png"
    else:
        out_name = export_folder + "plot_" + str(i) + "_" + team_member + ".png"
    fig.savefig(out_name)
    print("SAVED IMAGE:", out_name)
    print("----")

SAVED IMAGE: output/images/plot_00_MaxCRoser.png
----
SAVED IMAGE: output/images/plot_01_EOrtizOspina.png
----
SAVED IMAGE: output/images/plot_02__HannahRitchie.png
----
SAVED IMAGE: output/images/plot_03_redouad.png
----
SAVED IMAGE: output/images/plot_04_larsyencken.png
----
SAVED IMAGE: output/images/plot_05_mathisonian.png
----
SAVED IMAGE: output/images/plot_06_nat_ahuja.png
----
SAVED IMAGE: output/images/plot_07_parriagadap.png
----
SAVED IMAGE: output/images/plot_08_DanyX23.png
----
SAVED IMAGE: output/images/plot_09_salonium.png
----
SAVED IMAGE: output/images/plot_10_MarcelGerber9.png
----
SAVED IMAGE: output/images/plot_11_charliegiattino.png
----
SAVED IMAGE: output/images/plot_12_JoeHasell.png
----
SAVED IMAGE: output/images/plot_13_bbherre.png
----
SAVED IMAGE: output/images/plot_14_bnjmacdonald.png
----
SAVED IMAGE: output/images/plot_15_lucasrodesg.png
----
SAVED IMAGE: output/images/plot_16_mallika_snyder.png
----
SAVED IMAGE: output/images/plot_17_f_spooner.png
----
S

In [11]:
#Print out names of members that were not included in analysis
#do not have twitter handle that was scrapable on teams page: https://ourworldindata.org/team

print("NO TWITTER AVAILABLE: ", team_twitterless)

NO TWITTER AVAILABLE:  ['Matthieu Bergel', 'Marwa Boukarim', 'Natalie Reynolds-Garcia', 'Valerie Rogers Muigai', 'Dr. Pablo Rosado', 'Ike Saunders', 'Mojmir Vinkler']
