In [4]:
### Functionalize code from experimentation notebook

### Date: Feb 7, 2023
### Author: Jonathan Chan

### Notes: 
### TEAM TWITTER HANDLES FROM: https://ourworldindata.org/team

In [5]:
#General
import re
import json
import numpy as np
import pandas as pd

#for data collection - webscraping
import requests
from bs4 import BeautifulSoup
import re

#handling plots
import matplotlib.pyplot as plt

#for network creation
import networkx as nx

#for data collection - tweepy API 
import tweepy
import twitter_credentials #Twitter credentials stored in a file: twitter_credentials.py

consumer_key = twitter_credentials.CONSUMER_KEY
consumer_secret = twitter_credentials.CONSUMER_SECRET 
access_token = twitter_credentials.ACCESS_TOKEN
access_token_secret = twitter_credentials.ACCESS_SECRET

Goal: to scrape x number of tweets from each OWID team member and create a) a network map, b) a bigrams 

The following code will perform the following steps with one or more functions for each

### Steps:

1. Collect OWID team Twitter handles from the website - 
    output file: owid_team_info.csv (dataframe with columns: name, title, team, twitter_username, other_links
    
2. Iterate through Twitter handles, collect 250 tweets
    output: store as raw_tweets_DATE.json
    assume twitter developer credentials are set up already - imported as twitter_credentials.py
    
3. From raw_tweets_DATE.csv,
    sub function: from json, get list of tuples in (user, interaction) format
    sub function: from list of tuples, get network graph

In [23]:
## STEP 1:

def get_team_info(in_url="https://ourworldindata.org/team"):
    """
    Returns dataframe containing the info for each OWID team member. 
        columns:
            name: name of team member (str)
            title: job position name of team member (str)
            twitter_username: twitter username (str - no @ symbol) OR None 
            other_links: other links found at bottom of each member's section (list of str)
            
    input: 
        in_url: string containing the url for the OWID team page
    Notes:
        - Assume format of in_url matches webpage format of "https://ourworldindata.org/team"
        as of Feb 6, 2023.
        - Unless otherwise indicated, String values are scraped
        as they appear on the OWID team site - includes honorifics and capitalization 
    """
    response=requests.get(in_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    tm_list= []
    
    #iterate through div element that contain info for each teammate (class name: wp-block-media-text__content)
    for tm_div in soup.find_all(class_="wp-block-media-text__content"):

        tm_name = tm_div.find("h4").text #name in h4 tag
        tm_title = tm_div.find("h5").text #title in h5 tag
        tm_twitter=None #variable to store twitter handle as string if it exists
        tm_other = [] #list to write non-twitter links to
        
        for p in tm_div.find_all("p"):
            links_list = p.find_all("a", href=True) 
            for link in links_list:
                if 'twitter' in link['href']:
                    tm_twitter = link['href'].split("/")[-1] #split by '/' and take last item to get twitter handle
                    tm_twitter = tm_twitter.replace("@", "") #remove @ symbol - easier to pass to Tweepy module
                else:
                    tm_other.append(link['href']) #add non-twitter links as 'other_links' column

        tm_dict = {
            "name": tm_name,
            "title": tm_title,
            "twitter_username": tm_twitter,
            "other_links": tm_other
        }
        tm_list.append(tm_dict) 
    
    #MANUAL CHECKS - spot check 
    assert len(team_info) == 26
    assert "Max Roser" in team_info[0]['name']
    assert "Jason Crawford" in team_info[-1]['name']
    
    return pd.DataFrame(tm_list)

url = "https://ourworldindata.org/team"

# team_df = get_team_info(url)
team_df = get_team_info()

In [24]:
team_df.head()

Unnamed: 0,name,title,twitter_username,other_links
0,Dr. Max Roser,Founder and Director,MaxCRoser,[https://ourworldindata.org/history-of-our-wor...
1,Dr. Esteban Ortiz-Ospina,Head of Strategy and Operations,eortizospina,"[https://global-change-data-lab.org/, mailto:e..."
2,Dr. Hannah Ritchie,Deputy Editor and Science Outreach Lead,_HannahRitchie,[mailto:hannah@ourworldindata.org]
3,Edouard Mathieu,Head of Data and Research,redouad,[]
4,Dr. Lars Yencken,Head of Engineering,larsyencken,[mailto:lars@ourworldindata.org]


In [46]:
## STEP 2: connect to API, intake team_df, 
#return json containing all tweets, as well as list of no twitter data available 

#SUBFUNCTION - input twtter handle, number of tweets
#FUNCTION - creates the JSON 

def get_user_tweets(username="OurWorldInData", num_tweets=100):
    """Returns a list of raw tweet JSON items based on a username
    
    
    Note: 
        REFACTOR: handle queries of tweets more than tweepy limit for a specific
        Assume consumer_key, consumer secret are defined before calling function
        
        please consult developer docs to ensure that your Twitter Developer account
        can access the number of tweets you are interested in per month
        https://developer.twitter.com/en/support/twitter-api/developer-account
    """
    #authenticate twitter credentials
    auth = tweepy.OAuth2AppHandler(
    consumer_key, consumer_secret
    )
    api = tweepy.API(auth)
    
    #collect num_tweets
    all_tweets = []
    tweets_extended = api.user_timeline(id=curr_user, 
                                        tweet_mode='extended', 
                                        count=num_tweets)
    all_tweets = all_tweets + tweets_extended
    assert len(all_tweets) == num_tweets
    print("COLLECTED: {} TWEETS FROM {}".format(str(num_tweets), username))
    return all_tweets

In [47]:
get_user_tweets('MaxCRoser', num_tweets=5)


Unexpected parameter: id


COLLECTED 5 TWEETS FROM MaxCRoser


[Status(_api=<tweepy.api.API object at 0x7f80cab9d460>, _json={'created_at': 'Tue Feb 07 13:57:28 +0000 2023', 'id': 1622957721078140928, 'id_str': '1622957721078140928', 'full_text': 'What do experts in artificial intelligence expect for the future? \n\nA new article in which we bring together a range of sources on AI timelines.\nThis chart shows the responses from 812 AI experts who were surveyed in three studies.\n\n→ Here is the article https://t.co/CWWVQTJL41 https://t.co/2HupwDtxl0', 'truncated': False, 'display_text_range': [0, 278], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/CWWVQTJL41', 'expanded_url': 'https://ourworldindata.org/ai-timelines', 'display_url': 'ourworldindata.org/ai-timelines', 'indices': [255, 278]}], 'media': [{'id': 1622957269192146944, 'id_str': '1622957269192146944', 'indices': [279, 302], 'media_url': 'http://pbs.twimg.com/media/FoXnCZvWAAANqwy.jpg', 'media_url_https': 'https://pbs.twimg.com/media/FoXnC

AttributeError: 'API' object has no attribute 'get_users_followers'