In [1]:
## imports
import pandas as pd
import numpy as np
import re
import requests
import yaml
import tweepy
import pickle
import plotnine
from plotnine import *
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', None)

## function to load credentials yaml
def load_creds(path: str):
    with open(path, 'r') as stream:
        try:
            creds = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return(creds)

# Setup: authenticate to Twitter API

In [2]:
## read creds file
creds = load_creds("../my_cred_JH.yaml")

## use bearer token to initialize a tweepy client (connection with twitter API)
client = tweepy.Client(bearer_token= creds['twitter_api']['bearer_token'])

## print type
print(type(client))

<class 'tweepy.client.Client'>


# Activity

1. Choose a public user (e.g., a politician; celebrity) and pull 100 tweets from their timeline and metadata about those tweets. When pulling metadata, make sure to get the conversation_id and count of replies (latter is in public_metrics)
2. Choose one of their tweets to focus on that got a lot of replies and get the conversation_id of that tweet
3. Paste the conversation id of that tweet into a query using [this documentation for query building](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#examples). Note that the whole query should be a string, e.g. 'conversation_id:1334987486343299072'.
4. Similar to example 1.1 in the example code, use the `search_recent_tweets` method to pull tweets that are in response to the focal tweet from step 2. If this returns no data, manually try a different `conversation_id` (some tweets start retrievable "conversations", others don't).
5. Place them in a dataframe and do some text analysis of the results (eg sentiment; tokenizing and top words)


In [3]:
## focal user: MayorBowser

### step 1: get numeric id
mmb_id = client.get_user(username= "MayorBowser").data['id']

### step 2: pull max 200 tweets
tweet_attr = ['id',
              'created_at', 'author_id', 
             'text', 'lang', 'geo',
             'conversation_id', 'public_metrics']
mmb_t_resp = client.get_users_tweets(id = mmb_id,
                max_results = 100, 
                tweet_fields = tweet_attr)

print(type(mmb_t_resp))


<class 'tweepy.client.Response'>


In [4]:
### step 3: clean up; this time, i want to get all the diff engagement metrics
### so i'm modifying the function
### function to iterate over attributes
def pull_attr_flat(one_tweet, which_attr):
    all_attr = [one_tweet[attr]   
                for attr in which_attr]
    return(all_attr)

def pull_attr_nested(one_tweet, attr = 'public_metrics'):
    all_attr_vals = [value[1] for value in
                    one_tweet[attr].items()]
    return all_attr_vals

tweet_att_1 = pd.DataFrame([pull_attr_flat(one_tweet,
                [att for att in tweet_attr if att != "public_metrics"]) 
                for one_tweet in mmb_t_resp.data],
                columns = [att for att in tweet_attr if att != "public_metrics"])

tweet_att_2 = pd.DataFrame([pull_attr_nested(one_tweet) 
                for one_tweet in mmb_t_resp.data],
                columns = [key[0] for key in mmb_t_resp.data[0]['public_metrics'].items()])

## combine
mmb_tweet_att = pd.concat([tweet_att_1, tweet_att_2], axis = 1)
mmb_tweet_att.sort_values(by = 'reply_count', ascending = False).head()

Unnamed: 0,id,created_at,author_id,text,lang,geo,conversation_id,retweet_count,reply_count,like_count,quote_count
79,1578082330568163328,2022-10-06 17:58:41+00:00,976542720,"This week we set a goal to increase Black Homeownership in the District by 20,000, by 2030. This is the first step in undoing discriminatory housing policies that locked many Black families out of homeownership throughout the twentieth century. https://t.co/8vNwCPb1zA",en,,1578082330568163328,14,9,33,0
63,1579215149835378688,2022-10-09 21:00:06+00:00,976542720,"Cedar Hill Urgent Care, GW Health is the first location to open as part of a partnership between the District and UHS. \n\nThis urgent care is a critical piece of an entire health care system serving communities east of the Anacostia. The facility will open for service Oct. 10th. https://t.co/kMH9kx3DE0",en,,1579215149835378688,6,5,35,2
43,1580315623506300928,2022-10-12 21:53:00+00:00,976542720,LIVE: Mayor Bowser Celebrates Official Completion of The Wharf https://t.co/F71SLNFZiU,en,,1580315623506300928,18,5,35,1
82,1578010651598807040,2022-10-06 13:13:52+00:00,976542720,Today join @dc_aap and @dchealthlink for Townhall Thursdays. Get all the information you need to protect your family during this season. https://t.co/g7ZRsWzUdP,en,,1578010651598807040,9,5,8,0
44,1580279963349094409,2022-10-12 19:31:18+00:00,976542720,LIVE: Mayor Bowser Hosts Media Avail https://t.co/8amRA3JGJ2,en,,1580279963349094409,16,5,19,4


In [5]:
## use query function to pull some replies
### step 1: get conversation id in automated way
focal_convo = mmb_tweet_att[mmb_tweet_att.reply_count == np.max(mmb_tweet_att.reply_count)].conversation_id.iloc[0]
## The above line doesn't work--that tweet likely didn't "start a conversation"
## so manually choose fourth result from above search results (this works)
focal_convo = 1580279963349094409

## step 2: write query to pull tweets w/ that convo id
query = "conversation_id:{}".format(focal_convo)
query

### step 3: search based on query
tweets_mask = client.search_recent_tweets(query = query, max_results = 100,
                                    tweet_fields = ['created_at',
                                                    'author_id',
                                                    'geo',
                                                    'lang', 
                                                    'public_metrics'],
                                    user_fields = ['description',
                                                   'location',
                                                   'verified',
                                                   'public_metrics'],
                                    expansions = 'author_id')

tweets_mask_df = pd.DataFrame([pull_attr_flat(one_tweet,
                [att for att in tweet_attr if att != "public_metrics"]) 
                for one_tweet in tweets_mask.data if tweets_mask.data],
                columns = [att for att in tweet_attr if att != "public_metrics"])
tweets_mask_df.head()

'conversation_id:1580279963349094409'

Unnamed: 0,id,created_at,author_id,text,lang,geo,conversation_id
0,1580532209814949888,2022-10-13 12:13:38+00:00,23318566,@MayorBowser @RachaelGass 🥱🥱🥱,und,,
1,1580419311243386880,2022-10-13 04:45:01+00:00,2873754836,"@MayorBowser MOMMY, YOU NAILED IT AGAIN!",en,,
2,1580400810164654081,2022-10-13 03:31:30+00:00,254401092,@MayorBowser Our great Mayor!,en,,
3,1580340241025896449,2022-10-12 23:30:49+00:00,952577256500559872,"@MayorBowser No one is safe in DC, especially from crime committed by deputy mayors on Bowser staff. violence knows no bounds in a liberal city. Its no wonder everyone hates liberal run big cities.",en,,


In [6]:
## do sentiment analysis on the resulting texts: 
## are they more positive or more negative?

## initialize a scorer
si_scorer = SentimentIntensityAnalyzer()

## do sentiment scoring on the tweets
tweets_mask_df['compound_sent'] = [si_scorer.polarity_scores(one_tweet)['compound'] 
                                      for one_tweet in tweets_mask_df.text]

print('Average sentiment score:', str(tweets_mask_df['compound_sent'].agg('mean'))) # display average sentiment score

tweets_mask_df.sort_values(by = 'compound_sent', ascending = False).head(5)


Average sentiment score: -0.06092499999999998


Unnamed: 0,id,created_at,author_id,text,lang,geo,conversation_id,compound_sent
2,1580400810164654081,2022-10-13 03:31:30+00:00,254401092,@MayorBowser Our great Mayor!,en,,,0.6588
0,1580532209814949888,2022-10-13 12:13:38+00:00,23318566,@MayorBowser @RachaelGass 🥱🥱🥱,und,,,0.0
1,1580419311243386880,2022-10-13 04:45:01+00:00,2873754836,"@MayorBowser MOMMY, YOU NAILED IT AGAIN!",en,,,0.0
3,1580340241025896449,2022-10-12 23:30:49+00:00,952577256500559872,"@MayorBowser No one is safe in DC, especially from crime committed by deputy mayors on Bowser staff. violence knows no bounds in a liberal city. Its no wonder everyone hates liberal run big cities.",en,,,-0.9025
