# Collect data from Twitter using the Twitter API

## Import packages and configure the Twitter API

In [1]:
# import libraries

# !pip install tweepy
import tweepy as tw
print(tw.__version__)

4.10.0


In [50]:
# !pip install pandas
import pandas as pd
import json
import time
import csv
import requests
print(requests.__version__)

2.28.1


In [3]:
# import the API credentials from the local keys module (kept out of version control via .gitignore)

import keys

In [4]:
# assign the values accordingly

# Twitter API v2 client; every credential is read from the local `keys` module.
client = tw.Client(
    bearer_token=keys.bearer_token,
    consumer_key=keys.consumer_key,
    consumer_secret=keys.consumer_secret,
    access_token=keys.access_token,
    access_token_secret=keys.access_token_secret,
    # ask tweepy to wait when a rate limit is hit
    wait_on_rate_limit=True,
)

In [5]:
# check connection
# client = tw.Client(bearer_token)
client

<tweepy.client.Client at 0x7f89c2ab1bb0>

## Get the list of news media Twitter usernames

In [6]:
# read a CSV file with the selected news media's Twitter usernames

# location of the CSV listing the selected news media and their Twitter usernames
news_media_csv_path = r'/Users/katherina/data_science_tweets_neutrality/news_media_twitter_username.csv'
news_media_basic_data = pd.read_csv(news_media_csv_path)
news_media_basic_data

Unnamed: 0,news_media,twitter_username,us_or_foreign,general_or_political
0,ABC News,ABC,US,General
1,ABC News,ABCPolitics,US,Political
2,ABC News,ABCNewsLive,US,General
3,BBC News,BBCWorld,Foreign,General
4,BBC News,BBCBreaking,Foreign,General
5,BBC News,BBCNews,Foreign,General
6,BBC News,BBCNorthAmerica,Foreign,General
7,CBS News,CBSNews,US,General
8,CBS News,CBSPolitics,US,Political
9,CBS News,CBSNewsPoll,US,Political


In [7]:
# pull the username column out of the table as a plain Python list
twitter_username_column = news_media_basic_data['twitter_username']
news_media_twitter_username_list = twitter_username_column.tolist()
news_media_twitter_username_list

# for testing
# news_media_twitter_username_list = ["ABC","ABCPolitics","ABCNewsLive"]
# news_media_twitter_username_list

['ABC',
 'ABCPolitics',
 'ABCNewsLive',
 'BBCWorld',
 'BBCBreaking',
 'BBCNews',
 'BBCNorthAmerica',
 'CBSNews',
 'CBSPolitics',
 'CBSNewsPoll',
 'CNN',
 'cnnbrk',
 'CNNPolitics',
 'FoxNews',
 'foxnewspolitics',
 'euronews',
 'MSNBC',
 'NBCNews',
 'NBCPolitics',
 'nytimes',
 'nytpolitics',
 'NPR',
 'nprpolitics',
 'SkyNews',
 'USATODAY',
 'usatodayDC',
 'WSJ',
 'WSJusnews',
 'washingtonpost']

## Call Twitter API v2

### get user details

In [8]:
# call Twitter API v2 to get the user id and other user information for each user in the list

def get_users_details(usernames):
    """Look up Twitter users by username through the module-level ``client``.

    Parameters
    ----------
    usernames
        Usernames to look up; forwarded as ``usernames=`` to
        ``client.get_users``.

    Returns
    -------
    The response object returned by ``client.get_users``, unchanged.
    """
    # extra user fields requested on top of the API's default ones
    requested_user_fields = ["created_at","location","protected","public_metrics","verified"]
    return client.get_users(usernames=usernames, user_fields=requested_user_fields)

twitter_user_details = get_users_details(news_media_twitter_username_list)
twitter_user_details

Response(data=[<User id=28785486 name=ABC News username=ABC>, <User id=16815644 name=ABC News Politics username=ABCPolitics>, <User id=384438102 name=ABC News Live username=ABCNewsLive>, <User id=742143 name=BBC News (World) username=BBCWorld>, <User id=5402612 name=BBC Breaking News username=BBCBreaking>, <User id=612473 name=BBC News (UK) username=BBCNews>, <User id=69329527 name=BBC North America username=BBCNorthAmerica>, <User id=15012486 name=CBS News username=CBSNews>, <User id=18767699 name=CBS News Politics username=CBSPolitics>, <User id=270508448 name=CBS News Poll username=CBSNewsPoll>, <User id=759251 name=CNN username=CNN>, <User id=428333 name=CNN Breaking News username=cnnbrk>, <User id=13850422 name=CNN Politics username=CNNPolitics>, <User id=1367531 name=Fox News username=FoxNews>, <User id=16032925 name=Fox News Politics username=foxnewspolitics>, <User id=25067168 name=euronews username=euronews>, <User id=2836421 name=MSNBC username=MSNBC>, <User id=14173315 name=

In [10]:
# turn every user in the API response into one flat record (dict)
def _user_to_record(user):
    """Return a flat dict holding the fields read from one user object."""
    metrics = user.public_metrics
    return {
        'user_id': user.id,
        'user_name': user.name,
        'user_username': user.username,
        "user_created_at" : user.created_at,
        "user_location" : user.location,
        "user_protected" : user.protected,
        "user_followers_count" : metrics["followers_count"],
        "user_following_count" : metrics["following_count"],
        "user_tweet_count" : metrics["tweet_count"],
        "user_listed_count" : metrics["listed_count"],
        "user_verified" : user.verified
    }

twitter_user_details_lst = [_user_to_record(user) for user in twitter_user_details.data]

# build the dataframe from the records; the last expression displays it
twitter_user_details_df = pd.DataFrame(twitter_user_details_lst)
twitter_user_details_df

Unnamed: 0,user_id,user_name,user_username,user_created_at,user_location,user_protected,user_followers_count,user_following_count,user_tweet_count,user_listed_count,user_verified
0,28785486,ABC News,ABC,2009-04-04 12:40:32+00:00,New York City / Worldwide,False,17637543,486,392806,66090,True
1,16815644,ABC News Politics,ABCPolitics,2008-10-16 22:43:16+00:00,"Washington, DC",False,1079082,473,192649,8493,True
2,384438102,ABC News Live,ABCNewsLive,2011-10-03 17:47:38+00:00,,False,95939,242,36702,2238,True
3,742143,BBC News (World),BBCWorld,2007-02-01 07:44:29+00:00,"London, UK",False,37686088,18,349672,132449,True
4,5402612,BBC Breaking News,BBCBreaking,2007-04-22 14:42:37+00:00,"London, UK",False,50643440,3,38172,147688,True
5,612473,BBC News (UK),BBCNews,2007-01-08 08:05:57+00:00,London,False,14255861,93,498922,49355,True
6,69329527,BBC North America,BBCNorthAmerica,2009-08-27 16:09:18+00:00,Washington DC,False,312480,790,74325,4434,True
7,15012486,CBS News,CBSNews,2008-06-05 00:54:31+00:00,"New York, NY",False,8713463,596,344979,56334,True
8,18767699,CBS News Politics,CBSPolitics,2009-01-08 15:57:49+00:00,"Washington, D.C.",False,313506,317,28419,3898,True
9,270508448,CBS News Poll,CBSNewsPoll,2011-03-22 19:04:44+00:00,,False,13358,260,2692,700,True


In [12]:
# check the data
twitter_user_details_df.count()

user_id                 29
user_name               29
user_username           29
user_created_at         29
user_location           23
user_protected          29
user_followers_count    29
user_following_count    29
user_tweet_count        29
user_listed_count       29
user_verified           29
dtype: int64

In [13]:
# save the data to csv
user_details_csv_path = r'/Users/katherina/data_science_tweets_neutrality/twitter_user_details.csv'
# index=False leaves the dataframe's row index out of the file
twitter_user_details_df.to_csv(user_details_csv_path, header=True, index=False)

### get tweets of users

In [14]:
# use users ids from the created dataset to get tweets of those user ids

In [15]:
# read the csv that was created for user details
twitter_user_details_data = pd.read_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_user_details.csv'
)
# preview the first ten rows
twitter_user_details_data.head(n=10)

Unnamed: 0,user_id,user_name,user_username,user_created_at,user_location,user_protected,user_followers_count,user_following_count,user_tweet_count,user_listed_count,user_verified
0,28785486,ABC News,ABC,2009-04-04 12:40:32+00:00,New York City / Worldwide,False,17637543,486,392806,66090,True
1,16815644,ABC News Politics,ABCPolitics,2008-10-16 22:43:16+00:00,"Washington, DC",False,1079082,473,192649,8493,True
2,384438102,ABC News Live,ABCNewsLive,2011-10-03 17:47:38+00:00,,False,95939,242,36702,2238,True
3,742143,BBC News (World),BBCWorld,2007-02-01 07:44:29+00:00,"London, UK",False,37686088,18,349672,132449,True
4,5402612,BBC Breaking News,BBCBreaking,2007-04-22 14:42:37+00:00,"London, UK",False,50643440,3,38172,147688,True
5,612473,BBC News (UK),BBCNews,2007-01-08 08:05:57+00:00,London,False,14255861,93,498922,49355,True
6,69329527,BBC North America,BBCNorthAmerica,2009-08-27 16:09:18+00:00,Washington DC,False,312480,790,74325,4434,True
7,15012486,CBS News,CBSNews,2008-06-05 00:54:31+00:00,"New York, NY",False,8713463,596,344979,56334,True
8,18767699,CBS News Politics,CBSPolitics,2009-01-08 15:57:49+00:00,"Washington, D.C.",False,313506,317,28419,3898,True
9,270508448,CBS News Poll,CBSNewsPoll,2011-03-22 19:04:44+00:00,,False,13358,260,2692,700,True


In [28]:
# create a list for user ids
# take the user_id column and convert it to a plain Python list
user_id_column = twitter_user_details_data["user_id"]
user_id_list = user_id_column.tolist()
user_id_list

[28785486,
 16815644,
 384438102,
 742143,
 5402612,
 612473,
 69329527,
 15012486,
 18767699,
 270508448,
 759251,
 428333,
 13850422,
 1367531,
 16032925,
 25067168,
 2836421,
 14173315,
 11856032,
 807095,
 14434063,
 5392522,
 5741722,
 7587032,
 15754281,
 7998482,
 3108351,
 28135853,
 2467791]

In [29]:
# check the length of the id list
len(user_id_list)

29

In [43]:
# Create functions to get a user's tweets and save the results

# a function for getting one page of a user's tweets; inputs: user_id, start_time, and end_time (the pagination token is read from the global next_token)
def get_users_tweets(user_id,start_time,end_time):
    """Request one page (at most 100 tweets) of a user's tweets.

    Parameters
    ----------
    user_id
        Forwarded as ``id=`` to ``client.get_users_tweets``.
    start_time, end_time
        Forwarded unchanged as the ``start_time`` / ``end_time`` bounds.

    Returns
    -------
    The response object returned by ``client.get_users_tweets``.

    Notes
    -----
    The pagination token is NOT a parameter: it is read from the
    module-level variable ``next_token`` at call time, so the caller must
    assign ``next_token`` (``None`` for the first page) before each call.
    """
    # tweet_fields lists the fields we want included in the response
    request_kwargs = dict(
        id=user_id,
        start_time=start_time,
        end_time=end_time,
        max_results=100,
        pagination_token=next_token,
        exclude=None,
        expansions="referenced_tweets.id",
        tweet_fields=["author_id","context_annotations","created_at","entities","in_reply_to_user_id","lang","public_metrics","referenced_tweets"],
    )
    return client.get_users_tweets(**request_kwargs)
# use tweet_fields to specify the fields we want to include in the response


# a function to append get_users_tweets response to a list
def user_tweets_response_to_lst(users_tweets,users_tweets_lst):
    """Append one flat record per tweet in ``users_tweets.data`` to a list.

    Parameters
    ----------
    users_tweets
        Response object whose ``data`` attribute is either ``None`` or an
        iterable of tweet objects.
    users_tweets_lst
        List that receives the records; it is extended in place.

    Returns
    -------
    The same ``users_tweets_lst`` object that was passed in.
    """
    page = users_tweets.data
    if page is None:
        # nothing on this page: hand the accumulator back untouched
        return users_tweets_lst

    for tweet in page:
        metrics = tweet.public_metrics
        users_tweets_lst.append({
            'tweet_id': tweet.id,
            'tweet_text': tweet.text,
            'tweet_author_id': tweet.author_id,
            "tweet_context_annotations" : tweet.context_annotations,
            "tweet_created_at" : tweet.created_at,
            "tweet_entities" : tweet.entities,
            "tweet_in_reply_to_user_id" : tweet.in_reply_to_user_id,
            "tweet_lang" : tweet.lang,
            "tweet_retweet_count" : metrics["retweet_count"],
            "tweet_reply_count" : metrics["reply_count"],
            "tweet_like_count" : metrics["like_count"],
            "tweet_quote_count" : metrics["quote_count"],
            "tweet_referenced_tweets" : tweet.referenced_tweets
        })
    return users_tweets_lst

# use the if check in case the response data is None
# use the for loop to iterate through the tweets in the response
# append each tweet to the list as a flat dict of its fields

In [46]:
# use get_users_tweets and user_tweets_response_to_lst to extract and save the users' tweets from July 2022

# time window of the request: 2022-07-01T00:00:00Z up to 2022-08-01T00:00:00Z
start_time = "2022-07-01T00:00:00Z"
end_time = "2022-08-01T00:00:00Z"

# accumulator for the tweet records of every user
users_tweets_2022_july_lst = []

# outer loop: one pass per user id
# inner loop: one pass per result page, following next_token until none is returned
# a 5 second pause follows every request
for user_id in user_id_list:
    # next_token has to be a module-level name: get_users_tweets reads it
    # as the pagination token instead of taking it as an argument
    next_token = None
    count_page = 0
    finished = False
    while not finished:
        users_tweets = get_users_tweets(user_id,start_time,end_time)
        users_tweets_2022_july_lst = user_tweets_response_to_lst(users_tweets,users_tweets_2022_july_lst)
        count_page += 1
        # the last page carries no 'next_token' entry in its meta
        finished = 'next_token' not in users_tweets.meta
        if finished:
            print("user_id: ",user_id,"count result page: ",count_page)
        else:
            next_token = users_tweets.meta["next_token"]
        time.sleep(5)

# print count of result page for each user

user_id:  28785486 count result page:  26
user_id:  16815644 count result page:  12
user_id:  384438102 count result page:  5
user_id:  742143 count result page:  13
user_id:  5402612 count result page:  1
user_id:  612473 count result page:  25
user_id:  69329527 count result page:  2
user_id:  15012486 count result page:  27
user_id:  18767699 count result page:  1
user_id:  270508448 count result page:  1
user_id:  759251 count result page:  25
user_id:  428333 count result page:  2
user_id:  13850422 count result page:  5
user_id:  1367531 count result page:  18
user_id:  16032925 count result page:  1
user_id:  25067168 count result page:  10
user_id:  2836421 count result page:  19
user_id:  14173315 count result page:  26
user_id:  11856032 count result page:  4
user_id:  807095 count result page:  27
user_id:  14434063 count result page:  4
user_id:  5392522 count result page:  14
user_id:  5741722 count result page:  7
user_id:  7587032 count result page:  22
user_id:  1575428

In [47]:
# transform the list to a dataframe
# one dataframe row per collected tweet record
twitter_users_tweets_2022_july_df = pd.DataFrame(data=users_tweets_2022_july_lst)
# preview the first ten rows
twitter_users_tweets_2022_july_df.head(n=10)

Unnamed: 0,tweet_id,tweet_text,tweet_author_id,tweet_context_annotations,tweet_created_at,tweet_entities,tweet_in_reply_to_user_id,tweet_lang,tweet_retweet_count,tweet_reply_count,tweet_like_count,tweet_quote_count,tweet_referenced_tweets
0,1553892491962843136,Smokers and vapers are more likely to have a s...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 23:56:54+00:00,"{'urls': [{'start': 111, 'end': 134, 'url': 'h...",,en,59,43,144,16,
1,1553886764930138122,Gun manufacturers have made more than $1 billi...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 23:34:09+00:00,"{'urls': [{'start': 214, 'end': 237, 'url': 'h...",,en,82,388,187,65,
2,1553880276702560256,Misinformation and stigma may be holding back ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 23:08:22+00:00,"{'urls': [{'start': 69, 'end': 92, 'url': 'htt...",,en,24,39,46,3,
3,1553872564820361222,Rising concern over the impact of a potential ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 22:37:43+00:00,"{'annotations': [{'start': 90, 'end': 96, 'pro...",,en,23,28,58,7,
4,1553865859977908225,"At least 28 people have been confirmed dead, b...",28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-07-31 22:11:05+00:00,"{'annotations': [{'start': 98, 'end': 109, 'pr...",,en,83,38,204,6,
5,1553861323691376648,A fire burning out of control in a Northern Ca...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 21:53:03+00:00,"{'annotations': [{'start': 35, 'end': 53, 'pro...",,en,47,8,81,0,
6,1553856250613534720,Two cyclists were killed and three others inju...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 21:32:54+00:00,"{'annotations': [{'start': 78, 'end': 92, 'pro...",,en,27,5,46,1,
7,1553847893634809858,Dramatic video captures the moment a portion o...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 20:59:41+00:00,"{'annotations': [{'start': 48, 'end': 53, 'pro...",,en,31,5,53,3,
8,1553844156761620485,"NEW: George Takei on Nichelle Nichols: ""My hea...",28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-07-31 20:44:50+00:00,"{'annotations': [{'start': 5, 'end': 16, 'prob...",28785486.0,en,115,19,826,3,"[(type, id)]"
9,1553839967796854785,NEW: Former Pres. Barack Obama on Bill Russell...,28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-07-31 20:28:11+00:00,"{'annotations': [{'start': 18, 'end': 29, 'pro...",,en,1303,135,6073,55,


In [48]:
# check the tweets per author id
twitter_users_tweets_2022_july_df.groupby(['tweet_author_id'])['tweet_author_id'].count()

tweet_author_id
428333        133
612473       2454
742143       1288
759251       2413
807095       2614
1367531      1790
2467791      2487
2836421      1802
3108351      2709
5392522      1310
5402612        87
5741722       610
7587032      2162
7998482       576
11856032      374
13850422      495
14173315     2565
14434063      381
15012486     2600
15754281     1892
16815644     1137
18767699       78
25067168      958
28785486     2526
69329527      134
270508448      23
384438102     440
Name: tweet_author_id, dtype: int64

In [49]:
# save the data to a csv file
july_tweets_csv_path = r'/Users/katherina/data_science_tweets_neutrality/twitter_users_tweets_2022_july.csv'
# index=False leaves the dataframe's row index out of the file
twitter_users_tweets_2022_july_df.to_csv(july_tweets_csv_path, header=True, index=False)

In [25]:
# users_tweets.meta["next_token"]

In [14]:
# tweet_details = [[tweet.geo, tweet.text, tweet.user.screen_name, tweet.user.location] for tweet in tweets]
# tweet_df = pd.DataFrame(data=tweet_details, columns=["geo","text","user","location"])
# tweet_df.head()