# Collect data from Twitter using Twitter API

## import packages and configure twitter api

In [1]:
# import libraries

# !pip install tweepy
import tweepy as tw
print(tw.__version__)

4.10.0


In [2]:
# !pip install pandas
import pandas as pd
import json
import time
import csv
import requests
print(requests.__version__)

2.28.1


In [3]:
# import keys from a .gitignore file

import keys

In [4]:
# Build the tweepy v2 client that the API helper functions in this notebook call.
# Every credential is read from the local `keys` module imported above, so no
# secret value is written into the notebook itself.

client = tw.Client(
    wait_on_rate_limit = True,  # NOTE(review): per the tweepy docs this makes the client wait out a rate limit instead of raising — confirm
    consumer_key = keys.consumer_key,
    consumer_secret = keys.consumer_secret,
    access_token = keys.access_token,
    access_token_secret = keys.access_token_secret,
    bearer_token = keys.bearer_token,
)

In [5]:
# check connection
# client = tw.Client(bearer_token)
client

<tweepy.client.Client at 0x7f88203deca0>

## get the list of news media twitter username

In [6]:
# Load the table of selected news outlets together with their Twitter usernames.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs unchanged on a machine with this exact directory layout.

news_media_basic_data = pd.read_csv(
    r'/Users/katherina/data_science_tweets_neutrality/news_media_twitter_username.csv'
)
news_media_basic_data

Unnamed: 0,news_media,twitter_username,us_or_foreign,general_or_political,political_spectrum
0,ABC News,ABC,US,General,Left
1,ABC News,ABCNewsLive,US,General,Left
2,ABC News,ABCPolitics,US,Political,Left
3,BBC News,BBCBreaking,Foreign,General,Middle
4,BBC News,BBCNews,Foreign,General,Middle
...,...,...,...,...,...
58,Vox,voxdotcom,US,General,Left
59,Wall Street Journal,WSJ,US,General,Middle
60,Wall Street Journal,WSJusnews,US,General,Middle
61,Washington Examiner,dcexaminer,US,General,Right


In [7]:
# Plain Python list of the handles, in the same order as the CSV rows.
news_media_twitter_username_list = news_media_basic_data['twitter_username'].to_list()
news_media_twitter_username_list

# for testing
# news_media_twitter_username_list = ["ABC","ABCPolitics","ABCNewsLive"]
# news_media_twitter_username_list

['ABC',
 'ABCNewsLive',
 'ABCPolitics',
 'BBCBreaking',
 'BBCNews',
 'BBCNorthAmerica',
 'BBCWorld',
 'bpolitics',
 'business',
 'CBSNews',
 'CBSNewsPoll',
 'CBSPolitics',
 'CNN',
 'cnnbrk',
 'CNNPolitics',
 'DailyMail',
 'DailyWireNews',
 'realDailyWire',
 'democracynow',
 'Forbes',
 'FoxNews',
 'foxnewspolitics',
 'HuffPost',
 'HuffPostPol',
 'MotherJones',
 'MSNBC',
 'NRWire',
 'NBCNews',
 'NBCPolitics',
 'nytimes',
 'nytpolitics',
 'Newsweek',
 'NPR',
 'nprpolitics',
 'politico',
 'politicony',
 'Reuters',
 'SkyNews',
 'Slate',
 'amspectator',
 'TheAtlantic',
 'TheAtlPolitics',
 'thedailybeast',
 'EconUS',
 'TheEconomist',
 'EpochTimes',
 'FDRLST',
 'GdnPolitics',
 'guardian',
 'guardiannews',
 'thehill',
 'theintercept',
 'NewYorker',
 'WashTimes',
 'theblaze',
 'TIME',
 'USATODAY',
 'usatodayDC',
 'voxdotcom',
 'WSJ',
 'WSJusnews',
 'dcexaminer',
 'washingtonpost']

## Call Twitter API v2

### get user details

In [8]:
# call twitter API v2 to get user id and other user information for the user in the list

def get_users_details(usernames):
    """Look up Twitter accounts by username through the notebook's v2 client.

    Args:
        usernames: the account handles to look up.

    Returns:
        Whatever `client.get_users` returns for the request; the cells that
        follow read the matched accounts from its `.data` attribute.
    """
    # Extra profile fields requested on top of the API's default user payload.
    requested_fields = ["created_at","location","protected","public_metrics","verified"]
    return client.get_users(usernames=usernames, user_fields=requested_fields)


# Fetch the profile details for every handle in the username list.
twitter_user_details = get_users_details(usernames=news_media_twitter_username_list)
twitter_user_details

Response(data=[<User id=28785486 name=ABC News username=ABC>, <User id=384438102 name=ABC News Live username=ABCNewsLive>, <User id=16815644 name=ABC News Politics username=ABCPolitics>, <User id=5402612 name=BBC Breaking News username=BBCBreaking>, <User id=612473 name=BBC News (UK) username=BBCNews>, <User id=69329527 name=BBC North America username=BBCNorthAmerica>, <User id=742143 name=BBC News (World) username=BBCWorld>, <User id=564111558 name=Bloomberg Politics username=bpolitics>, <User id=34713362 name=Bloomberg username=business>, <User id=15012486 name=CBS News username=CBSNews>, <User id=270508448 name=CBS News Poll username=CBSNewsPoll>, <User id=18767699 name=CBS News Politics username=CBSPolitics>, <User id=759251 name=CNN username=CNN>, <User id=428333 name=CNN Breaking News username=cnnbrk>, <User id=13850422 name=CNN Politics username=CNNPolitics>, <User id=380285402 name=Daily Mail US username=DailyMail>, <User id=1215486846798569473 name=Daily Wire News username=Dai

In [9]:
# Flatten every returned user object into one plain dict (one row per account).
twitter_user_details_lst = []

for user in twitter_user_details.data:
    # public_metrics is a mapping of counters; unpack it into separate columns
    metrics = user.public_metrics
    user_info = {
        "user_id": user.id,
        "user_name": user.name,
        "user_username": user.username,
        "user_created_at": user.created_at,
        "user_location": user.location,
        "user_protected": user.protected,
        "user_followers_count": metrics["followers_count"],
        "user_following_count": metrics["following_count"],
        "user_tweet_count": metrics["tweet_count"],
        "user_listed_count": metrics["listed_count"],
        "user_verified": user.verified,
    }
    twitter_user_details_lst.append(user_info)

# Build the dataframe once from the finished list of records, then show it.
twitter_user_details_df = pd.DataFrame(data=twitter_user_details_lst)
twitter_user_details_df

Unnamed: 0,user_id,user_name,user_username,user_created_at,user_location,user_protected,user_followers_count,user_following_count,user_tweet_count,user_listed_count,user_verified
0,28785486,ABC News,ABC,2009-04-04 12:40:32+00:00,New York City / Worldwide,False,17644680,486,393230,66112,True
1,384438102,ABC News Live,ABCNewsLive,2011-10-03 17:47:38+00:00,,False,96020,242,36785,2240,True
2,16815644,ABC News Politics,ABCPolitics,2008-10-16 22:43:16+00:00,"Washington, DC",False,1079807,473,192830,8499,True
3,5402612,BBC Breaking News,BBCBreaking,2007-04-22 14:42:37+00:00,"London, UK",False,50727938,3,38184,147742,True
4,612473,BBC News (UK),BBCNews,2007-01-08 08:05:57+00:00,London,False,14276300,93,499300,49372,True
...,...,...,...,...,...,...,...,...,...,...,...
58,2347049341,Vox,voxdotcom,2014-02-16 15:46:00+00:00,,False,1046634,166,144876,16635,True
59,3108351,The Wall Street Journal,WSJ,2007-04-01 06:22:13+00:00,"New York, NY",False,20084774,1077,380324,5,True
60,28135853,WSJ U.S. News,WSJusnews,2009-04-01 16:20:16+00:00,"New York, NY",False,39831,39,32908,1184,False
61,18956073,Washington Examiner,dcexaminer,2009-01-13 22:16:00+00:00,"Washington, DC",False,309186,372,622664,4096,True


In [10]:
# Non-null count per column: a count lower than the number of rows marks a
# column with missing values (in the recorded output only user_location is incomplete).
twitter_user_details_df.count()

user_id                 63
user_name               63
user_username           63
user_created_at         63
user_location           48
user_protected          63
user_followers_count    63
user_following_count    63
user_tweet_count        63
user_listed_count       63
user_verified           63
dtype: int64

In [11]:
# Persist the account details (header row kept, index column dropped); this
# file is read back in the tweet-collection section of the notebook.
# NOTE(review): absolute, machine-specific path.
twitter_user_details_df.to_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_user_details.csv',
    header=True,
    index=False,
)

### get tweets of users

In [12]:
# use users ids from the created dataset to get tweets of those user ids

In [13]:
# Reload the saved account details from disk and preview the first ten rows.
twitter_user_details_data = pd.read_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_user_details.csv'
)
twitter_user_details_data.head(n=10)

Unnamed: 0,user_id,user_name,user_username,user_created_at,user_location,user_protected,user_followers_count,user_following_count,user_tweet_count,user_listed_count,user_verified
0,28785486,ABC News,ABC,2009-04-04 12:40:32+00:00,New York City / Worldwide,False,17644680,486,393230,66112,True
1,384438102,ABC News Live,ABCNewsLive,2011-10-03 17:47:38+00:00,,False,96020,242,36785,2240,True
2,16815644,ABC News Politics,ABCPolitics,2008-10-16 22:43:16+00:00,"Washington, DC",False,1079807,473,192830,8499,True
3,5402612,BBC Breaking News,BBCBreaking,2007-04-22 14:42:37+00:00,"London, UK",False,50727938,3,38184,147742,True
4,612473,BBC News (UK),BBCNews,2007-01-08 08:05:57+00:00,London,False,14276300,93,499300,49372,True
5,69329527,BBC North America,BBCNorthAmerica,2009-08-27 16:09:18+00:00,Washington DC,False,312777,790,74351,4434,True
6,742143,BBC News (World),BBCWorld,2007-02-01 07:44:29+00:00,"London, UK",False,37774379,18,349903,132471,True
7,564111558,Bloomberg Politics,bpolitics,2012-04-26 20:50:01+00:00,,False,361262,970,224072,5519,True
8,34713362,Bloomberg,business,2009-04-23 20:05:17+00:00,New York and the World,False,8533888,1420,828896,67642,True
9,15012486,CBS News,CBSNews,2008-06-05 00:54:31+00:00,"New York, NY",False,8719635,596,345544,56372,True


In [14]:
# Numeric account ids, in file order; the collection loops iterate over this list.
user_id_list = twitter_user_details_data["user_id"].to_list()
user_id_list

[28785486,
 384438102,
 16815644,
 5402612,
 612473,
 69329527,
 742143,
 564111558,
 34713362,
 15012486,
 270508448,
 18767699,
 759251,
 428333,
 13850422,
 380285402,
 1215486846798569473,
 4081106480,
 16935292,
 91478624,
 1367531,
 16032925,
 14511951,
 15458694,
 18510860,
 2836421,
 1179092476931579909,
 14173315,
 11856032,
 807095,
 14434063,
 2884771,
 5392522,
 5741722,
 9300262,
 140928543,
 1652541,
 7587032,
 15164565,
 16180004,
 35773039,
 140844060,
 16012783,
 32353291,
 5988062,
 29097819,
 1408003598,
 47636400,
 87818409,
 788524,
 1917731,
 2329066872,
 14677919,
 14662354,
 10774652,
 14293310,
 15754281,
 7998482,
 2347049341,
 3108351,
 28135853,
 18956073,
 2467791]

In [15]:
# Sanity check: how many account ids will be queried for tweets.
len(user_id_list)

63

In [16]:
# Create functions to get a user's tweets and save the results

# a function for getting one page of a user's tweets; inputs: user_id, start_time, and end_time
def get_users_tweets(user_id, start_time, end_time, pagination_token=...):
    """Fetch one page (up to 100 tweets) of a user's timeline for a time window.

    Args:
        user_id: id of the account whose tweets are requested.
        start_time: lower bound of the window, passed to the API unchanged
            (the notebook uses strings such as "2022-07-01T00:00:00Z").
        end_time: upper bound of the window, passed to the API unchanged.
        pagination_token: token of the result page to request. Pass ``None``
            for the first page, or the ``next_token`` found in the previous
            response's ``meta``. When the argument is omitted, the function
            falls back to a notebook-level variable named ``next_token`` (or
            ``None`` if no such variable exists); this keeps cells that drive
            paging through that global working unchanged.

    Returns:
        The response of ``client.get_users_tweets``; callers read the tweets
        from ``.data`` and the paging information from ``.meta``.

    NOTE(review): Twitter documents that the user-timeline endpoint only
    reaches an account's most recent ~3,200 tweets, so older windows can come
    back empty or truncated for high-volume accounts — confirm coverage.
    """
    if pagination_token is ...:
        # Legacy behaviour: paging state lives in the notebook namespace.
        # Using .get() avoids the NameError the old implicit lookup raised
        # when the function was called before any loop defined next_token.
        pagination_token = globals().get("next_token")

    # tweet_fields lists the extra fields to include with every returned tweet
    tweet_fields = [
        "author_id",
        "context_annotations",
        "created_at",
        "entities",
        "in_reply_to_user_id",
        "lang",
        "public_metrics",
        "referenced_tweets",
    ]
    users_tweets = client.get_users_tweets(
        id=user_id,
        end_time=end_time,
        exclude=None,
        expansions="referenced_tweets.id",
        max_results=100,
        pagination_token=pagination_token,
        start_time=start_time,
        tweet_fields=tweet_fields,
    )
    return users_tweets
# use tweet_fields to specify the fields we want to include in the response


# a function to append get_users_tweets response to a list
def user_tweets_response_to_lst(users_tweets,users_tweets_lst):
    """Append one flat record per tweet of a response page to an existing list.

    Args:
        users_tweets: a response object whose `.data` is either None or an
            iterable of tweet objects.
        users_tweets_lst: the list that accumulates the records; it is
            extended in place.

    Returns:
        The same list object that was passed in, so the call can be used both
        for its side effect and as a re-assignment.
    """
    page = users_tweets.data
    if page is None:
        # a page without tweets leaves the accumulator untouched
        return users_tweets_lst

    for tweet in page:
        # public_metrics is a mapping of counters; spread it over four columns
        metrics = tweet.public_metrics
        record = {
            "tweet_id": tweet.id,
            "tweet_text": tweet.text,
            "tweet_author_id": tweet.author_id,
            "tweet_context_annotations": tweet.context_annotations,
            "tweet_created_at": tweet.created_at,
            "tweet_entities": tweet.entities,
            "tweet_in_reply_to_user_id": tweet.in_reply_to_user_id,
            "tweet_lang": tweet.lang,
            "tweet_retweet_count": metrics["retweet_count"],
            "tweet_reply_count": metrics["reply_count"],
            "tweet_like_count": metrics["like_count"],
            "tweet_quote_count": metrics["quote_count"],
            "tweet_referenced_tweets": tweet.referenced_tweets,
        }
        users_tweets_lst.append(record)
    return users_tweets_lst

# the `if` check handles the case where the response data is None
# the `for` loop iterates through the tweets in the response
# each tweet is saved to the list as a flat record with its fields expanded

#### tweets in 2022 July

In [46]:
# Collect every tweet the tracked accounts posted in July 2022 (window given in UTC).

start_time = "2022-07-01T00:00:00Z"
end_time = "2022-08-01T00:00:00Z"

users_tweets_2022_july_lst = []

# Outer loop: one pass per account id.
# Inner loop: request result pages until the response metadata no longer
# carries a next_token; get_users_tweets picks the page to request from the
# notebook-level next_token variable set here.
# A pause after every request spaces out the API calls.
for user_id in user_id_list:
    next_token, count_page, finished = None, 0, False
    while not finished:
        users_tweets = get_users_tweets(user_id, start_time, end_time)
        users_tweets_2022_july_lst = user_tweets_response_to_lst(users_tweets, users_tweets_2022_july_lst)
        count_page += 1
        finished = 'next_token' not in users_tweets.meta
        if finished:
            # last page for this account: report how many pages were fetched
            print("user_id: ",user_id,"count result page: ",count_page)
        else:
            next_token = users_tweets.meta["next_token"]
        time.sleep(5)

# print count of result page for each user

user_id:  28785486 count result page:  26
user_id:  16815644 count result page:  12
user_id:  384438102 count result page:  5
user_id:  742143 count result page:  13
user_id:  5402612 count result page:  1
user_id:  612473 count result page:  25
user_id:  69329527 count result page:  2
user_id:  15012486 count result page:  27
user_id:  18767699 count result page:  1
user_id:  270508448 count result page:  1
user_id:  759251 count result page:  25
user_id:  428333 count result page:  2
user_id:  13850422 count result page:  5
user_id:  1367531 count result page:  18
user_id:  16032925 count result page:  1
user_id:  25067168 count result page:  10
user_id:  2836421 count result page:  19
user_id:  14173315 count result page:  26
user_id:  11856032 count result page:  4
user_id:  807095 count result page:  27
user_id:  14434063 count result page:  4
user_id:  5392522 count result page:  14
user_id:  5741722 count result page:  7
user_id:  7587032 count result page:  22
user_id:  1575428

In [47]:
# One row per collected tweet, columns taken from the record keys; preview ten rows.
twitter_users_tweets_2022_july_df = pd.DataFrame(data=users_tweets_2022_july_lst)
twitter_users_tweets_2022_july_df.head(n=10)

Unnamed: 0,tweet_id,tweet_text,tweet_author_id,tweet_context_annotations,tweet_created_at,tweet_entities,tweet_in_reply_to_user_id,tweet_lang,tweet_retweet_count,tweet_reply_count,tweet_like_count,tweet_quote_count,tweet_referenced_tweets
0,1553892491962843136,Smokers and vapers are more likely to have a s...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 23:56:54+00:00,"{'urls': [{'start': 111, 'end': 134, 'url': 'h...",,en,59,43,144,16,
1,1553886764930138122,Gun manufacturers have made more than $1 billi...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 23:34:09+00:00,"{'urls': [{'start': 214, 'end': 237, 'url': 'h...",,en,82,388,187,65,
2,1553880276702560256,Misinformation and stigma may be holding back ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 23:08:22+00:00,"{'urls': [{'start': 69, 'end': 92, 'url': 'htt...",,en,24,39,46,3,
3,1553872564820361222,Rising concern over the impact of a potential ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 22:37:43+00:00,"{'annotations': [{'start': 90, 'end': 96, 'pro...",,en,23,28,58,7,
4,1553865859977908225,"At least 28 people have been confirmed dead, b...",28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-07-31 22:11:05+00:00,"{'annotations': [{'start': 98, 'end': 109, 'pr...",,en,83,38,204,6,
5,1553861323691376648,A fire burning out of control in a Northern Ca...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 21:53:03+00:00,"{'annotations': [{'start': 35, 'end': 53, 'pro...",,en,47,8,81,0,
6,1553856250613534720,Two cyclists were killed and three others inju...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 21:32:54+00:00,"{'annotations': [{'start': 78, 'end': 92, 'pro...",,en,27,5,46,1,
7,1553847893634809858,Dramatic video captures the moment a portion o...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-07-31 20:59:41+00:00,"{'annotations': [{'start': 48, 'end': 53, 'pro...",,en,31,5,53,3,
8,1553844156761620485,"NEW: George Takei on Nichelle Nichols: ""My hea...",28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-07-31 20:44:50+00:00,"{'annotations': [{'start': 5, 'end': 16, 'prob...",28785486.0,en,115,19,826,3,"[(type, id)]"
9,1553839967796854785,NEW: Former Pres. Barack Obama on Bill Russell...,28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-07-31 20:28:11+00:00,"{'annotations': [{'start': 18, 'end': 29, 'pro...",,en,1303,135,6073,55,


In [48]:
# Number of collected July 2022 tweets per author id, to see which accounts
# actually returned data for the window.
twitter_users_tweets_2022_july_df.groupby(['tweet_author_id'])['tweet_author_id'].count()

tweet_author_id
428333        133
612473       2454
742143       1288
759251       2413
807095       2614
1367531      1790
2467791      2487
2836421      1802
3108351      2709
5392522      1310
5402612        87
5741722       610
7587032      2162
7998482       576
11856032      374
13850422      495
14173315     2565
14434063      381
15012486     2600
15754281     1892
16815644     1137
18767699       78
25067168      958
28785486     2526
69329527      134
270508448      23
384438102     440
Name: tweet_author_id, dtype: int64

In [49]:
# Persist the July 2022 tweets (header row kept, index column dropped).
# NOTE(review): absolute, machine-specific path.
twitter_users_tweets_2022_july_df.to_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_users_tweets_2022_july.csv',
    header=True,
    index=False,
)

#### tweets in 2020 October

In [11]:
# Collect the tracked accounts' tweets for the month before the 2020 US
# presidential election.
# Election day was 2020/11/03; taking the deadlines for early in-person voting
# and mail-in ballots into account, the window ends on 2020/11/01. Both bounds
# are midnight EDT, written as 04:00 UTC.
# NOTE(review): the recorded run fetched a single page per account and the
# per-author check below lists only three authors. The user-timeline endpoint
# is documented to reach only an account's most recent ~3,200 tweets, so
# October 2020 is probably out of reach for high-volume accounts — confirm,
# and consider a full-archive search endpoint if the API access level allows.

start_time = "2020-10-01T04:00:00Z"
end_time = "2020-11-01T04:00:00Z"

users_tweets_2020_oct_lst = []

# Outer loop: one pass per account id.
# Inner loop: request result pages until the response metadata no longer
# carries a next_token; get_users_tweets picks the page to request from the
# notebook-level next_token variable set here.
# A pause after every request spaces out the API calls.
for user_id in user_id_list:
    next_token, count_page, finished = None, 0, False
    while not finished:
        users_tweets = get_users_tweets(user_id, start_time, end_time)
        users_tweets_2020_oct_lst = user_tweets_response_to_lst(users_tweets, users_tweets_2020_oct_lst)
        count_page += 1
        finished = 'next_token' not in users_tweets.meta
        if finished:
            # last page for this account: report how many pages were fetched
            print("user_id: ",user_id,"count result page: ",count_page)
        else:
            next_token = users_tweets.meta["next_token"]
        time.sleep(5)

# print count of result page for each user

user_id:  28785486 count result page:  1
user_id:  16815644 count result page:  1
user_id:  384438102 count result page:  1
user_id:  742143 count result page:  1
user_id:  5402612 count result page:  1
user_id:  612473 count result page:  1
user_id:  69329527 count result page:  1
user_id:  15012486 count result page:  1
user_id:  18767699 count result page:  1
user_id:  270508448 count result page:  1
user_id:  759251 count result page:  1
user_id:  428333 count result page:  1
user_id:  13850422 count result page:  1
user_id:  1367531 count result page:  1
user_id:  16032925 count result page:  1
user_id:  25067168 count result page:  1
user_id:  2836421 count result page:  1
user_id:  14173315 count result page:  1
user_id:  11856032 count result page:  1
user_id:  807095 count result page:  1
user_id:  14434063 count result page:  1
user_id:  5392522 count result page:  1
user_id:  5741722 count result page:  1
user_id:  7587032 count result page:  1
user_id:  15754281 count resul

In [12]:
# One row per collected tweet, columns taken from the record keys; preview ten rows.
twitter_users_tweets_2020_oct_df = pd.DataFrame(data=users_tweets_2020_oct_lst)
twitter_users_tweets_2020_oct_df.head(n=10)

Unnamed: 0,tweet_id,tweet_text,tweet_author_id,tweet_context_annotations,tweet_created_at,tweet_entities,tweet_in_reply_to_user_id,tweet_lang,tweet_retweet_count,tweet_reply_count,tweet_like_count,tweet_quote_count,tweet_referenced_tweets
0,1322661371721515008,"England win Six Nations, after France beat Ire...",5402612,"[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",2020-10-31 22:06:59+00:00,"{'urls': [{'start': 98, 'end': 121, 'url': 'ht...",,en,171,110,1907,62,
1,1322618362212634625,“From Thursday until the start of December you...,5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-31 19:16:05+00:00,"{'urls': [{'start': 237, 'end': 260, 'url': 'h...",,en,4305,2691,9461,4139,
2,1322572709327458306,UK PM Boris Johnson is set to announce lockdow...,5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-31 16:14:40+00:00,"{'urls': [{'start': 111, 'end': 134, 'url': 'h...",,en,1968,925,4911,1120,
3,1322518948714262528,"Obituary: Sir Sean Connery, the award-winning ...",5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-31 12:41:03+00:00,"{'urls': [{'start': 92, 'end': 115, 'url': 'ht...",5402612.0,en,3761,581,14358,1770,"[(type, id)]"
4,1322515886884966401,"Sir Sean Connery, the actor who defined James ...",5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-31 12:28:53+00:00,"{'urls': [{'start': 69, 'end': 92, 'url': 'htt...",,en,49839,10468,145399,54577,
5,1322208071616241667,RT @BBCSport: World Cup-winning England midfie...,5402612,"[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",2020-10-30 16:05:44+00:00,"{'urls': [{'start': 89, 'end': 112, 'url': 'ht...",,en,349,0,0,0,"[(type, id)]"
6,1322152190274883584,Earthquake with a magnitude of 7.0 shakes Gree...,5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-30 12:23:41+00:00,"{'urls': [{'start': 119, 'end': 142, 'url': 'h...",,en,1669,268,3523,460,
7,1321987913253572613,New Zealand votes to legalise euthanasia for t...,5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-30 01:30:54+00:00,"{'urls': [{'start': 104, 'end': 127, 'url': 'h...",,en,1587,596,14334,1152,
8,1321877972756955137,West Yorkshire to be placed under strictest le...,5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-29 18:14:02+00:00,"{'urls': [{'start': 84, 'end': 107, 'url': 'ht...",,en,293,202,807,355,
9,1321803761413664768,Labour Party suspends former leader Jeremy Cor...,5402612,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2020-10-29 13:19:09+00:00,"{'urls': [{'start': 119, 'end': 142, 'url': 'h...",,en,1191,930,4684,1718,


In [13]:
# Number of collected October 2020 tweets per author id. The recorded output
# lists only three authors, i.e. most accounts returned nothing for this window.
twitter_users_tweets_2020_oct_df.groupby(['tweet_author_id'])['tweet_author_id'].count()

tweet_author_id
5402612      52
18767699     52
270508448    55
Name: tweet_author_id, dtype: int64

In [14]:
# Persist the October 2020 tweets (header row kept, index column dropped).
# NOTE(review): absolute, machine-specific path.
twitter_users_tweets_2020_oct_df.to_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_users_tweets_2020_oct.csv',
    header=True,
    index=False,
)

#### tweets in 2022 June

In [17]:
# Collect every tweet the tracked accounts posted in June 2022 (window given in UTC).

start_time = "2022-06-01T00:00:00Z"
end_time = "2022-07-01T00:00:00Z"

users_tweets_2022_June_lst = []

# Outer loop: one pass per account id.
# Inner loop: request result pages until the response metadata no longer
# carries a next_token; get_users_tweets picks the page to request from the
# notebook-level next_token variable set here.
# A pause after every request spaces out the API calls.
for user_id in user_id_list:
    next_token, count_page, finished = None, 0, False
    while not finished:
        users_tweets = get_users_tweets(user_id, start_time, end_time)
        users_tweets_2022_June_lst = user_tweets_response_to_lst(users_tweets, users_tweets_2022_June_lst)
        count_page += 1
        finished = 'next_token' not in users_tweets.meta
        if finished:
            # last page for this account: report how many pages were fetched
            print("user_id: ",user_id,"count result page: ",count_page)
        else:
            next_token = users_tweets.meta["next_token"]
        time.sleep(5)

# print count of result page for each user

user_id:  28785486 count result page:  2
user_id:  16815644 count result page:  15
user_id:  384438102 count result page:  9
user_id:  742143 count result page:  14
user_id:  5402612 count result page:  1
user_id:  612473 count result page:  4
user_id:  69329527 count result page:  2
user_id:  15012486 count result page:  1
user_id:  18767699 count result page:  1
user_id:  270508448 count result page:  1
user_id:  759251 count result page:  3
user_id:  428333 count result page:  2
user_id:  13850422 count result page:  8
user_id:  1367531 count result page:  1
user_id:  16032925 count result page:  1
user_id:  25067168 count result page:  12
user_id:  2836421 count result page:  11
user_id:  14173315 count result page:  2
user_id:  11856032 count result page:  5
user_id:  807095 count result page:  1
user_id:  14434063 count result page:  5
user_id:  5392522 count result page:  16
user_id:  5741722 count result page:  11
user_id:  7587032 count result page:  1
user_id:  15754281 count

In [18]:
# One row per collected tweet, columns taken from the record keys; preview ten rows.
twitter_users_tweets_2022_June_df = pd.DataFrame(data=users_tweets_2022_June_lst)
twitter_users_tweets_2022_June_df.head(n=10)

Unnamed: 0,tweet_id,tweet_text,tweet_author_id,tweet_context_annotations,tweet_created_at,tweet_entities,tweet_in_reply_to_user_id,tweet_lang,tweet_retweet_count,tweet_reply_count,tweet_like_count,tweet_quote_count,tweet_referenced_tweets
0,1542653639482920961,The Supreme Court announced it will hear a cas...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 23:37:43+00:00,"{'annotations': [{'start': 4, 'end': 16, 'prob...",,en,46,41,78,13,
1,1542647056254345221,The FDA says it has advised COVID-19 vaccine c...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 23:11:33+00:00,"{'annotations': [{'start': 4, 'end': 6, 'proba...",,en,30,28,93,3,
2,1542641335378874368,Tear gas or pepper spray was used on patrons a...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 22:48:49+00:00,"{'annotations': [{'start': 71, 'end': 82, 'pro...",,en,22,6,29,1,
3,1542635504700497921,"RT @ABCPolitics: NEW: In a statement, Pres. Bi...",28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-06-30 22:25:39+00:00,"{'annotations': [{'start': 44, 'end': 48, 'pro...",,en,32,0,0,0,"[(type, id)]"
4,1542634836518735872,OFF YOU GO: Alligator released into the river ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 22:23:00+00:00,"{'annotations': [{'start': 106, 'end': 112, 'p...",,en,22,14,94,5,
5,1542633166896259074,"Ruja Ignatova, the so-called Cryptoqueen, has ...",28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 22:16:22+00:00,"{'annotations': [{'start': 0, 'end': 12, 'prob...",,en,31,6,43,9,
6,1542626059232976898,Eli Lilly has announced the U.S. government is...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 21:48:07+00:00,"{'annotations': [{'start': 0, 'end': 8, 'proba...",,en,13,10,50,1,
7,1542620228865216512,"""She's got a lot of money and she hit the road...",28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 21:24:57+00:00,"{'urls': [{'start': 62, 'end': 85, 'url': 'htt...",,en,87,30,172,9,
8,1542613564862627843,"Stocks fell again on Wall Street, closing out ...",28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 20:58:28+00:00,"{'annotations': [{'start': 21, 'end': 31, 'pro...",,en,27,19,37,8,
9,1542606539600560136,Toxic toads are returning to South Florida in ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-06-30 20:30:33+00:00,"{'annotations': [{'start': 29, 'end': 41, 'pro...",,en,19,11,40,4,


In [19]:
# Number of collected June 2022 tweets per author id, to see which accounts
# actually returned data for the window.
twitter_users_tweets_2022_June_df.groupby(['tweet_author_id'])['tweet_author_id'].count()

tweet_author_id
428333        130
612473        364
742143       1358
759251        290
2836421      1056
5392522      1578
5402612        61
5741722      1043
7998482       812
11856032      428
13850422      737
14173315      107
14434063      469
15754281      966
16032925       23
16815644     1415
18767699       80
25067168     1138
28785486      197
69329527      136
270508448      65
384438102     894
Name: tweet_author_id, dtype: int64

In [20]:
# Number of distinct authors with at least one tweet in the June 2022 collection.
twitter_users_tweets_2022_June_df['tweet_author_id'].nunique()

22

In [21]:
# Persist the June 2022 tweets (header row kept, index column dropped).
# NOTE(review): absolute, machine-specific path.
twitter_users_tweets_2022_June_df.to_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_users_tweets_2022_June.csv',
    header=True,
    index=False,
)

In [None]:
#### tweets from 2022 July 1st to August 10th

In [18]:
# Collect every tweet the tracked accounts posted from 2022 July 1st through
# August 10th (the end bound is midnight UTC on August 11th).
# NOTE(review): in the recorded run many accounts stop at 32-33 pages of 100
# tweets, which matches the documented ~3,200-tweet reach of the user-timeline
# endpoint; the earliest part of the window is probably truncated for those
# accounts — confirm.

start_time = "2022-07-01T00:00:00Z"
end_time = "2022-08-11T00:00:00Z"

users_tweets_2022_july_august_lst = []

# Outer loop: one pass per account id.
# Inner loop: request result pages until the response metadata no longer
# carries a next_token; get_users_tweets picks the page to request from the
# notebook-level next_token variable set here.
# A pause after every request spaces out the API calls.
for user_id in user_id_list:
    next_token, count_page, finished = None, 0, False
    while not finished:
        users_tweets = get_users_tweets(user_id, start_time, end_time)
        users_tweets_2022_july_august_lst = user_tweets_response_to_lst(users_tweets, users_tweets_2022_july_august_lst)
        count_page += 1
        finished = 'next_token' not in users_tweets.meta
        if finished:
            # last page for this account: report how many pages were fetched
            print("user_id: ",user_id,"count result page: ",count_page)
        else:
            next_token = users_tweets.meta["next_token"]
        time.sleep(2)

# print count of result page for each user

user_id:  28785486 count result page:  33
user_id:  384438102 count result page:  7
user_id:  16815644 count result page:  16
user_id:  5402612 count result page:  2
user_id:  612473 count result page:  32
user_id:  69329527 count result page:  2
user_id:  742143 count result page:  18
user_id:  564111558 count result page:  33
user_id:  34713362 count result page:  25
user_id:  15012486 count result page:  33
user_id:  270508448 count result page:  1
user_id:  18767699 count result page:  2
user_id:  759251 count result page:  33
user_id:  428333 count result page:  3
user_id:  13850422 count result page:  8
user_id:  380285402 count result page:  33
user_id:  1215486846798569473 count result page:  7
user_id:  4081106480 count result page:  33
user_id:  16935292 count result page:  10
user_id:  91478624 count result page:  32
user_id:  1367531 count result page:  32
user_id:  16032925 count result page:  1
user_id:  14511951 count result page:  28
user_id:  15458694 count result page

In [19]:
# One row per collected tweet, columns taken from the record keys; preview ten rows.
twitter_users_tweets_2022_july_august_df = pd.DataFrame(data=users_tweets_2022_july_august_lst)
twitter_users_tweets_2022_july_august_df.head(n=10)

Unnamed: 0,tweet_id,tweet_text,tweet_author_id,tweet_context_annotations,tweet_created_at,tweet_entities,tweet_in_reply_to_user_id,tweet_lang,tweet_retweet_count,tweet_reply_count,tweet_like_count,tweet_quote_count,tweet_referenced_tweets
0,1557511874228457472,In the month since Roe v. Wade was overturned ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 23:39:02+00:00,"{'urls': [{'start': 129, 'end': 152, 'url': 'h...",,en,26,16,57,5,
1,1557503446915465218,Supporters and critics of Donald Trump continu...,28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-08-10 23:05:33+00:00,"{'urls': [{'start': 121, 'end': 144, 'url': 'h...",,en,15,92,45,5,
2,1557497303631527936,FBI Director Christopher Wray declines to comm...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 22:41:08+00:00,"{'urls': [{'start': 115, 'end': 138, 'url': 'h...",,en,50,65,157,5,
3,1557494976652709889,At least three people have died after a house ...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 22:31:53+00:00,"{'urls': [{'start': 91, 'end': 114, 'url': 'ht...",,en,31,13,58,2,
4,1557493778667212800,"Before signing the bill into law, Pres. Biden ...",28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-08-10 22:27:08+00:00,"{'urls': [{'start': 215, 'end': 238, 'url': 'h...",,en,945,133,3404,137,
5,1557491088302252037,Former Pres. Trump’s false allegation that the...,28785486,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-08-10 22:16:26+00:00,"{'urls': [{'start': 257, 'end': 280, 'url': 'h...",,en,14,70,40,3,
6,1557485458308882434,After several weeks of steady increases in cor...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 21:54:04+00:00,"{'urls': [{'start': 176, 'end': 199, 'url': 'h...",,en,22,18,68,4,
7,1557479990471655428,A Kansas man convicted of performing illegal a...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 21:32:20+00:00,"{'urls': [{'start': 149, 'end': 172, 'url': 'h...",,en,35,22,108,5,
8,1557474898473353217,Following a barrage of online allegations of r...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 21:12:06+00:00,"{'urls': [{'start': 207, 'end': 230, 'url': 'h...",,en,23,20,53,3,
9,1557473199943720960,Muslim communities are reeling after the fourt...,28785486,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-08-10 21:05:21+00:00,"{'urls': [{'start': 131, 'end': 154, 'url': 'h...",,en,10,31,61,0,


In [20]:
# Sanity check: how many collected tweets belong to each author id.
(
    twitter_users_tweets_2022_july_august_df
    .groupby(by=['tweet_author_id'])['tweet_author_id']
    .count()
)

tweet_author_id
428333                  201
612473                 3141
742143                 1726
759251                 3218
788524                 3201
807095                 3217
1367531                3145
1652541                2400
1917731                3133
2467791                3202
2836421                2390
2884771                2239
3108351                3210
5392522                1722
5402612                 110
5741722                 885
5988062                3218
7587032                3158
7998482                 840
9300262                 956
10774652               1684
11856032                532
13850422                733
14173315               3211
14293310               2410
14434063                477
14511951               2726
14662354               3204
14677919               1623
15012486               3223
15164565               3211
15458694                902
15754281               2525
16012783               3207
16180004                446
1681

In [26]:
# Earliest tweet_created_at per author id, i.e. how far back the
# collected tweets reach for each account.
(
    twitter_users_tweets_2022_july_august_df
    .groupby(by=['tweet_author_id'])['tweet_created_at']
    .min()
)

tweet_author_id
428333                2022-07-01 09:35:24+00:00
612473                2022-07-01 00:09:18+00:00
742143                2022-07-01 00:13:12+00:00
759251                2022-07-01 10:58:24+00:00
788524                2022-07-15 20:49:00+00:00
807095                2022-07-11 12:10:02+00:00
1367531               2022-07-28 20:00:18+00:00
1652541               2022-08-05 11:25:20+00:00
1917731               2022-08-01 16:21:19+00:00
2467791               2022-07-13 20:33:06+00:00
2836421               2022-07-01 00:01:05+00:00
2884771               2022-07-01 00:53:36+00:00
3108351               2022-07-07 04:45:04+00:00
5392522               2022-07-01 00:08:33+00:00
5402612               2022-07-01 11:15:44+00:00
5741722               2022-07-01 00:04:34+00:00
5988062               2022-07-08 02:12:46+00:00
7587032               2022-07-23 07:24:43+00:00
7998482               2022-07-01 07:09:04+00:00
9300262               2022-07-01 00:14:14+00:00
10774652              20

In [21]:
# Number of distinct author ids present in the collected tweets.
twitter_users_tweets_2022_july_august_df.loc[:, 'tweet_author_id'].nunique()

60

In [22]:
# Persist the collected tweets as a CSV file: column names are written
# as the header row and the DataFrame's row index is left out.
twitter_users_tweets_2022_july_august_df.to_csv(
    r'/Users/katherina/data_science_tweets_neutrality/twitter_users_tweets_2022_july_august.csv',
    header=True,
    index=False,
)

In [25]:
# users_tweets.meta["next_token"]

In [14]:
# tweet_details = [[tweet.geo, tweet.text, tweet.user.screen_name, tweet.user.location] for tweet in tweets]
# tweet_df = pd.DataFrame(data=tweet_details, columns=["geo","text","user","location"])
# tweet_df.head()