# Replies
Get as many replies as possible for each top tweet.

Imports

In [160]:
from TwitterAPI import TwitterAPI, TwitterOAuth, TwitterRequestError, TwitterConnectionError, TwitterPager
import pandas as pd
from tqdm import tqdm

from dotenv import load_dotenv
import os
import glob

Authentication

In [161]:
# Read the four Twitter credentials from environment variables, after
# load_dotenv() has had a chance to populate them.
load_dotenv()
consumer_key, consumer_secret, access_token_key, access_token_secret = (
    os.getenv(name)
    for name in ("CONSUMER_KEY", "CONSUMER_SECRET", "ACCESS_TOKEN_KEY", "ACCESS_TOKEN_SECRET")
)

# Shared client used by the cells below; requests go to version 2 of the API.
api = TwitterAPI(
    consumer_key,
    consumer_secret,
    access_token_key,
    access_token_secret,
    api_version='2',
)

Fetch replies (inspired by [Adithya Narayanan](https://towardsdatascience.com/mining-replies-to-tweets-a-walkthrough-9a936602c4d6))

In [162]:
def retrieve_replies(tweet_id, max_replies=31):
    """Fetch recent replies from the conversation started by ``tweet_id``.

    Pages through the ``tweets/search/recent`` endpoint using the query
    ``conversation_id:<tweet_id>`` and stops as soon as ``max_replies``
    items have been collected or the pager runs out of results.

    Errors are printed, not raised: whatever was collected before the error
    is still returned. After a ``TwitterRequestError`` the function sleeps
    for 15 minutes before returning; the failed request is not retried.

    Args:
        tweet_id: ID of the parent tweet whose conversation is searched.
        max_replies: Maximum number of replies to collect (positive integer).
            Defaults to 31, the cap the previous hard-coded check enforced.

    Returns:
        pandas.DataFrame with one row per collected reply. Column ``id``
        holds the reply's ``author_id`` and column ``tweet`` holds the reply
        text. The frame is empty when nothing could be fetched.
    """
    # Imported locally because the notebook only imports `time` in a later cell.
    import time

    # Created before the try block so the return statement always has lists
    # to build from, even if the very first request fails.
    id_list = []
    tweet_list = []

    try:
        pager = TwitterPager(api, 'tweets/search/recent',
                        {
                            'query': f'conversation_id:{tweet_id}',
                            'tweet.fields': 'author_id,conversation_id,created_at,in_reply_to_user_id'
                        })

        for item in pager.get_iterator(wait=5):
            # Read both fields before appending so a missing key cannot leave
            # the two lists with different lengths.
            author_id = item['author_id']
            text = item['text']
            id_list.append(author_id)
            tweet_list.append(text)

            # Check after appending so no extra item (and possibly an extra
            # page request) is pulled once the cap is reached.
            if len(tweet_list) >= max_replies:
                break

    # Treated as a rate-limit hit: report the error, then wait 15 minutes
    # before handing back whatever was collected so far.
    except TwitterRequestError as e:
        print(e.status_code)
        for msg in iter(e):
            print(msg)
        time.sleep(15 * 60)

    except TwitterConnectionError as e:
        print(e)

    # Best-effort by design: one bad tweet should not abort a long crawl.
    except Exception as e:
        print(e)

    return pd.DataFrame({ 'id': id_list, 'tweet': tweet_list })

This function takes a tweet ID and returns a dataframe with each reply's author ID and the reply text (with tabs, newlines, and carriage returns removed)

In [163]:
def get_replies(tweet_id):
    """Return the replies to ``tweet_id`` with tab/newline/carriage-return characters removed.

    Both the escaped two-character sequences (a backslash followed by
    ``t``, ``n`` or ``r``) and the real control characters are replaced
    with an empty string in every column of the frame produced by
    ``retrieve_replies``.
    """
    whitespace_patterns = [r"\\t|\\n|\\r", "\t|\n|\r"]
    substitutes = ["", ""]
    return retrieve_replies(tweet_id).replace(
        to_replace=whitespace_patterns,
        value=substitutes,
        regex=True,
    )

Create folders for each username (do this after fetching the replies – can be done easily using `parent_tweets.csv`)

In [164]:
# usernames = [os.path.basename(f).split('.')[0] for f in glob.glob('../data/tweets/*.csv')]
# for username in usernames:
#     os.makedirs(f'../data/replies/{username}', exist_ok=True)

Iterate over all parent tweets and fetch replies for each one

In [None]:
import time

# Position in `usernames` at which this run starts; earlier entries are
# skipped. Presumably a resume point from an interrupted run -- set to 0 to
# process every user.
START_INDEX = 23

# `usernames` must already be defined (the commented-out cell above shows how
# it is built from ../data/tweets/*.csv).

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('../data/replies', exist_ok=True)

for i in tqdm(range(START_INDEX, len(usernames))):
    username = usernames[i]
    tweets = pd.read_csv(f'../data/tweets/{username}.csv')
    reply_ct = 0
    # Iterate the 'id' column directly: iterrows() builds one Series per row
    # and does not preserve dtypes across it, so integer tweet IDs could be
    # coerced to float and lose precision.
    for tweet_id in tweets['id']:
        replies = get_replies(tweet_id)
        replies.to_csv(f'../data/replies/{tweet_id}.csv', index=False)
        reply_ct += len(replies)

    # Sleep based on the number of replies retrieved for this user
    # Follows rate limit of 450 requests / 15 minutes = 1 request / 2 seconds
    time.sleep(reply_ct * 2)

    # Append one progress line per user so an interrupted run can be resumed
    with open('../data/progress.txt', 'a') as f:
        f.write(f'User: {username}, Progress: {i}/{len(usernames)}, Replies: {reply_ct}\n')