# Tweet Scraping

### Pre-Processing

Imports

In [9]:
import pandas as pd
from tqdm import tqdm
import time

Load the reply sentiments into a dataframe. Each row corresponds to one reply and contains the ID of its parent tweet together with the reply's sentiment label (positive, negative, or neutral).

In [10]:
# Read the reply-level sentiment labels from disk; the path is relative to
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/replies.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,label,id
0,positive,1223759356631625729
1,negative,1223752255846912000
2,neutral,1223752255846912000
3,negative,1223752255846912000
4,negative,1223752255846912000


Get the unique parent tweet IDs (many replies share the same parent tweet)

In [11]:
# Reduce the reply-level rows to the distinct parent tweet IDs, kept in
# order of first appearance.
tweet_ids = pd.unique(df['id'])

# Report how many replies collapse into how many distinct tweets.
print(
    f'Total replies: {len(df)}\nTotal tweets: {len(tweet_ids)}'
)

Total replies: 1519504
Total tweets: 34521


### Fetch Tweets from Twitter API

Imports

In [12]:
import os
import tweepy
from dotenv import load_dotenv

Authentication

In [13]:
# Credentials are read from environment variables rather than hardcoded.
# load_dotenv() is expected to populate them from a local .env file
# (python-dotenv) -- confirm the file exists before running.
load_dotenv()

# Each value is None when the corresponding variable is not set.
consumer_key, consumer_secret, access_token_key, access_token_secret = (
    os.environ.get(variable)
    for variable in (
        "CONSUMER_KEY",
        "CONSUMER_SECRET",
        "ACCESS_TOKEN_KEY",
        "ACCESS_TOKEN_SECRET",
    )
)

# Build the OAuth handler from the app credentials, attach the user access
# token, and create the API client used by the cells below.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token_key, access_token_secret)
api = tweepy.API(auth)

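Fetch the full text of every unique tweet and append it to `../data/tweets.csv`. Errors and rate-limit pauses are logged to `progress.txt`.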

In [14]:
# Imported here so this cell stays runnable on its own; csv is needed to
# quote tweet text that contains commas or double quotes.
import csv

# How long to pause after the API reports that the rate limit was exceeded.
RATE_LIMIT_WAIT_SECONDS = 15 * 60
# Pause between successful requests (1 tweet / second).
REQUEST_DELAY_SECONDS = 1

for i, tweet_id in enumerate(tqdm(tweet_ids)):
    # None means "not fetched yet". The inner loop retries the SAME tweet
    # after a rate-limit pause, so a rate-limited tweet is neither dropped
    # nor written with the previous iteration's text.
    tweet = None
    while tweet is None:
        try:
            status = api.get_status(tweet_id, tweet_mode='extended')
            # Flatten newlines so each tweet stays on one line of the CSV
            tweet = status.full_text.replace('\n', ' ')

        # Rate limit exceeded: log, wait, then retry this tweet
        except tweepy.TooManyRequests:
            with open('progress.txt', 'a') as f:
                f.write(f'Rate Limit Exceeded. Stopped at tweet {i} with ID {tweet_id}.\n')
            time.sleep(RATE_LIMIT_WAIT_SECONDS)

        # Other errors (deleted tweet, suspended account, ...): log and skip
        except Exception as e:
            message = str(e).replace('\n', ' ')
            with open('progress.txt', 'a') as f:
                f.write(f'Error ({message}) at tweet {i} with ID {tweet_id}.\n')
            break

    # The fetch failed with a non-rate-limit error; nothing to write
    if tweet is None:
        continue

    # Append to CSV. csv.writer quotes fields containing commas, quotes or
    # carriage returns; newline='' is required by the csv module, and the
    # explicit encoding avoids platform-dependent defaults for emoji and
    # other non-ASCII text.
    with open('../data/tweets.csv', 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([tweet_id, tweet])

    # Comply with Twitter API rate limit
    # 1 tweet / second
    time.sleep(REQUEST_DELAY_SECONDS)

100%|██████████| 34521/34521 [13:03:29<00:00,  1.36s/it]     
