# Tweet Scraping

### Pre-Processing

Imports

In [177]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

Create dataframe of tweets and reply sentiments. Each row contains the parent tweet ID and the sentiment (positive, negative, neutral) for one of the replies.

In [178]:
replies = pd.read_csv('../data/replies.csv')
replies.head()

Unnamed: 0,label,id
0,positive,1223759356631625729
1,negative,1223752255846912000
2,neutral,1223752255846912000
3,negative,1223752255846912000
4,negative,1223752255846912000


Get unique tweets

In [179]:
tweet_ids = replies['id'].unique()
print(f'Total replies: {len(replies)}\nTotal tweets: {len(tweet_ids)}')

Total replies: 1519504
Total tweets: 34521


### Fetch Tweets from Twitter API

Imports

In [180]:
import os
import tweepy
from dotenv import load_dotenv

Authentication

In [181]:
load_dotenv()
consumer_key = os.getenv("CONSUMER_KEY")
consumer_secret = os.getenv("CONSUMER_SECRET")
access_token_key = os.getenv("ACCESS_TOKEN_KEY")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token_key, access_token_secret)
api = tweepy.API(auth)

Fetch all unique tweets

In [14]:
for i in tqdm(range(len(tweet_ids))):
    # Get tweet
    tweet_id = tweet_ids[i]
    try:
        tweet = api.get_status(tweet_id, tweet_mode='extended').full_text.replace('\n', ' ')
    
    # Rate limit exceeded
    except tweepy.TooManyRequests:
        with open('../data/progress.txt', 'a') as f:
            f.write(f'Rate Limit Exceeded. Stopped at tweet {i} with ID {tweet_id}.\n')
        time.sleep(15 * 60)
        continue

    # Other errors
    except Exception as e:
        with open('../data/progress.txt', 'a') as f:
            e = str(e).replace('\n', ' ')
            f.write(f'Error ({e}) at tweet {i} with ID {tweet_id}.\n')
        time.sleep(1)
        continue
    
    # Append to CSV
    with open('../data/tweets.csv', 'a') as f:
        f.write(f'{tweet_id},{tweet}\n')

    # Comply with Twitter API rate limit
    # 1 tweet / second
    time.sleep(1)

100%|██████████| 34521/34521 [13:03:29<00:00,  1.36s/it]     


### Append Sentiment Scores

Read in tweets CSV

In [206]:
# Read CSV and remove duplicate tweets
tweets = pd.read_csv('../data/tweets.csv', sep='^([^,]+),', engine='python', usecols=['id', 'tweet'])
tweets = tweets.drop_duplicates(subset=['tweet'])
tweet_ids = tweets['id'].unique()

Compute average reply sentiment score for each tweet

In [207]:
# Assign sentiment values to labels
scores = {'positive': 1.0, 'negative': -1.0, 'neutral': 0.0}
tweet_sentiments = {}
for tweet_id in tweet_ids:
    tweet_sentiments[tweet_id] = 0.0

# Iterate over tweets
for i in tqdm(range(len(tweet_ids))):
    tweet_id = tweet_ids[i]
    curr = replies[replies['id'] == tweet_id]

    # Compute average sentiment of replies
    ct = 0
    for index, row in curr.iterrows():
        sentiment = row['label']
        tweet_sentiments[tweet_id] += scores[sentiment]
        ct += 1
    tweet_sentiments[tweet_id] /= ct

100%|██████████| 27146/27146 [00:44<00:00, 606.75it/s]


Append scores to tweets CSV

In [208]:
# Append scores
tweet_ids = tweets['id']
sentiments = []
for i in tqdm(range(len(tweet_ids))):
    tweet_id = tweets.iloc[i]['id']
    sentiments.append(tweet_sentiments[tweet_id])
tweets['sentiment'] = sentiments
tweets.to_csv('../data/labeled_tweets.csv', index=False)
tweets.head()

100%|██████████| 27146/27146 [00:00<00:00, 31030.95it/s]


Unnamed: 0,id,tweet,sentiment
0,1223752255846912000,The Fox Corporation (The owners of Fox News) a...,-0.52
1,1223738389003952128,"Folks, you hear about this cornovirus deal in ...",-0.2
2,1223748267609030659,The news is finally out !!!! I will be on @lov...,0.5
3,1223739174160928773,Good news! The person under investigation for ...,-0.25
4,1223737953291128837,"Two avid golfers promised that, whoever died f...",0.166667


Append number of replies to CSV

In [210]:
tweet_ids = tweets['id'].unique()
reply_counts = []
for i in tqdm(range(len(tweet_ids))):
    tweet_id = tweet_ids[i]
    curr = replies[replies['id'] == tweet_id]
    reply_counts.append(len(curr))
tweets['reply_count'] = reply_counts
tweets.to_csv('../data/labeled_tweets.csv', index=False)
tweets.head()

100%|██████████| 27146/27146 [00:27<00:00, 989.25it/s] 


Unnamed: 0,id,tweet,sentiment,reply_count
0,1223752255846912000,The Fox Corporation (The owners of Fox News) a...,-0.52,150
1,1223738389003952128,"Folks, you hear about this cornovirus deal in ...",-0.2,45
2,1223748267609030659,The news is finally out !!!! I will be on @lov...,0.5,6
3,1223739174160928773,Good news! The person under investigation for ...,-0.25,12
4,1223737953291128837,"Two avid golfers promised that, whoever died f...",0.166667,36
