In [26]:
import os
import csv
import json
import re

import twarc
import pandas as pd

In [3]:
# Relative directories: raw input files are read from the first, generated
# output is written under the second.
DATA_RAW_TWEETS_PATH       = '../data/raw'
DATA_PROCESSED_TWEETS_PATH = '../data/processed'

# Twitter API credentials come from environment variables so they are never
# stored in the notebook. A missing variable raises KeyError in this cell.
TWITTER_CONSUMER_KEY        = os.environ['CONSUMER_KEY']
TWITTER_CONSUMER_SECRET     = os.environ['CONSUMER_SECRET']
TWITTER_ACCESS_TOKEN        = os.environ['ACCESS_TOKEN']
TWITTER_ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']

# Aggregate Raw Data

We first aggregate the raw data, which contains filtered Tweet IDs and processed sentiment values, into just a list of Tweet IDs.

In [23]:
TWEET_IDS_FILENAME = 'all_tweet_ids.csv'

# Single source of truth for the aggregate file location.
tweet_ids_path = os.path.join(DATA_PROCESSED_TWEETS_PATH, TWEET_IDS_FILENAME)

# makedirs(exist_ok=True) also creates missing parent directories and does
# nothing when the directory is already there.
os.makedirs(DATA_PROCESSED_TWEETS_PATH, exist_ok=True)

# Remove any previous aggregate up front so a failed run cannot leave an
# outdated file behind.
if os.path.isfile(tweet_ids_path):
    os.remove(tweet_ids_path)

df_files = []

# os.listdir() returns entries in arbitrary order; sort for a reproducible
# read order (the final result is sorted by Tweet ID regardless).
for filename in sorted(os.listdir(DATA_RAW_TWEETS_PATH)):
    filepath = os.path.join(DATA_RAW_TWEETS_PATH, filename)

    # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints,
    # .DS_Store), which read_csv cannot parse as raw data files.
    if filename.startswith('.') or not os.path.isfile(filepath):
        continue

    # Raw files carry no header row: column 0 is the Tweet ID, column 1 the
    # sentiment value.
    df = pd.read_csv(filepath, names=['tweet_id', 'sentiment_value'], header=None)
    df_files.append(df)

# Keep only the Tweet ID column, sorted ascending, and persist it.
df_all_tweet_ids = pd.concat(df_files, axis=0, ignore_index=True)['tweet_id']
df_all_tweet_ids = df_all_tweet_ids.sort_values(ascending=True)
df_all_tweet_ids.to_csv(tweet_ids_path, index=False)

Number of Tweets

In [24]:
# Series.count() reports the number of non-null Tweet IDs.
df_all_tweet_ids.count()

241481

# Hydrate Tweets
Hydrate all the aggregated Tweet IDs from the previous step.

In [10]:
# Twarc client authenticated with the credentials loaded from the environment.
twarc_client = twarc.Twarc(
    TWITTER_CONSUMER_KEY,
    TWITTER_CONSUMER_SECRET,
    TWITTER_ACCESS_TOKEN,
    TWITTER_ACCESS_TOKEN_SECRET,
)

In [47]:
DATA_FULL_TWEETS_DIRECTORY   = 'full-tweets'
DATA_FULL_TWEETS_FILE_PREFIX = 'tweets-'
# Batch files are numbered by position (file i holds IDs i*N .. (i+1)*N - 1),
# so changing this between runs would misalign already-written files.
NUM_TWEETS_PER_FILE          = 1000 # Do not change

def resolve_cache(path):
    """Return the highest batch number among the hydrated-tweet files in `path`.

    A file counts only if its whole name is
    ``<DATA_FULL_TWEETS_FILE_PREFIX><digits>.jsonl``; names with anything
    after the extension (e.g. ``tweets-3.jsonl.bak``) are ignored.

    Args:
        path: Directory to scan.

    Returns:
        The largest batch number found as an int, or -1 when the directory
        contains no matching file.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``path`` does not exist).
    """
    # fullmatch anchors both ends, so stray suffixes cannot be mistaken for
    # cache files.
    pattern = re.compile(rf'{re.escape(DATA_FULL_TWEETS_FILE_PREFIX)}(\d+)\.jsonl')
    matches = (pattern.fullmatch(filename) for filename in os.listdir(path))

    return max((int(m.group(1)) for m in matches if m), default=-1)

In [48]:
# Directory that receives one JSON-lines file per batch of hydrated Tweets.
full_tweets_dir = os.path.join(DATA_PROCESSED_TWEETS_PATH, DATA_FULL_TWEETS_DIRECTORY)

# makedirs(exist_ok=True) also creates missing parents and is a no-op when
# the directory already exists.
os.makedirs(full_tweets_dir, exist_ok=True)

latest_num = resolve_cache(full_tweets_dir)

if latest_num > -1:
    # The highest-numbered file may be incomplete if a previous run was
    # interrupted, so drop it and hydrate that batch again.
    os.remove(os.path.join(full_tweets_dir, DATA_FULL_TWEETS_FILE_PREFIX + str(latest_num) + '.jsonl'))
else:
    latest_num = 0

total_num_tweets = df_all_tweet_ids.count()

# Ceiling division: an exact multiple of the batch size must not produce a
# trailing empty batch (an empty file plus a pointless hydrate call).
num_batches = (total_num_tweets + NUM_TWEETS_PER_FILE - 1) // NUM_TWEETS_PER_FILE

for i in range(latest_num, num_batches):
    # .iloc makes the positional slice explicit; the Series index is not in
    # order after sort_values().
    ids = df_all_tweet_ids.iloc[i*NUM_TWEETS_PER_FILE:(i+1)*NUM_TWEETS_PER_FILE].tolist()
    batch_path = os.path.join(full_tweets_dir, DATA_FULL_TWEETS_FILE_PREFIX + str(i) + '.jsonl')

    with open(batch_path, 'w') as file:
        # One JSON document per line (JSON Lines).
        for tweet in twarc_client.hydrate(ids):
            # json.dump writes straight to the file and returns None.
            json.dump(tweet, file)
            file.write('\n')