# Setup

With the required packages installed, we import the modules we need for accessing files, timing, and handling data.

In [None]:
# Install any of the following that you don't already have installed.
import os.path
import datetime
import time
import pandas as pd
from os import path
from IPython.display import clear_output

Then we import the `twarc` library, set our keys, and instantiate our `Twarc` object. Fill in the blank strings with your actual keys from the Twitter developer website.

In [None]:
from twarc import Twarc

# Twitter API credentials. Replace each empty string with the matching value
# from your Twitter developer account before running the cells below.
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

# Build the Twarc client used for hydration. The later cells read each
# Tweet's `full_text` field, which is why "extended" tweet mode is requested.
t = Twarc(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
    tweet_mode="extended",
)

Then we load the IDs of the dehydrated tweets that we want to rehydrate. We'll also look at the set of Tweets that have already been hydrated to see where we have to pick up from. Check the output to see how many Tweets have been hydrated so far.

In [None]:
# IDs we have been asked to rehydrate, taken from the "id" column of id_chunk.csv.
dehydrated_ids = list(pd.read_csv("id_chunk.csv")["id"])

# Defaults for a first run, when nothing has been hydrated yet.
already_hydrated_ids = []
hydrated_df = pd.DataFrame(columns=["id_str", "rt", "full_text", "created_at", "user_id", "lang", "location", "country_code", "place_name"])

# On a rerun, pick up the results saved by a previous session.
if path.exists("hydrated_df.csv"):
    hydrated_df = pd.read_csv("hydrated_df.csv")
    # The saved file stores the Tweet ID under "id_str" (see the column list
    # above); there is no "id" column unless the file came from an older
    # version of this notebook, so only fall back to "id" in that case.
    id_column = "id_str" if "id_str" in hydrated_df.columns else "id"
    # NOTE(review): this relies on pandas parsing both ID columns as the same
    # type (normally integers) so the set difference computed later matches
    # them up - confirm if your CSVs contain non-numeric IDs.
    already_hydrated_ids = list(hydrated_df[id_column])
    print(f"Hydrated {len(already_hydrated_ids)} tweets so far")

Then, we print the total percentage of IDs that we've hydrated. This will not track one-to-one with the number of Tweets in the resulting dataset, since some Tweets may have been deleted or made private since they were collected.

In [None]:
# Everything requested that has not been hydrated in a previous session.
ids_to_hydrate = list(set(dehydrated_ids) - set(already_hydrated_ids))

# Share of IDs already hydrated. Guard against an empty ID file with no
# previous results, which would otherwise divide by zero.
total_ids = len(already_hydrated_ids) + len(ids_to_hydrate)
percent_hydrated = round(100 * len(already_hydrated_ids) / total_ids, 3) if total_ids else 0.0
print(f"{percent_hydrated}% of total IDs hydrated")

# Hydration

Now we'll hydrate our tweetset and collect our desired information. We write to our CSVs in real time since practically this script is going to be closed and rerun multiple times. The following chunk is a helper function that will save our interim extracted Tweets along with their data.

In [None]:
def save_new_tweets(filename: str, existing_data: pd.DataFrame, new_data: dict) -> pd.DataFrame:
    ''' Append a batch of new rows to an existing dataset and save the result.

    Parameters:
        filename: path of the CSV file to (over)write with the combined data.
        existing_data: rows collected so far. This DataFrame itself is NOT
            modified; use the return value to carry the combined data forward.
        new_data: dictionary mapping column names to lists of new values.
            After saving, every list in it is replaced by an empty list so
            the dictionary can be reused for the next batch.

    Returns:
        The combined DataFrame (existing rows followed by the new rows),
        exactly as it was written to `filename`.
    '''
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([existing_data, pd.DataFrame(new_data)], ignore_index=True)
    combined.to_csv(filename, index=False)
    # Empty the batch in place so the caller's dictionary starts fresh.
    for key in new_data:
        new_data[key] = []
    return combined

And this helper function will assign the values from the Tweet object to a running dictionary containing our new values.

In [None]:
def assign_attributes(tweet, attributes_to_assign, existing_dict):
    ''' Copy the desired attributes of one Tweet into a running dictionary.

    Parameters:
        tweet: a Tweet object (dictionary) as yielded by Twarc.
        attributes_to_assign: iterable of output column names to extract.
        existing_dict: dictionary mapping each of those column names to a
            list; one value per attribute is appended to the matching list.

    Attributes listed in the module-level `user_attributes` are read from the
    nested user object and those in `place_attributes` from the nested place
    object. If the object to read from is None (most likely the place), None
    is appended instead.
    '''
    # Work out whether this is a Retweet from the Tweet itself instead of
    # relying on a module-level `is_rt` that only exists inside the main loop.
    is_rt = "retweeted_status" in tweet
    for attribute in attributes_to_assign:
        source = tweet
        key = attribute
        # The full text of a Retweet lives on the original (retweeted) Tweet.
        if attribute == "full_text" and is_rt:
            source = tweet["retweeted_status"]
        if attribute in user_attributes:
            # User attributes are nested in the user object; "user_id" is our
            # own, less ambiguous name for the user's "id".
            source = tweet["user"]
            if attribute == "user_id":
                key = "id"
        elif attribute in place_attributes:
            # Likewise for the place object, where "place_name" maps to "full_name".
            source = tweet["place"]
            if attribute == "place_name":
                key = "full_name"
        # A missing object (most likely place) yields None for the attribute.
        new_value = None if source is None else source[key]
        existing_dict[attribute].append(new_value)

Now we iterate through the IDs that still need to be hydrated and hydrate them.

In [None]:
# First, we initialize the set of attributes we will be extracting from each Tweet
newly_hydrated_data = {
    "id_str": [], # Tweet attribute
    "rt": [], # Tweet attribute
    "full_text": [], # Tweet attribute
    "created_at": [], # Tweet attribute
    "user_id": [], # User attribute
    "lang": [], # User attribute
    "location": [], # User attribute
    "country_code": [], # Place attribute
    "place_name": [], # Place attribute
}

# Also, since they are accessed differently within a Tweet object, we define our user and place attributes.
user_attributes = ["user_id", "lang", "location"]
place_attributes = ["country_code", "place_name"]
# "rt" is not an attribute of the Tweet object itself, so it is filled in separately below.
attributes_to_assign = set(newly_hydrated_data) - {"rt"}

# How many Tweets to collect between saves, and how long to pause after each save.
TWEETS_PER_BATCH = 900
PAUSE_SECONDS = 900


def _checkpoint(saved_so_far, pending):
    ''' Save everything hydrated so far and return the up-to-date dataset.

    save_new_tweets empties `pending` and does not modify `saved_so_far`, so
    the batch is captured first and folded into the returned DataFrame.
    Without this, the next save would overwrite the file without this batch.
    '''
    batch = pd.DataFrame(pending)
    save_new_tweets(filename = "hydrated_df.csv", existing_data = saved_so_far, new_data = pending)
    return pd.concat([saved_so_far, batch], ignore_index = True)


# We set a counter which will tell us when to save and wait for the Twitter API
counter = 0

# Now we iterate through our ids_to_hydrate using Twarc and extract the needed values
for tweet in t.hydrate(ids_to_hydrate):
    counter += 1
    # "Is this a Retweet?" is not a Tweet attribute, so we derive it from the presence of the retweeted Tweet
    is_rt = "retweeted_status" in tweet
    newly_hydrated_data["rt"].append(is_rt)
    assign_attributes(tweet = tweet, attributes_to_assign = attributes_to_assign, existing_dict = newly_hydrated_data)
    if counter % TWEETS_PER_BATCH == 0: # Every 900 Tweets, we stop and save our Tweets
        hydrated_df = _checkpoint(hydrated_df, newly_hydrated_data)
        clear_output()
        print(str(counter) + " new tweets hydrated.")
        # And we wait 900 seconds, intended to comply with the Twitter API's 900 Tweets / 15 minutes rate limit.
        # NOTE(review): twarc may already wait out rate limits on its own - confirm whether this pause is still needed.
        time.sleep(PAUSE_SECONDS)

# If we get out of the loop, we have hydrated all ids_to_hydrate and so we save what remains and print that we have finished
hydrated_df = _checkpoint(hydrated_df, newly_hydrated_data)
print("All tweets saved, you're done!")