**TODO:**

* Add API logins to the `keys` file, one per line, in this order: consumer key, consumer secret, access token, access token secret

# Imports

In [3]:
# Data-handling imports
import pandas as pd
import tweepy

# API keys
# The `keys` file holds the four Twitter credentials, one per line, in this
# order: consumer key, consumer secret, access token, access token secret.
with open("keys", "r") as file:
    # Strip each line and skip blank ones so that a trailing newline (which
    # most editors add) or Windows line endings neither break the four-way
    # unpacking nor leave stray whitespace inside a credential.
    (
        consumer_key,
        consumer_secret,
        access_token,
        access_token_secret,
    ) = [line.strip() for line in file if line.strip()]

# Options
# First and last day (both inclusive) of the daily datasets to process.
start_date = "2020-03-01"
end_date = "2020-05-09"
# NOTE(review): not used in the cells visible here; presumably a random seed
# for later sampling -- confirm.
seed = 5993

# Log in to Twitter

In [None]:
# Authenticate with the user-context credentials loaded from the `keys` file.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Hydrating whole days of tweet IDs takes far more requests than one
# rate-limit window allows, so let tweepy sleep until the window resets
# instead of raising a rate-limit error part-way through a day.
api = tweepy.API(auth, wait_on_rate_limit=True)

# Parse data

In [None]:
# Read & write each day's data frame separately because it is a LOT of data.
#
# For every day between `start_date` and `end_date` (inclusive) this:
#   1. downloads that day's "clean" tweet-ID list from the panacealab repo,
#   2. looks the IDs up through the Twitter API to get text and user location,
#   3. attaches the posting time taken from the ID list, and
#   4. writes the result to ../data/Tweets-<date>.csv.

# The statuses/lookup endpoint accepts at most 100 tweet IDs per request.
LOOKUP_BATCH_SIZE = 100

for pd_date in pd.date_range(start_date, end_date):
    date = str(pd_date.date())

    # "clean" = doesn't include retweets
    df = pd.read_csv(
        "https://github.com/thepanacealab/covid19_twitter/raw/master/"
        f"dailies/{date}/{date}_clean-dataset.tsv.gz",
        sep="\t")

    # Key everything by tweet ID; dropping repeated IDs keeps the timestamp
    # lookup below unambiguous.
    df = df.drop_duplicates(subset="tweet_id").set_index("tweet_id")

    # Parse dates/times (one timestamp per tweet ID)
    times = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))

    # Look up tweet text & location, one batch of IDs at a time.
    # The lookup silently omits tweets that are no longer available and does
    # not promise to keep the request order, so keep each status's own ID
    # rather than relying on position.
    tweet_ids = df.index.tolist()
    tweets = []
    for start in range(0, len(tweet_ids), LOOKUP_BATCH_SIZE):
        batch = tweet_ids[start:start + LOOKUP_BATCH_SIZE]
        tweets.extend(
            (status.id, status.text, status.user.location)
            for status in api.statuses_lookup(batch))

    # Put together, aligning timestamps to the tweets that were actually
    # returned (matched on tweet ID, not on row position)
    df_hydrated = pd.DataFrame(
        data=tweets,
        columns=["tweet_id", "text", "location"]).set_index("tweet_id")
    df_hydrated["time"] = times.reindex(df_hydrated.index)

    # Write out (the tweet-ID index is only used for alignment and is not
    # written, so the file has the columns text, location, time)
    df_hydrated.to_csv(f"../data/Tweets-{date}.csv", index=False)

    # Delete these to save memory
    del df, df_hydrated

# References

Banda, J. M., Tekumalla, R., Wang, G., Yu, J., Liu, T., Ding, Y., and Chowell, G. May 2020. A large-scale COVID-19 Twitter chatter dataset for open scientific research—an international collaboration, arXiv:[2004.03688](http://arxiv.org/abs/2004.03688), doi:[10.5281/zenodo.3819464](http://doi.org/10.5281/zenodo.3819464)