# COVID-19 Twitter NLP
### *Ayush Noori, Omar Wahby, Ethan Fang*

In [1]:
# Optional environment sanity checks (left disabled): uncomment to list the
# available conda environments and to locate the Python interpreter on PATH.
# !conda info --envs
# !where python

First, load all requisite libraries.

In [2]:
# base libraries
import numpy as np
import pandas as pd
import configparser
import os

# download files
from datetime import datetime
import wget

# Twitter libraries
import jsonlines, json
from twarc import Twarc
import csv

# base directory
# Captured from the working directory at the moment this cell runs; later cells
# build their data paths relative to it.
# NOTE: later cells call os.chdir(), so re-running this cell after them would
# record a different directory. Run it once, right after starting the kernel.
base_dir = os.getcwd()
print("Base Directory: " + base_dir)

Base Directory: c:\Users\ayush\OneDrive\Academic\College\General\HDAG Fellowship\covid19-nlp\Code


## Read API Keys
Read API keys for Twitter authentication.

In [3]:
# Load the Twitter API credentials from config.ini in the current working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check the result here instead of failing
# later with an unexplained KeyError.
if not config.read("config.ini"):
    raise FileNotFoundError("Could not read config.ini in " + os.getcwd())
if not config.has_section("twitter"):
    raise KeyError("config.ini is missing the [twitter] section")
api_keys = config["twitter"]

# read keys from config file
consumer_key = api_keys["consumer_key"]
consumer_secret = api_keys["consumer_secret"]
access_token = api_keys["access_token"]
access_token_secret = api_keys["access_token_secret"]

# create Twarc object
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# Download Files
First, indicate the date range of the files you would like to hydrate. Remember that `start_date` cannot be before `2020-01-22`.

In [4]:
# Inclusive date range of summary files to fetch (pandas is imported in the first code cell).
start_date = '2021-07-01'
end_date = '2021-07-01'

# One file is counted for every hour (00-23) of every day in the range.
hours_per_day = 24

# Per the notebook instructions, start_date cannot be before 2020-01-22.
earliest_date = '2020-01-22'
if pd.Timestamp(start_date) < pd.Timestamp(earliest_date):
    raise ValueError("start_date cannot be before " + earliest_date)

datelist = pd.date_range(start_date, end_date).tolist()
# days x hours; replaces a nested loop that only incremented a counter
number_files = len(datelist) * hours_per_day

print("You have requested to download a total of {} files.".format(number_files))

You have requested to download a total of 24 files.


In [5]:
# Download the hourly "Summary_Details" CSV files for every day in the range
# into Data/Summary Details.
summary_dir = os.path.join(base_dir, "..//Data//Summary Details")
# create the target folder on a fresh checkout instead of failing in chdir
os.makedirs(summary_dir, exist_ok=True)
# NOTE: the next cell reads the CSVs from the current working directory, so the
# chdir is kept on purpose.
os.chdir(summary_dir)

datelist = pd.date_range(start_date, end_date).tolist()
# local names of the summary files for the requested range
files_list = []
p1 = "https://raw.githubusercontent.com/lopezbec/COVID19_Tweets_Dataset/master/Summary_Details/"
hours = ["{:02d}".format(hour) for hour in range(24)]
for date in datelist:
    day = date.strftime("%Y_%m_%d")
    # the remote folder is named after the year and month, e.g. "2021_07"
    m = day[0:7]
    for h in hours:
        local_name = day + "_" + h + "_Summary_Details.csv"
        file = p1 + m + "/" + local_name
        files_list.append(local_name)
        # Skip files that are already present: wget saves a second copy under a
        # " (1)" suffix, and the loading cell would then read the same rows twice.
        if os.path.exists(local_name):
            continue
        wget.download(file)

# Filter Tweet IDs
Filter Tweets by specific parameters.

In [6]:
# Read every summary CSV found under the current working directory into one DataFrame.
directory = os.getcwd()
frames = []
for root, dirs, files in os.walk(directory):
    for file in files:
        if file.endswith(".csv"):
            # os.walk yields bare file names; join with `root` so that files
            # inside subdirectories resolve correctly.
            frames.append(pd.read_csv(os.path.join(root, file)))

# Concatenate once at the end (concatenating inside the loop re-copies all
# previously loaded rows on every iteration). pd.concat raises on an empty
# list, so fall back to an empty frame when no CSV was found.
summary_details = pd.concat(frames) if frames else pd.DataFrame()
summary_details

Unnamed: 0,Tweet_ID,Language,Geolocation_coordinate,RT,Likes,Retweets,Country,Date Created
0,1410398043661160450,en,NO,YES,0,1,,Thu Jul 01 00:41:12 +0000 2021
1,1410387674620334081,en,NO,YES,0,3430,,Thu Jul 01 00:00:00 +0000 2021
2,1410387675291406340,in,NO,NO,0,0,,Thu Jul 01 00:00:00 +0000 2021
3,1410387675325030400,es,NO,NO,1,1,,Thu Jul 01 00:00:00 +0000 2021
4,1410387675350196226,in,NO,NO,0,0,,Thu Jul 01 00:00:00 +0000 2021
...,...,...,...,...,...,...,...,...
95887,1410750051236810757,in,NO,YES,0,624,,Thu Jul 01 23:59:57 +0000 2021
95888,1410750053589733378,es,NO,YES,0,1,,Thu Jul 01 23:59:57 +0000 2021
95889,1410750056173473802,fr,NO,NO,0,0,,Thu Jul 01 23:59:58 +0000 2021
95890,1410750056760713221,en,NO,YES,0,659,,Thu Jul 01 23:59:58 +0000 2021


Here, we filter to include only tweets that are in English and are NOT retweets (i.e., are original tweets). We could also filter for only tweets with geolocation coordinates.

In [7]:
# Keep only rows whose language is English and that are not retweets.
is_english = summary_details['Language'] == 'en'
is_original = summary_details['RT'] == 'NO'
# Optional extra condition (disabled): require geolocation coordinates with
#   summary_details['Geolocation_coordinate'] == 'YES'
summary_details_filter = summary_details[is_english & is_original]
summary_details_filter



Unnamed: 0,Tweet_ID,Language,Geolocation_coordinate,RT,Likes,Retweets,Country,Date Created
7,1410387675455033348,en,NO,NO,0,0,,Thu Jul 01 00:00:00 +0000 2021
9,1410387675505377291,en,NO,NO,1,0,,Thu Jul 01 00:00:00 +0000 2021
19,1410387675782139905,en,NO,NO,7,3,,Thu Jul 01 00:00:00 +0000 2021
23,1410387676063166469,en,NO,NO,24,8,,Thu Jul 01 00:00:00 +0000 2021
40,1410387677275463681,en,NO,NO,2,0,,Thu Jul 01 00:00:00 +0000 2021
...,...,...,...,...,...,...,...,...
95856,1410749931573301251,en,NO,NO,0,0,,Thu Jul 01 23:59:28 +0000 2021
95865,1410749953526300682,en,NO,NO,0,0,,Thu Jul 01 23:59:34 +0000 2021
95866,1410749962887909379,en,NO,NO,1,0,,Thu Jul 01 23:59:36 +0000 2021
95873,1410749982773153792,en,NO,NO,0,0,,Thu Jul 01 23:59:41 +0000 2021


In [8]:
# Switch to the Data folder and write the filtered tweet IDs there, one per line.
data_dir = os.path.join(base_dir, "..//Data//")
os.chdir(data_dir)
final_tweet_ids_filename = "Filtered Tweet IDs.txt"

# store final IDs
# Plain "w" is enough because the file is only written, and the loop variable is
# named tweet_id so the builtin id() is not shadowed for the rest of the notebook.
with open(final_tweet_ids_filename, "w") as f:
    f.writelines("{}\n".format(tweet_id) for tweet_id in summary_details_filter['Tweet_ID'])

# Hydrate
Now, hydrate the tweets using the selected Tweet IDs.

In [9]:
# text file holding the filtered tweet IDs (one per line) written by the previous cell
final_tweet_ids_filename = "Filtered Tweet IDs.txt"
# CSV file that the final conversion cell writes the hydrated tweets to
output_filename = "Hydrated Tweets.csv"

The time required to run the below chunk will depend on the number of tweets one would like to hydrate.

In [10]:
def _append_tweets(path, tweets):
    """Append every tweet in `tweets` to the JSON Lines file at `path`."""
    with jsonlines.open(path, "a") as writer:
        for pending_tweet in tweets:
            writer.write(pending_tweet)


# stores hydrated tweets here as jsonl objects
# contains one json object per line
output_json_filename = output_filename[:output_filename.index(".")] + ".txt"
with open(final_tweet_ids_filename, "r") as ids_file:
    ids = ids_file.read().split()
hydrated_tweets = []
# a set removes duplicate IDs and makes the "already hydrated" bookkeeping cheap
ids_to_hydrate = set(ids)

# looks at the output file for already hydrated tweets so an interrupted run can resume
if os.path.isfile(output_json_filename):
    with jsonlines.open(output_json_filename, "r") as reader:
        for i in reader.iter(type=dict, skip_invalid=True):
            # these tweets have already been hydrated, so drop them from ids_to_hydrate
            hydrated_tweets.append(i)
            # discard() rather than remove(): the file may contain a tweet twice,
            # or a tweet that is not in the current ID list, and neither case
            # should abort the run with a KeyError.
            ids_to_hydrate.discard(i["id_str"])
print("Total IDs: " + str(len(ids)) + ", IDs to hydrate: " + str(len(ids_to_hydrate)))
print("Hydrated: " + str(len(hydrated_tweets)))

count = len(hydrated_tweets)
start_index = count # the index from where tweets haven't been saved to the output_json_file
# stores hydrated tweets to output_json_file every num_save iterations.
num_save  = 1000

# now, use twarc and start hydrating
for tweet in t.hydrate(ids_to_hydrate):
    hydrated_tweets.append(tweet)
    count += 1
    # if num_save iterations have passed
    if (count % num_save) == 0:
        # NOTE: even if the code stops during IO, only tweets from the current batch are lost;
        # older tweets are preserved because the file is written in append mode.
        # Only tweets from start_index onward are written: earlier ones were
        # already saved in a previous batch or a previous run.
        _append_tweets(output_json_filename, hydrated_tweets[start_index:])
        print("Saved " + str(count) + " hydrated tweets.")
        # now, since everything has been written, reset start_index
        start_index = count
# there might be tweets unwritten after the last batch if the count is not a multiple of num_save
# in that case, just write out the remainder of tweets
if count != start_index:
    print("Here with start_index", start_index)
    _append_tweets(output_json_filename, hydrated_tweets[start_index:])

Total IDs: 540642, IDs to hydrate: 540642
Hydrated: 0
Saved 1000 hydrated tweets.
Saved 2000 hydrated tweets.
Saved 3000 hydrated tweets.
Saved 4000 hydrated tweets.
Saved 5000 hydrated tweets.
Saved 6000 hydrated tweets.
Saved 7000 hydrated tweets.
Saved 8000 hydrated tweets.
Saved 9000 hydrated tweets.
Saved 10000 hydrated tweets.
Saved 11000 hydrated tweets.
Saved 12000 hydrated tweets.
Saved 13000 hydrated tweets.
Saved 14000 hydrated tweets.
Saved 15000 hydrated tweets.
Saved 16000 hydrated tweets.
Saved 17000 hydrated tweets.
Saved 18000 hydrated tweets.
Saved 19000 hydrated tweets.
Saved 20000 hydrated tweets.
Saved 21000 hydrated tweets.
Saved 22000 hydrated tweets.
Saved 23000 hydrated tweets.
Saved 24000 hydrated tweets.
Saved 25000 hydrated tweets.
Saved 26000 hydrated tweets.
Saved 27000 hydrated tweets.
Saved 28000 hydrated tweets.
Saved 29000 hydrated tweets.
Saved 30000 hydrated tweets.
Saved 31000 hydrated tweets.
Saved 32000 hydrated tweets.
Saved 33000 hydrated tweets



Saved 41000 hydrated tweets.
Saved 42000 hydrated tweets.
Saved 43000 hydrated tweets.
Saved 44000 hydrated tweets.
Saved 45000 hydrated tweets.
Saved 46000 hydrated tweets.
Saved 47000 hydrated tweets.
Saved 48000 hydrated tweets.
Saved 49000 hydrated tweets.
Saved 50000 hydrated tweets.
Saved 51000 hydrated tweets.
Saved 52000 hydrated tweets.
Saved 53000 hydrated tweets.
Saved 54000 hydrated tweets.
Saved 55000 hydrated tweets.
Saved 56000 hydrated tweets.
Saved 57000 hydrated tweets.
Saved 58000 hydrated tweets.
Saved 59000 hydrated tweets.
Saved 60000 hydrated tweets.
Saved 61000 hydrated tweets.
Saved 62000 hydrated tweets.
Saved 63000 hydrated tweets.
Saved 64000 hydrated tweets.
Saved 65000 hydrated tweets.
Saved 66000 hydrated tweets.
Saved 67000 hydrated tweets.
Saved 68000 hydrated tweets.
Saved 69000 hydrated tweets.
Saved 70000 hydrated tweets.
Saved 71000 hydrated tweets.
Saved 72000 hydrated tweets.
Saved 73000 hydrated tweets.
Saved 74000 hydrated tweets.
Saved 75000 hy



Saved 126000 hydrated tweets.
Saved 127000 hydrated tweets.
Saved 128000 hydrated tweets.
Saved 129000 hydrated tweets.
Saved 130000 hydrated tweets.
Saved 131000 hydrated tweets.
Saved 132000 hydrated tweets.
Saved 133000 hydrated tweets.
Saved 134000 hydrated tweets.
Saved 135000 hydrated tweets.
Saved 136000 hydrated tweets.
Saved 137000 hydrated tweets.
Saved 138000 hydrated tweets.
Saved 139000 hydrated tweets.
Saved 140000 hydrated tweets.
Saved 141000 hydrated tweets.
Saved 142000 hydrated tweets.
Saved 143000 hydrated tweets.
Saved 144000 hydrated tweets.
Saved 145000 hydrated tweets.
Saved 146000 hydrated tweets.
Saved 147000 hydrated tweets.
Saved 148000 hydrated tweets.
Saved 149000 hydrated tweets.
Saved 150000 hydrated tweets.
Saved 151000 hydrated tweets.
Saved 152000 hydrated tweets.
Saved 153000 hydrated tweets.
Saved 154000 hydrated tweets.
Saved 155000 hydrated tweets.
Saved 156000 hydrated tweets.
Saved 157000 hydrated tweets.
Saved 158000 hydrated tweets.
Saved 1590



Saved 211000 hydrated tweets.
Saved 212000 hydrated tweets.
Saved 213000 hydrated tweets.
Saved 214000 hydrated tweets.
Saved 215000 hydrated tweets.
Saved 216000 hydrated tweets.
Saved 217000 hydrated tweets.
Saved 218000 hydrated tweets.
Saved 219000 hydrated tweets.
Saved 220000 hydrated tweets.
Saved 221000 hydrated tweets.
Saved 222000 hydrated tweets.
Saved 223000 hydrated tweets.
Saved 224000 hydrated tweets.
Saved 225000 hydrated tweets.
Saved 226000 hydrated tweets.
Saved 227000 hydrated tweets.
Saved 228000 hydrated tweets.
Saved 229000 hydrated tweets.
Saved 230000 hydrated tweets.
Saved 231000 hydrated tweets.
Saved 232000 hydrated tweets.
Saved 233000 hydrated tweets.
Saved 234000 hydrated tweets.
Saved 235000 hydrated tweets.
Saved 236000 hydrated tweets.
Saved 237000 hydrated tweets.
Saved 238000 hydrated tweets.
Saved 239000 hydrated tweets.
Saved 240000 hydrated tweets.
Saved 241000 hydrated tweets.
Saved 242000 hydrated tweets.
Saved 243000 hydrated tweets.
Saved 2440



Saved 296000 hydrated tweets.
Saved 297000 hydrated tweets.
Saved 298000 hydrated tweets.
Saved 299000 hydrated tweets.
Saved 300000 hydrated tweets.
Saved 301000 hydrated tweets.
Saved 302000 hydrated tweets.
Saved 303000 hydrated tweets.
Saved 304000 hydrated tweets.
Saved 305000 hydrated tweets.
Saved 306000 hydrated tweets.
Saved 307000 hydrated tweets.
Saved 308000 hydrated tweets.
Saved 309000 hydrated tweets.
Saved 310000 hydrated tweets.
Saved 311000 hydrated tweets.
Saved 312000 hydrated tweets.
Saved 313000 hydrated tweets.
Saved 314000 hydrated tweets.
Saved 315000 hydrated tweets.
Saved 316000 hydrated tweets.
Saved 317000 hydrated tweets.
Saved 318000 hydrated tweets.
Saved 319000 hydrated tweets.
Saved 320000 hydrated tweets.
Saved 321000 hydrated tweets.
Saved 322000 hydrated tweets.
Saved 323000 hydrated tweets.
Saved 324000 hydrated tweets.
Saved 325000 hydrated tweets.
Saved 326000 hydrated tweets.
Saved 327000 hydrated tweets.
Saved 328000 hydrated tweets.
Saved 3290



Saved 382000 hydrated tweets.
Saved 383000 hydrated tweets.
Saved 384000 hydrated tweets.
Saved 385000 hydrated tweets.
Saved 386000 hydrated tweets.
Saved 387000 hydrated tweets.
Saved 388000 hydrated tweets.
Saved 389000 hydrated tweets.
Saved 390000 hydrated tweets.
Saved 391000 hydrated tweets.
Saved 392000 hydrated tweets.
Saved 393000 hydrated tweets.
Saved 394000 hydrated tweets.
Saved 395000 hydrated tweets.
Saved 396000 hydrated tweets.
Saved 397000 hydrated tweets.
Saved 398000 hydrated tweets.
Saved 399000 hydrated tweets.
Saved 400000 hydrated tweets.
Saved 401000 hydrated tweets.
Saved 402000 hydrated tweets.
Saved 403000 hydrated tweets.
Saved 404000 hydrated tweets.
Saved 405000 hydrated tweets.
Saved 406000 hydrated tweets.
Saved 407000 hydrated tweets.
Saved 408000 hydrated tweets.
Saved 409000 hydrated tweets.
Saved 410000 hydrated tweets.
Saved 411000 hydrated tweets.
Saved 412000 hydrated tweets.
Saved 413000 hydrated tweets.
Saved 414000 hydrated tweets.
Saved 4150



Saved 467000 hydrated tweets.
Saved 468000 hydrated tweets.
Saved 469000 hydrated tweets.
Saved 470000 hydrated tweets.
Saved 471000 hydrated tweets.
Saved 472000 hydrated tweets.
Saved 473000 hydrated tweets.
Saved 474000 hydrated tweets.
Saved 475000 hydrated tweets.
Saved 476000 hydrated tweets.
Saved 477000 hydrated tweets.
Saved 478000 hydrated tweets.
Saved 479000 hydrated tweets.
Saved 480000 hydrated tweets.
Saved 481000 hydrated tweets.
Saved 482000 hydrated tweets.
Saved 483000 hydrated tweets.
Saved 484000 hydrated tweets.
Saved 485000 hydrated tweets.
Saved 486000 hydrated tweets.
Saved 487000 hydrated tweets.
Saved 488000 hydrated tweets.
Saved 489000 hydrated tweets.
Saved 490000 hydrated tweets.
Saved 491000 hydrated tweets.
Saved 492000 hydrated tweets.
Saved 493000 hydrated tweets.
Saved 494000 hydrated tweets.
Saved 495000 hydrated tweets.
Saved 496000 hydrated tweets.
Saved 497000 hydrated tweets.
Saved 498000 hydrated tweets.
Saved 499000 hydrated tweets.
Saved 5000

## Convert `JSONL` to `CSV`

In [35]:
# Convert the JSON Lines file of hydrated tweets into a CSV file.
output_json_filename = output_filename[:output_filename.index(".")] + ".txt"

# these are the column names that are selected to be stored in the csv
keyset = ["created_at", "id", "id_str", "full_text", "source", "truncated", "in_reply_to_status_id",
          "in_reply_to_status_id_str", "in_reply_to_user_id", "in_reply_to_user_id_str", 
          "in_reply_to_screen_name", "user", "coordinates", "place", "quoted_status_id",
          "quoted_status_id_str", "is_quote_status", "quoted_status", "retweeted_status", 
          "quote_count", "reply_count", "retweet_count", "favorite_count", "entities", 
          "extended_entities", "favorited", "retweeted", "possibly_sensitive", "filter_level", 
          "lang", "matching_rules", "current_user_retweet", "scopes", "withheld_copyright", 
          "withheld_in_countries", "withheld_scope", "geo", "contributors", "display_text_range",
          "quoted_status_permalink"]
hydrated_tweets = []

# reads the current tweets
with jsonlines.open(output_json_filename, "r") as reader:
    for i in reader.iter(type = dict, skip_invalid = True):
        hydrated_tweets.append(i)

# convert nested dictionary to JSON format for R
# (quoted_status_permalink is intentionally left unconverted, as before)
nested_fields = ("user", "entities", "place", "coordinates")
tweet_n = 0
for tweet in hydrated_tweets:
    for field in nested_fields:
        # guard against tweets that lack one of the fields instead of raising KeyError
        if field in tweet:
            tweet[field] = json.dumps(tweet[field])
    tweet_n = tweet_n + 1
    if (tweet_n % 50000) == 0:
        print("Parsed {} tweets.".format(tweet_n))

# write tweets out
# newline="" is required by the csv module; without it every row is followed by
# a blank line on Windows. extrasaction="ignore" drops tweet keys that are not
# in keyset rather than raising ValueError, since keyset is the column selection.
with open(output_filename, "w", encoding = "utf-8", newline = "") as output_file:
    d = csv.DictWriter(output_file, keyset, extrasaction = "ignore")
    d.writeheader()
    d.writerows(hydrated_tweets)

Parsed 50000 tweets.
Parsed 100000 tweets.
Parsed 150000 tweets.
Parsed 200000 tweets.
Parsed 250000 tweets.
Parsed 300000 tweets.
Parsed 350000 tweets.
Parsed 400000 tweets.
Parsed 450000 tweets.
Parsed 500000 tweets.
