### References

Code copied and modified from: \
https://github.com/gabrielpreda/covid-19-tweets/blob/master/covid-19-tweets.ipynb


Twitter API reference: \
https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/overview/tweet-object \
https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/overview/user-object


Tweepy reference: \
http://docs.tweepy.org/en/latest/api.html \
http://docs.tweepy.org/en/latest/auth_tutorial.html


Tips: \
https://bhaskarvk.github.io/2015/01/how-to-use-twitters-search-rest-api-most-effectively./

### 1. Import packages

In [1]:
import os
import json
import tweepy as tw
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import date, datetime
from IPython.display import clear_output

# Show full cell contents (e.g. complete tweet text) instead of truncating wide columns
pd.set_option("display.max_colwidth", None)

### 2. Configurations

In [2]:
# Search query, passed as `q` to the search call in the collection loop.
# NOTE(review): `-filter:retweets` is presumably the Twitter search operator that excludes retweets — confirm.
search_words = "#Trump OR #Biden -filter:retweets"
# Base name of the tweets CSV; it is joined with `inputFolder` in the next cell.
fname        = "Tweets_US.csv"
# Folder that holds the tweets CSV.
inputFolder  = 'input'

# Today's date as YYYY-MM-DD, passed as `until` to the search call.
date_until   = date.today().strftime("%Y-%m-%d")
# Number of items requested from the cursor on each pass of the collection loop.
tweetsPerQry = 100

# When True, the collection loop stops after its first batch.
DEBUG        = False

In [3]:
# Resolve the CSV path inside the input folder.
# Taking the basename first keeps this cell idempotent: re-running it no longer
# produces 'input/input/...'.
# The folder is created up front so the final export cannot fail on a missing
# directory after a long collection run.
os.makedirs(inputFolder, exist_ok=True)
fname = os.path.join(inputFolder, os.path.basename(fname))

### 3. Twitter API authentication

In [4]:
# Read the Twitter app credentials from the environment instead of hard-coding them.
# Indexing os.environ raises KeyError when a variable is not set, so a missing
# credential fails here rather than later during authentication.
TWITTER_CONSUMER_API_KEY    = os.environ["TWITTER_CONSUMER_API_KEY"]
TWITTER_CONSUMER_API_SECRET = os.environ["TWITTER_CONSUMER_API_SECRET"]

In [5]:
# Application-only authentication is used here; the user-context alternative would be
# tw.OAuthHandler(TWITTER_CONSUMER_API_KEY, TWITTER_CONSUMER_API_SECRET).
auth = tw.AppAuthHandler(TWITTER_CONSUMER_API_KEY, TWITTER_CONSUMER_API_SECRET) #This is faster with a higher rate limit

# wait_on_rate_limit makes tweepy sleep until the rate-limit window resets instead of failing.
api  = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# A constructed tw.API object is always truthy, so the former `if not api` check
# could never take its failure branch; it has been removed.
# NOTE(review): AppAuthHandler appears to request the bearer token when it is constructed
# and to raise on bad credentials — confirm against the installed tweepy version.
print("API authenticated and ready.")

API authenticated and ready.


### 4. Read Past Data (to get since_id)

In [6]:
# Load tweets saved by a previous run (if any) to find where collection should resume,
# and write a dated backup copy of that file before it gets overwritten later on.
if os.path.exists(fname):
    toConcate     = True
    tweets_old_df = pd.read_csv(fname, lineterminator='\n')
    print(f"past tweets: {tweets_old_df.shape}")

    # Resume from the newest stored tweet. Using the maximum id does not depend on the
    # row order of the file, and an empty file simply means "no lower bound".
    since_id = None if tweets_old_df.empty else int(tweets_old_df["id"].max())

    # Archive old tweets -------------------------#
    today        = date.today().strftime("%Y_%m_%d")
    archive_root = os.path.splitext(fname)[0]  # strip only the trailing extension
    tweets_old_df.to_csv(f"{archive_root}_{today}.csv", index=False)
    print(f"{len(tweets_old_df) :,} tweets archived at {str(datetime.now())}")

else:
    toConcate = False
    since_id  = None
    print("No old tweets")


past tweets: (621404, 20)
621,404 tweets archived at 2020-11-02 08:39:58.473429


### 5. Get Tweets

In [7]:
def prettyPrintStatus(status):
    """Print a status object's raw JSON payload, indented and with sorted keys.

    Parameters
    ----------
    status : object
        Any object exposing its payload as a dict on the ``_json`` attribute
        (e.g. a tweepy Status).
    """
    # The payload is already a plain dict, so it can be dumped directly;
    # serialising it, parsing it back and serialising again adds nothing.
    print(json.dumps(status._json, indent=4, sort_keys=True))

In [8]:
def get_tweets_df(tweets):
    """Flatten status objects into a DataFrame with one row per tweet.

    Parameters
    ----------
    tweets : iterable
        Status objects (e.g. tweepy Status fetched with ``tweet_mode="extended"``,
        which is what provides ``full_text``). Each must expose the raw payload
        dict on ``_json``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with user, content and engagement columns.
        An empty input yields an empty DataFrame without columns.
    """
    rows = []
    for tweet in tweets:

        # `extended_entities` only exists on the payload when the tweet carries media,
        # so test the raw dict before touching the attribute.
        if 'extended_entities' in tweet._json:
            extended_media_type = np.unique([media["type"] for media in tweet.extended_entities["media"]])
        else:
            extended_media_type = None

        row = {'id'                    : tweet.id,
               'user_name'             : tweet.user.name,
               'user_location'         : tweet.user.location,
               'user_description'      : tweet.user.description,
               'user_created'          : tweet.user.created_at,
               'user_followers'        : tweet.user.followers_count,
               'user_friends'          : tweet.user.friends_count,
               'user_favourites'       : tweet.user.favourites_count,
               'user_verified'         : tweet.user.verified,
               'coordinates'           : tweet.coordinates,
               'timestamp'             : tweet.created_at,
               'text'                  : tweet.full_text,
               'truncated'             : tweet.truncated,
               'hashtags'              : [hashtag["text"] for hashtag in tweet.entities["hashtags"]],
               'retweet_count'         : tweet.retweet_count,
               'favorite_count'        : tweet.favorite_count, #likes count
               'has_media_type'        : extended_media_type,
               'in_reply_to_status_id' : tweet.in_reply_to_status_id,
               'source'                : tweet.source,
               # `tweet.retweeted` says whether the *authenticating user* retweeted the tweet;
               # a tweet that is itself a retweet is identified by its `retweeted_status`.
               'is_retweet'            : hasattr(tweet, 'retweeted_status'),
             }
        rows.append(row)

    return pd.DataFrame(rows)


In [9]:
%%time

# Page backwards through the search results until the API returns nothing more.
# Each batch becomes its own DataFrame and is collected in a list; the batches are
# concatenated once at the end, which avoids re-copying an ever-growing frame on
# every iteration.
tweet_batches = []

max_id = None
count  = 0

try:
    while True:
        # Collect new tweets (further back in time)--------#
        # NOTE(review): `since_id - 1` presumably re-fetches the newest stored tweet on
        # purpose; the overlap is removed by the drop-duplicates step later on.
        tweets = tw.Cursor(api.search,
                           q          = search_words,
                           tweet_mode = "extended", # needed to get full_text
                           lang       = "en",
                           since_id   = str(since_id - 1) if since_id else None, #lower bound
                           max_id     = str(max_id - 1) if max_id else None,     #upper bound
                           until      = date_until
                          ).items(tweetsPerQry)

        # Materialise the lazy iterator so the batch can be counted and converted
        tweets_copy = list(tweets)

        print(f"new tweets retrieved: {len(tweets_copy)}")
        #--------------------------------------------------#

        #Break if no more tweets returned -----------------#
        if not tweets_copy:
            break

        #Convert to Dataframe -----------------------------#
        new_tweets = get_tweets_df(tweets_copy)
        tweet_batches.append(new_tweets)

        #Set max_id for next iteration --------------------#
        # The last tweet of the batch becomes the (exclusive) upper bound of the next query
        max_id = new_tweets.id.values[-1]

        count += len(tweets_copy)

        print(f"Current date and time: {str(datetime.now())}")
        print(f"Tweets collection count: {count :,}")
        print(f"Latest Tweet timestamp: {new_tweets.timestamp.values[-1]}")

        # Replace the progress lines on the next iteration instead of stacking them
        clear_output(wait=True)

        if DEBUG:
            break
finally:
    # Build the result even if the loop is interrupted or fails part-way, so the
    # batches fetched so far are not lost.
    all_tweets = pd.concat(tweet_batches, axis=0) if tweet_batches else pd.DataFrame()

#-----------------------------------------------------------#
print(f"Total tweets collected: {len(all_tweets) :,}")

new tweets retrieved: 0
Total tweets collected: 23,850
CPU times: user 46.9 s, sys: 1.83 s, total: 48.7 s
Wall time: 1h 1min 55s


In [10]:
# Timestamp of the first row collected in this run
all_tweets["timestamp"].values[0]

numpy.datetime64('2020-11-01T23:59:57.000000000')

### 6. Save the data

#### 6.1. Merge past and present data

In [11]:
# Put the newly collected tweets in front of the previously stored ones.
# ignore_index / reset_index(drop=True) yield a clean 0..n-1 index without adding a
# throwaway 'index' column, and the else-branch now returns a new frame instead of
# an alias of `all_tweets`, so later clean-up steps cannot modify `all_tweets`.
if toConcate:
    tweets_all_df = pd.concat([all_tweets, tweets_old_df], axis=0, ignore_index=True)
else:
    tweets_all_df = all_tweets.reset_index(drop=True)

print(f"new tweets: {all_tweets.shape[0] :,}; all tweets: {tweets_all_df.shape[0] :,}")

new tweets: 23,850; all tweets: 645,254


#### 6.2. Drop duplicates

In [12]:
# Keep the first occurrence of every tweet id and renumber the rows.
# This builds a new frame instead of mutating in place, so the cell gives the same
# result when re-run and never alters a frame that another name still refers to.
tweets_all_df = (
    tweets_all_df
    .drop_duplicates(subset=["id"])
    .reset_index(drop=True)
    # an 'index' / 'level_0' column may have been left behind by an earlier reset_index
    .drop(columns=['index', 'level_0'], errors='ignore')
)

print(f"all tweets: {tweets_all_df.shape}")

all tweets: (645253, 20)


In [13]:
# Display the first row of the merged frame as a quick sanity check.
# NOTE(review): the rendered output includes user profile fields — consider clearing it before sharing.
tweets_all_df.head(1)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,coordinates,timestamp,text,truncated,hashtags,retweet_count,favorite_count,has_media_type,in_reply_to_status_id,source,is_retweet
0,1323052187539353600,Julie Frein,"Headingley, Leeds, UK","BA(Hons) Art Vis Comm. Expat American living in UK, Divorced, walking a positive path. PLEASE, no DMS if only wanting a 'date'. #Resistance #lovepeople",2009-02-19 02:14:14,1222,1282,17866,False,,2020-11-01 23:59:57,"@realDonaldTrump Dumpy Donnie is the candidate of rioters, looters, arsonists, gun-toting Billy-Bubbas, terrorists, lobbyists and special interests. Biden is the candidate of farmers, factory workers, police officers, and hard-working, law-abiding patriots of every race, religion and creed #BIDEN",False,[BIDEN],0,0,,1.323031e+18,Twitter Web App,False


#### 6.3. Export the updated data

In [14]:
# Write the merged data back to the main CSV and report the time span it covers.
if tweets_all_df.empty:
    # Nothing was collected and nothing was stored. Writing would replace the CSV with
    # an empty file, and the timestamp look-ups below would fail.
    print("No tweets to save.")
else:
    tweets_all_df.to_csv(fname, index=False)

    # New tweets are placed first when merging, so the last row is reported as the
    # start of the range and the first row as its end.
    start = tweets_all_df.timestamp.values[-1]
    end   = tweets_all_df.timestamp.values[0]

    print(f"{len(tweets_all_df) :,} tweets, from {start} to {end}, saved at {str(datetime.now())}")

645,253 tweets, from 2020-10-01 22:14:28 to 2020-11-01 23:59:57, saved at 2020-11-02 09:42:05.708817
