## Import Tweepy and Credentials

In [1]:
#!pip install tweepy
#!pip install jsonpickle

In [2]:
import tweepy

# Create credentials.py in working directory with 
# your application's key and secret. API_KEY and API_SECRET
from credentials import *

# -or-
# Type in credentials below
# API_KEY = 'string'
# API_SECRET = 'string'

# Build the auth handler from the application key/secret brought into scope
# above (either from credentials.py or typed in by hand).
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)

# Rate-limit options are passed so long scrapes wait instead of failing
# (see the tweepy API documentation for the exact semantics).
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print ("Can't Authenticate")
    # `sys` is only imported in a later cell, so calling sys.exit(-1) here
    # would raise NameError on a fresh kernel. Raising SystemExit directly
    # is exactly what sys.exit does and needs no import.
    raise SystemExit(-1)

In [3]:
import os
import linecache
import pickle
from datetime import datetime

import sys
import jsonpickle

## States to Scrape

In [4]:
# States to scrape. Each name is used as a key into `tracker` and as part of
# the per-state output file name.
state_list = (
    'california colorado florida '
    'georgia idaho illinois '
    'louisiana massachusetts newyork '
    'tennessee texas washington'
).split()

## Tracker
Tracks, for each state, the most recent tweet scraped, the number of tweets scraped, and the geofence location used.

In [5]:
# tracker = {}
# I used linecache to get the max_id from my previously scraped tweets
# for state in state_list:
#     tracker[state] = {'max_id': linecache.getline(f'tweets/tweets_compressed/coronavirus_{state}_tweets.txt',2).split(',')[0],
#                       'downloaded': 0,
#                       'tweetLocation': ''}

In [6]:
# Initial per-state tracker. Every state starts with no known max_id and zero
# downloaded tweets; only the geofence ('lat,long,radius') differs.
tracker = {
    state_name: {'max_id': '',
                 'downloaded': 0,
                 'tweetLocation': geofence}
    for state_name, geofence in (
        ('california', '36.116,-119.682,300mi'),
        ('colorado', '39.060,-105.311,200mi'),
        ('florida', '27.766,-81.687,225mi'),
        ('georgia', '32.781,-83.334,150mi'),
        ('idaho', '44.241,-114.479,200mi'),
        ('illinois', '40.350,-88.986,150mi'),
        ('louisiana', '31.170,-91.868,150mi'),
        ('massachusetts', '42.230,-71.530,100mi'),
        ('newyork', '40.700,-73.974,50mi'),
        ('tennessee', '35.748,-86.692,200mi'),
        ('texas', '31.527,-99.524,350mi'),
        ('washington', '47.401,-121.491,200mi'),
    )
}

In [7]:
# First run only: uncomment to create the tracker file from the dict above.
# pickle.dump(tracker, open('tweets/tweets_tracker.txt', 'wb'))

# Restore the tracker written at the end of the previous run; this replaces
# the `tracker` dict defined above. The `with` block closes the file handle
# deterministically instead of leaving it to garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load a tracker
# file that this notebook itself wrote.
with open('tweets/tweets_tracker.txt', 'rb') as tracker_file:
    tracker = pickle.load(tracker_file)

# Incremented by scrapeNewTweets each time a tweepy error aborts a scrape.
error_counter = 0

# Scrape To API's Limit
Twitter only allows scraping tweets that are no older than 7 days; uncommenting the cell below will scrape as far back as that limit.

In [8]:
# code modified from https://bhaskarvk.github.io/2015/01/how-to-use-twitters-search-rest-api-most-effectively./

# searchQuery = 'coronavirus -filter:replies -filter:retweets'  # search query can use filters from advanced twitter searching
# tweetLocation = '30.976,112.271,275mi' # latitude, longitude, radius of circle from point in mi or km
# tweetLang = 'en' # scraping only english tweets

# maxTweets = 10000000 # some arbitrary large number
# tweetsPerQry = 100  # this is the max the API permits
# fName = f'coronavirus_hubeichina_tweets.txt' # file name

# # If results from a specific ID onwards are reqd, set since_id to that ID.
# # else default to no lower limit, go as far back as API allows
# sinceId = None

# # If results only below a specific ID are, set max_id to that ID.
# # else default to no upper limit, start from the most recent tweet matching the search query.
# max_id = -1

# tweetCount = 0
# print("Downloading max {0} tweets".format(maxTweets))
# with open(fName, 'w') as f:
#     while tweetCount < maxTweets:
#         try:
#             if (max_id <= 0):
#                 if (not sinceId):
#                     new_tweets = api.search(q=searchQuery, geocode=tweetLocation, 
#                                             lang=tweetLang, count=tweetsPerQry,
#                                             tweet_mode='extended')
#                 else:
#                     new_tweets = api.search(q=searchQuery, geocode=tweetLocation, 
#                                             lang=tweetLang, count=tweetsPerQry,
#                                             tweet_mode='extended',
#                                             since_id=sinceId)
#             else:
#                 if (not sinceId):
#                     new_tweets = api.search(q=searchQuery, geocode=tweetLocation, 
#                                             lang=tweetLang, count=tweetsPerQry,
#                                             tweet_mode='extended',
#                                             max_id=str(max_id - 1))
#                 else:
#                     new_tweets = api.search(q=searchQuery, geocode=tweetLocation, 
#                                             lang=tweetLang, count=tweetsPerQry,
#                                             tweet_mode='extended',
#                                             max_id=str(max_id - 1),
#                                             since_id=sinceId)
#             if not new_tweets:
#                 print("No more tweets found")
#                 break
#             for tweet in new_tweets:
#                 f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
#                         '\n')
#             tweetCount += len(new_tweets)
#             print("Downloaded {0} tweets".format(tweetCount))
#             max_id = new_tweets[-1].id
#         except tweepy.TweepError as e:
#             # Just exit if any error
#             print("some error : " + str(e))
#             break

# print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

The function below scrapes any new tweets, given the proper start_id from the previous run.

In [9]:
def scrapeNewTweets(state, tweetLocation, start_id, date):
    ''' scrapes new tweets using Twitter API and appends the totals to `tracker`

    Pages backwards through the search results (newest batch first), writing
    one JSON document per line to
    tweets/tweets_{date}/coronavirus_{state}_tweets_{date}.txt. The file is
    opened in 'w' mode, so an existing file for the same state/date is
    overwritten; the target folder must already exist.

    params
    ---
    state: file naming and tracking purposes (must be a key of `tracker`)
    tweetLocation: geofence for scraping tweets, must be in string of lat, long, radius (mi or km appended)
        Example: '30.976,112.271,275mi'
    start_id: passed to the API as since_id so only tweets newer than this ID
        are returned; a falsy value ('' or None) means no lower limit
    date: file naming purposes

    side effects
    ---
    tracker[state]['max_id']: set to the highest tweet ID in the first
        non-empty batch, i.e. the value to use as start_id on the next run
    tracker[state]['downloaded']: increased by the number of tweets written
    error_counter: incremented when a tweepy.TweepError stops the scrape
    '''
    # error_counter is a notebook-level variable. Without this declaration the
    # `+= 1` in the except branch makes it function-local and the error
    # handler itself dies with UnboundLocalError.
    global error_counter

    searchQuery = 'coronavirus -filter:replies -filter:retweets'  # search query can use filters from advanced twitter searching
    tweetLang = 'en'

    maxTweets = 10000000 # Some arbitrary large number
    tweetsPerQry = 100  # this is the max the API permits
    fName = f'tweets/tweets_{date}/coronavirus_{state}_tweets_{date}.txt' # We'll store the tweets in a text file.

    # If results from a specific ID onwards are required, set since_id to that ID.
    # Otherwise default to no lower limit and go as far back as the API allows.
    sinceId = start_id

    # If only results below a specific ID are required, set max_id to that ID.
    # Otherwise default to no upper limit and start from the most recent tweet
    # matching the search query.
    max_id = -1

    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    with open(fName, 'w') as f:
        while tweetCount < maxTweets:
            try:
                # Arguments common to every request; the ID bounds are added
                # only when they are actually set.
                search_kwargs = dict(q=searchQuery, geocode=tweetLocation,
                                     lang=tweetLang, count=tweetsPerQry,
                                     tweet_mode='extended')
                if max_id > 0:
                    # max_id is inclusive, so step one below the last tweet
                    # already written to avoid fetching it again.
                    search_kwargs['max_id'] = str(max_id - 1)
                if sinceId:
                    search_kwargs['since_id'] = sinceId

                new_tweets = api.search(**search_kwargs)
                if not new_tweets:
                    print("No more tweets found")
                    break
                for tweet in new_tweets:
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                            '\n')
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                if max_id == -1:
                    # First batch of this run: remember the newest tweet seen.
                    # Recording the last element of the batch instead would
                    # make the next run's since_id too low and re-download
                    # the rest of this batch.
                    tracker[state]['max_id'] = max(tweet.id for tweet in new_tweets)
                # Continue paging from the last tweet of this batch.
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit if any error
                error_counter += 1
                print("some error : " + str(e))
                break

    print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))
    tracker[state]['downloaded'] += tweetCount

In [10]:
# Creating a new folder as to not override previous scrapes
# Date stamp (YYYY_MM_DD) used in the folder and file names of this run.
today = datetime.today().strftime('%Y_%m_%d')

# Each day's scrape goes into its own folder so previous scrapes are not
# overwritten. exist_ok=True makes re-running this cell on the same day a
# no-op and avoids the check-then-create race of testing os.path.exists first.
os.makedirs(f'tweets/tweets_{today}', exist_ok=True)

In [11]:
# Batch running per state
# Scrape every state in turn, starting each one from the max_id recorded for
# it in the tracker by the previous run.
for state in state_list:
    print(f'Starting to scrape for {state}')
    scrapeNewTweets(state, tracker[state]['tweetLocation'],  tracker[state]['max_id'], today)

# Persist the updated tracker for the next run. The `with` block guarantees
# the file is flushed and closed before the cell finishes.
with open('tweets/tweets_tracker.txt', 'wb') as tracker_file:
    pickle.dump(tracker, tracker_file)

What’s more, by setting the initial values of max_id and/or since_id you can fetch results to and from arbitrary IDs. This is really helpful if you want to run the program repeatedly to fetch newer results since the last run: just look up the max ID (the ID of the first line) from the previous run and set that as since_id for the next run. If you have to stop your program before exhausting all the possible results and rerun it to fetch the remaining results, you can look up the min ID (the ID of the last line) and pass that as max_id for the next run to start from that ID and below.
* max_id (first tweet) from previous run -> since_id = 
* min_id (last tweet) -> max_id for new run if starting from where you left off = 

**California**
* searchQuery = 'coronavirus'
* **Downloaded 298499 tweets**

* searchQuery = 'covid19'
* **Downloaded 57200 tweets**
* tweetLocation = '36.116,-119.682,300mi'

**Florida**
* searchQuery = 'coronavirus'
* **Downloaded 102359 tweets**

* searchQuery = 'covid19'
* **Downloaded --- tweets**
* tweetLocation = '27.766,-81.687,225mi'

**Massachusetts**
* searchQuery = 'coronavirus'
* **Downloaded 94157 tweets**
* tweetLocation = '42.230,-71.530,100mi'

**New York**
* searchQuery = 'coronavirus'
* **Downloaded 293528 tweets**
* tweetLocation =  '40.700,-73.974,50mi'

**Washington**
* searchQuery = 'coronavirus'
* **Downloaded 85792 tweets**
* tweetLocation = '47.401,-121.491,200mi'

**Tennessee**
* searchQuery = 'coronavirus'
* **Downloaded 139487 tweets**
* tweetLocation = '35.748,-86.692,200mi'

**Texas**
* searchQuery = 'coronavirus'
* **Downloaded 196317 tweets**
* tweetLocation = '31.527,-99.524,350mi'

**Louisiana**
* searchQuery = 'coronavirus'
* **Downloaded 54235 tweets**
* tweetLocation = '31.170,-91.868,150mi'

**Illinois**
* searchQuery = 'coronavirus'
* **Downloaded 140102 tweets**
* tweetLocation = '40.350,-88.986,150mi'

**Colorado**
* searchQuery = 'coronavirus'
* **Downloaded 30059 tweets**
* tweetLocation = '39.060,-105.311,200mi'

**Georgia**
* searchQuery = 'coronavirus'
* **Downloaded 101940 tweets**
* tweetLocation = '32.781,-83.334,150mi'

**Idaho**
* searchQuery = 'coronavirus'
* **Downloaded 6882 tweets**
* tweetLocation = '44.241,-114.479,200mi'

Countries
---
**Italy**
* **Downloaded 13835 tweets**
* tweetLocation = '41.872,12.567,275mi'

**Hubei, China**
* **Downloaded 1359 tweets**
* tweetLocation = '30.976,112.271,275mi'

**South Korea**
* Latitude: 35.908
* Longitude: 127.767

**France**
* Latitude: 46.228
* Longitude: 2.214


**Spain**
* Latitude: 40.464
* Longitude: -3.749

**Germany**
* Latitude: 51.166
* Longitude: 10.452

Retweets Included Below
--- 
**California**
* Downloaded 670881 tweets
* max_id (first tweet) from previous run -> since_id = "1239943180579000320" 
* min_id (last tweet) -> max_id for new run if starting from where you left off = "1239289843491823616"
* searchQuery = 'coronavirus'
* tweetLocation = '36.116,-119.682,300mi'
* tweetLang = 'en'
* fName = 'coronavirus_california_tweets.txt'

**Florida**
* Downloaded
* max_id (first tweet) from previous run -> since_id = 
* min_id (last tweet) -> max_id for new run if starting from where you left off = 
* searchQuery = 'coronavirus'
* tweetLocation = '27.766,-81.687,225mi'
* tweetLang = 'en'
* fName = 'coronavirus_florida_tweets.txt'