In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import GetOldTweets3 as got
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

## The function itself

I'll probably nest this in a functions.py file in the final project so it doesn't take up notebook space, but leaving it here for now so you can look through it easily, if you'd like! Scroll to the bottom to use it :)

In [2]:
def state_tweets_to_csv(query:str, tweets_per_iter:int, cities:dict, date_range:tuple, state:str):
    '''
    A function for returning search results on a query
    to create a representative sample of a state/region
    
    Parameters
    ----------
    query : string, a search query to be passed through
    Twitter's advanced search. Can use booleans within
    the query!
    
    tweets_per_iter : int, number of tweets to pull per iteration, 
    this is used to calculate the sleep time, may cause errors if using
    a number greater than 
    (recommend using 8000 as an upper limit)
    
    cities : dict, dictionary where the keys are [city, state abbreviation] 
    and the values are the distance around the city to search.
    Keys should be strings, values can be strings or integers.
    Not case-sensitive
    Example: {'chicago': 10, 'sPringfield': '20'}
    
    date_range : tuple, a range of dates as stringts to pull 
    tweets from, formatted as 'YYYY-MM-DD'. Put earliest date first. 
    Example: ('2020-03-20', '2020-03-25')
    
    state : string, enter the two-letter state code you are pulling info from.
    Not case-sensitive.
    '''
    # Cleaning cities variable
    cities = {key.lower(): value for key, value in cities.items()}
    
    # Check to make sure we won't trigger a timeout on Twitter
    if tweets_per_iter > 17999:
        raise Exception("Your max Tweet per iter must 17999 or lower")
    
    
    # Makes the data folder in the directory if you don't already have it
    os.makedirs('data', exist_ok=True)
    
    def csv_store(resultsAux):
        '''
        A function that is used within getTweets() as a receive buffer.
        This function stores a city's info in a .csv so if you hit a
        rate limit, your data gets saved.
        '''
        # Create dataframe from the temporary variable, resultsAux (comes from getTweets() source code)
        df = pd.DataFrame(t.__dict__ for t in resultsAux)
            
        # Add new columns to the df and write to new .csv
        df['city'] = city
        df['query'] = query
        df['date_range'] = str(date_range)
        df['state'] = state
        df['date'] = pd.to_datetime(df['date'], utc=True)
        df['month'] = df['date'].dt.month # Thanks Haley Taft for finding this link! https://www.dataquest.io/blog/python-datetime-tutorial/
        df['day'] = df['date'].dt.day
        # the .csv and the extra headers will be removed at the end of the whole function
        df.to_csv(f'./data/{city}_scrape_data.csv', index=False, mode='a')
            
    # Create a static timestamp to use for versioning
    timestamp = str(time.ctime().replace(' ', '_').replace(':', '_'))
    
    # Set state to uppercase for filenaming uniformity
    state = state.upper()
    
    
    #-----------------------------------------------------------------------------
    # Main search loop, developed by Eric Heidbreder, Haley Taft, Irene Anibogwu, and Steven Markoe
    
    '''
    Setting Variables:
    
    Twitter's request timer resets every 15 minutes, 
    we add 3% in the sleep_time calculation to be safe, 
    as tweets_per_iter seems to have some variation.
    '''
    time_window = 900 
    max_tweets_per_time_window = 17999
    sleep_time = (time_window * 1.03) * (tweets_per_iter / max_tweets_per_time_window)
    
    for city, area in cities.items():
         
        # First, we need to set the max_id to a specific tweet earlier than all those we'll be searching for
        max_id = 1295148306117476352 # Set a starting max_id, thanks Steven Markoe for figuring out this approach!
        
        while True:
            # Start with a random rest so we don't trigger the search limit
            print(f'Waiting {sleep_time} seconds before next iteration')
            time.sleep(np.random.normal(sleep_time, 0.1))

            # Try to get all tweets as determined by tweets_per_iter
            try:
                tweetCriteria = got.manager.TweetCriteria().setQuerySearch(f'{query} max_id:{max_id}')\
                                                   .setSince(date_range[0])\
                                                   .setUntil(date_range[1])\
                                                   .setMaxTweets(tweets_per_iter)\
                                                   .setNear(f'{city}, {state}')\
                                                   .setEmoji('unicode')\
                                                   .setWithin(f'{str(area)}mi')
                tweets = got.manager.TweetManager.getTweets(tweetCriteria, 
                                                            receiveBuffer=csv_store) # This receive buffer goes into the csv_store function defined above

                # Let's get the current city's csv that was created from the getTweets() receiveBuffer
                current_city = pd.read_csv(f'./data/{city}_scrape_data.csv')
                
                # Is this a full page of tweets? If not it means it's the last page
                if len(current_city) < (tweets_per_iter / 2):
                    print(f'Returned {len(current_city)} tweets, wrapping up work on {city}!')
                    # Save this data to the csv
                    current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                    # Clean up the directory
                    os.remove(f'./data/{city}_scrape_data.csv')
                    break
                
                # Tell me how many tweets we collected
                print(f'Finished current iteration for {city}, we got {len(current_city)} tweets.')

                max_id = int(current_city.tail(1)['id'].values[0]) # HALEY TAFT FIGURED THIS OUT!

                # Create a .csv and put each city's data inside
                current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                # Clean up the directory by removing the city's .csv
                os.remove(f'./data/{city}_scrape_data.csv')

                # Set the new max id to the last id in our previous dataframe. This will start the new query at this id.
                max_id = int(current_city.tail(1)['id'].values[0])

            # If one of the searches didn't return anything, it won't create a .csv and will throw an error, let's account for that
            except FileNotFoundError:
                print(f'Found no tweets remaining for {city}, moving on to next city!')
                break

            # This is just a general catch-all for any other issues (including timeouts)
            except:

                # If there were errors above, we'll have to account for the missing .csv with another try/except
                try:
                    # Let's get the current city's csv that was created above
                    current_city = pd.read_csv(f'./data/{city}_scrape_data.csv')

                    # Tell me how many tweets we collected
                    print(f'Encountered error, storing {len(current_city)} tweets from {city} and moving on.')
                    
                    # Is this a full page of tweets? If not, it means it's the last page
                    if len(current_city) < (tweets_per_iter / 2): 
                        current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                        os.remove(f'./data/{city}_scrape_data.csv')
                        break

                    max_id = int(current_city.tail(1)['id'].values[0])

                    # Create a .csv and put each city's data inside
                    current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                    # Clean up the directory by removing the city's .csv
                    os.remove(f'./data/{city}_scrape_data.csv')

                # If the .csv didn't exist, just sleep and go on to the next city!
                except:
                    break
    
    try:
        # Clean up final df
        print('Cleaning up final dataframe!')
        df_full = pd.read_csv(f'./data/{state}_scrape_data_{timestamp}.csv')
        df_full = df_full[df_full['username'] != 'username'] # Removes headers leftover from scraping
        df_full.dropna(subset=['text', 'date'], inplace=True) # There were some nulls in the text and date column that are likely the result of deleted/private tweets
        df_full.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', index=False)
    except:
        pass
        

## Picking a search query

We tried to **grab every tweet from each city**, but soon found that we wouldn't be able to do that (returned 20000 tweets from Chicago for just one day, and we want to search across a **2-week period**). We're also limited to the **number of terms** we can include in a query (testing showed it was somewhere around 25-40 words max, may be character based). We chose to build our query with words derived from the `top_words_il` dataframe, which contains the top words from Illinois after running the corpus through `CountVectorizer`. Illinois happened to be the first large dataframe we performed EDA on.

In [3]:
top_words_nj = pd.read_csv('./data/top_words_cvec.csv')

In [6]:
term_list = top_words_nj.head(20)['0'].tolist()
term_list.remove('illinois')

Our query contains **24 terms**: 
* 50% are neutral words with stop words removed
* 50% are Covid-19-related terms.

In [12]:
nj_term_list = ['get', 'one', 'time', 'people', 'day', 'know', 'today', 'need', 'go', 'home', 'right', 'going', 
             'pandemic', 'coronavirus', 'news', 'health', 'covid', '19', 'quarantine', 'governor', 'capitol', 'capital', 'murphy', 'virus'] 

## Use this area to collect tweets!

In [13]:
# Building Illinois query
query = ' OR '.join(nj_term_list) # Joining with OR so that we get tweets that contain those individual words rather than phrases
tweets_per_iter = 1000

# Picking wider ranges for more rural areas, shallower ranges for cities, used google maps to try not to overlap, but we can also check for duplicates afterward.
cities = {
           'newark': 10,
           'jersey city': 8,
           'hoboken': 5,
           'paterson': 10,
           'elizabeth': 10,
           'toms river': 10,
           'seacaucus': 10,
           'clifton': 10,
           'trenton': 8,
           'camden': 10,
           'passaic': 10,
           'summit': 10,
           'bayonne': 10,
           'vineland': 18,
           'new brunswick': 15,
           'union city': 10,
           'princeton': 10,
           'andover': 40,
           'new egypt': 40,
           'totowa': 30,
           'burlington': 35,
           'hackettstown': 30,
           'matawan': 20,
           'raritan': 20,
           'cherry hill': 10,
           'linden': 10,
           'rahway': 10,
           'harrison': 22,
           'fair lawn': 15,
           'vineland': 17,
           'sussex': 35,
           'dunellen': 27,
           'dayton': 30,
           'mountainside': 25,
           'watchung': 25,
           'point plesant': 15,
           'boonton': 30,
           'rockaway': 30,
           'cape may': 30,
           'wildwood' : 35,
           'asbury park' : 18,
           'freehold': 25,
    
           

         }
date_range = ('2020-03-13', '2020-03-28') # 2 weeks total, starting 3 days before governor announced state shutdown. The 'Until' date is exclusive, so this range looks like 15 days
state = 'nj'

In [15]:

state_tweets_to_csv(query, tweets_per_iter, cities, date_range, state)

Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for newark, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iterati

KeyboardInterrupt: 

In [16]:
ny_term_list = ['get', 'one', 'time', 'people', 'day', 'know', 'today', 'need', 'go', 'home', 'right', 'going', 
             'pandemic', 'coronavirus', 'news', 'health', 'covid', '19', 'quarantine', 'governor', 'capitol', 'capital', 'cuomo', 'virus'] 

In [24]:
# Building Georgia Query
query = ' OR '.join(ny_term_list) # Joining with OR so that we get tweets that contain those individual words rather than phrases
tweets_per_iter = 1000

# Picking wider ranges for more rural areas, shallower ranges for cities, used google maps to try not to overlap, but we can also check for duplicates afterward.
cities = {
#        'new york': 10,
       'buffalo' : 10,
       'rochester': 10,
       'yonkers': 10,
       'syracuse' : 10,
       'albany': 10,
       'new rochelle': 10,
       'cheektowaga': 10,
       'mount vernon': 10,
       'schenectady': 10,
       'hicksville' : 25,
       'alfred' :40,
       'poughkeepsie': 20,
       'staten island': 10,
       'malone': 40,
    
        
        
    
         }
date_range = ('2020-03-19', '2020-04-02') # 2 weeks total, starting 3 days before governor announced state shutdown. The 'Until' date is exclusive, so this range looks like 15 days
state = 'ny'

In [25]:
# uncomment to run
state_tweets_to_csv(query, tweets_per_iter, cities, date_range, state)

Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for buffalo, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current

Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for yonkers, we got 1009 tweets.
Waiting 51.5

Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for new rochelle, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for n

Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for m

Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for mount vernon, we got 1009 tweets.
Waiting 51.50286127007056 seconds before next iteration
Finished current iteration for m