In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import GetOldTweets3 as got
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

## The function itself

I'll probably nest this in a functions.py file in the final project so it doesn't take up notebook space, but leaving it here for now so you can look through it easily, if you'd like! Scroll to the bottom to use it :)

In [2]:
def state_tweets_to_csv(query:str, max_tweets:int, cities:dict, date_range:tuple, state:str, sleep_time:float=1.5):
    '''
    A function for returning search results on a query
    to create a representative sample of a state/region
    
    Parameters
    ----------
    query : string, a search query to be passed through
    Twitter's advanced search. Can use booleans within
    the query!
    
    max_tweets : int, number of tweets to pull, recommend
    staying within the boundaries of the twitter API limitations
    (recommend using 18000 as an upper limit)
    
    cities : dict, dictionary where the keys are [city, state abbreviation] 
    and the values are the distance around the city to search.
    Keys should be strings, values can be strings or integers.
    Not case-sensitive
    Example: {'chicago': 10, 'sPringfield': '20'}
    
    date_range : tuple, a range of dates as stringts to pull 
    tweets from, formatted as 'YYYY-MM-DD'. Put earliest date first. 
    Example: ('2020-03-20', '2020-03-25')
    
    state : string, enter the two-letter state code you are pulling info from.
    Not case-sensitive.
    '''
    def csv_store(resultsAux):
        '''
        A function that is used within getTweets() as a receive buffer.
        This function stores a city's info in a .csv so if you hit a
        rate limit, your data gets saved.
        '''
        # Create dataframe from the temporary variable, resultsAux (comes from getTweets() source code)
        df = pd.DataFrame(t.__dict__ for t in resultsAux)
        
        # Add city column to this df and write to new .csv, the .csv will be removed at the end of the whole function
        df['city'] = city
        df['query'] = query
        df['date_range'] = str(date_range)
        df.to_csv(f'./data/{city}_scrape_data.csv', index=False, mode='a')
        
    
    # Create a static timestamp to use for versioning
    timestamp = str(time.ctime().replace(' ', '_').replace(':', '_'))
    
    # Set state to uppercase for filenaming uniformity
    state = state.upper()
    
    for city, area in cities.items():
        # Make city lowercase for consitent file naming
        city = city.lower()
        
        # Try to get all tweets as determined by max_tweets
        try:
            tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                               .setSince(date_range[0])\
                                               .setUntil(date_range[1])\
                                               .setMaxTweets(max_tweets)\
                                               .setNear(f'{city}, {state}')\
                                               .setWithin(f'{str(area)}mi')
            tweets = got.manager.TweetManager.getTweets(tweetCriteria, 
                                                        receiveBuffer=csv_store) # This receive buffer goes into the csv_store function defined above

            # Let's get the current city's csv that was created above
            current_city = pd.read_csv(f'./data/{city}_scrape_data.csv')
            
            # Tell me how many tweets we collected
            print(f'Finished collecting tweets from {city}, we got {len(current_city)} tweets')

            # Is this the first city?
            if city == list(cities.keys())[0]:
                # Create a .csv and put each city's data inside
                current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                # Clean up the directory by removing the city's .csv
                os.remove(f'./data/{city}_scrape_data.csv')
            
            else:
                # Don't need header for anything but the first city
                current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False, header=False)
                os.remove(f'./data/{city}_scrape_data.csv')
                
            # Rest a random amount to try not to be detected as a bot
            time.sleep(np.random.normal(sleep_time, 0.1))
        
        # If one of the searches didn't return anything, it won't create a .csv and will throw an error, let's account for that
        except FileNotFoundError:
            pass
        
        # This is just a general catch-all for any other issues (including timeouts)
        except:
        
            # If there were errors above, we'll have to account for the missing .csvs with another try/except
            try:
                # Let's get the current city's csv that was created above
                current_city = pd.read_csv(f'./data/{city}_scrape_data.csv')

                # Tell me how many tweets we collected
                print(f'Finished collecting tweets from {city}, we got {len(current_city)} tweets')

                # Is this the first city?
                if city == list(cities.keys())[0]:
                    # Create a .csv and put each city's data inside
                    current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                    # Clean up the directory by removing the city's .csv
                    os.remove(f'./data/{city}_scrape_data.csv')

                else:
                    # Don't need header for anything but the first city
                    current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False, header=False)
                    os.remove(f'./data/{city}_scrape_data.csv')

                    # Rest a random amount to try not to be detected as a bot
                    time.sleep(np.random.normal(sleep_time, 0.1))
            
            # If the .csv didn't exist, just sleep and go on to the next city!
            except:
                time.sleep(np.random.normal(sleep_time, 0.1))

## Use this area to collect tweets!

I haven't been able to grab many tweets from rural areas in a short date range, thinking about expanding date range before and after an announcement so that non-urban areas are better represented.

In [4]:
query = 'covid OR COVID-19 OR covid19 OR murphy OR corona OR coronavirus OR gov OR governor OR capital'
max_tweets = 1000

# Picking wider ranges for more rural areas, shallower ranges for cities, used google maps to try not to overlap, but we can also check for duplicates afterward.
cities = {
          'newark': 10,
          'trenton': 5,
          'camden': 20,
          'jersey city': 10,
          'vineland': 30,
          'Hoboken': 20,
          'passaic': 20,
          'clifton': 20,
          'bayonne': 10,
          'elizabeth': 10,
          'paterson': 20
         }
date_range = ('2020-03-07', '2020-03-15')
state = 'nj'

In [6]:
state_tweets_to_csv(query, max_tweets, cities, date_range, state, 60)

Finished collecting tweets from newark, we got 1009 tweets
Finished collecting tweets from trenton, we got 670 tweets
Finished collecting tweets from camden, we got 1009 tweets
Finished collecting tweets from jersey city, we got 1009 tweets
Finished collecting tweets from vineland, we got 49 tweets
Finished collecting tweets from hoboken, we got 1009 tweets
Finished collecting tweets from passaic, we got 1009 tweets
Finished collecting tweets from clifton, we got 1009 tweets
Finished collecting tweets from bayonne, we got 1009 tweets
Finished collecting tweets from elizabeth, we got 1009 tweets
Finished collecting tweets from paterson, we got 1009 tweets


In [8]:
nj = pd.read_csv('./data/NJ_scrape_data_Thu_Sep_10_16_51_42_2020.csv')
nj.head()

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls,city,query,date_range
0,marigreyes13,,Jeeee y confirmaron el primer caso de coronavi...,0,2,1,1238977968237674497,https://twitter.com/marigreyes13/status/123897...,49742903,2020-03-14 23:59:01+00:00,Sat Mar 14 23:59:01 +0000 2020,,,,,newark,covid OR COVID-19 OR covid19 OR murphy OR coro...,"('2020-03-07', '2020-03-15')"
1,the_sole_broker,,Travis Scott Jordan 6 GS New Size 4Y $399.99 T...,0,0,0,1238977927842275334,https://twitter.com/the_sole_broker/status/123...,789883607389175808,2020-03-14 23:58:51+00:00,Sat Mar 14 23:58:51 +0000 2020,,,,https://www.instagram.com/p/B9u7yfAp_-z/?igshi...,newark,covid OR COVID-19 OR covid19 OR murphy OR coro...,"('2020-03-07', '2020-03-15')"
2,Baskarbhat,,Indian way to avoid contact of hands! Help spr...,0,0,0,1238977889783156736,https://twitter.com/Baskarbhat/status/12389778...,47604483,2020-03-14 23:58:42+00:00,Sat Mar 14 23:58:42 +0000 2020,#COVID,,,,newark,covid OR COVID-19 OR covid19 OR murphy OR coro...,"('2020-03-07', '2020-03-15')"
3,MatthewDrutt,,Trumps tests negative for COVID-19. Anybody te...,0,1,0,1238977746627440643,https://twitter.com/MatthewDrutt/status/123897...,152975295,2020-03-14 23:58:08+00:00,Sat Mar 14 23:58:08 +0000 2020,,,,,newark,covid OR COVID-19 OR covid19 OR murphy OR coro...,"('2020-03-07', '2020-03-15')"
4,MaineventL,,Get ready for war #covid #coronavirüsü http://...,0,0,0,1238977728105320448,https://twitter.com/MaineventL/status/12389777...,1140542122065649664,2020-03-14 23:58:04+00:00,Sat Mar 14 23:58:04 +0000 2020,#covid #coronavir #facemask #camo #faceshield ...,,,"http://Maineventlingerie.com,https://www.insta...",newark,covid OR COVID-19 OR covid19 OR murphy OR coro...,"('2020-03-07', '2020-03-15')"


In [9]:
nj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   username        9800 non-null   object
 1   to              2217 non-null   object
 2   text            9799 non-null   object
 3   retweets        9800 non-null   object
 4   favorites       9800 non-null   object
 5   replies         9800 non-null   object
 6   id              9800 non-null   object
 7   permalink       9800 non-null   object
 8   author_id       9800 non-null   object
 9   date            9800 non-null   object
 10  formatted_date  9800 non-null   object
 11  hashtags        4786 non-null   object
 12  mentions        2291 non-null   object
 13  geo             87 non-null     object
 14  urls            3476 non-null   object
 15  city            9800 non-null   object
 16  query           9800 non-null   object
 17  date_range      9800 non-null   object
dtypes: objec