In [1]:
# created on Dec 24, 2020
# modified on April 14, 2021
# modified on Jan 2, 2021
# @author:          Bo Zhao
# @email:           zhaobo@uw.edu
# @website:         https://hgis.uw.edu
# @organization:    Department of Geography, University of Washington, Seattle
# @description:     Search historical tweets using locational information

In [2]:
import os

import pandas as pd
import tweepy

In [3]:
# the file path where to store the output csv on google drive
output_file = '/gdrive/My Drive/twsearch-result.csv'

# Apply for your own Twitter API keys at https://developer.twitter.com/en/apply-for-access
# Each credential is read from an environment variable when one is set, so the
# secrets do not have to be saved inside the notebook itself. The "" fallback is
# the original placeholder: fill it in only if you are not using environment
# variables, and blank it again before sharing or committing the notebook.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

# build the OAuth handler from the app keys, then attach the user access token
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True asks tweepy to pause when the rate limit is reached
# instead of failing the request
api = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
# Search parameters used by the tweet collection below.
# search_words is the query text sent to the search endpoint.
search_words = "data science"
# location is passed as the geocode argument, written as "latitude,longitude,radius"
# with no spaces between the three parts.
location = ""
# date_since is passed as the `since` argument of the search (year-month-day).
date_since = "2023-1-15"
# read the Twitter API document to look for other ways to customize your queries.
# refer to https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/search-operators
# for example, you can exclude all retweets with a query such as: #wildfires -filter:retweets
# Geolocalization: the search operator “near” isn’t available in the API, but there is a more precise way to restrict
# your query by a given location using the geocode parameter specified with the template “latitude,longitude,radius”,
# for example, “47.6138893,-122.3107869,10mi” (Capitol Hill in Seattle). When conducting geo searches, the search API will first attempt to find Tweets
# which have lat/long within the queried geocode, and in case of not having success, it will attempt to find Tweets created
# by users whose profile location can be reverse geocoded into a lat/long within the queried geocode, meaning that it is possible
# to receive Tweets which do not include lat/long information.


# Collect tweets
# upper bound on how many tweets are pulled from the cursor
max_tweets = 1000
# A blank `location` must not be sent as an empty geocode value: passing None
# lets tweepy omit the parameter, so the search runs without a geographic filter.
tweets = tweepy.Cursor(
    api.search,
    q=search_words,
    geocode=location or None,
    lang="en",
    since=date_since,
).items(max_tweets)

# column order of the output table; listing it explicitly keeps the csv header
# even when the search returns no tweets
result_columns = [
    'username', 'userid', 'profile_location', 'created_at',
    'text', 'retweet_count', 'source', 'coordinates',
]

# create an array to store the result
result = []

# Iterate and print tweets
for tweet in tweets:
    author = tweet.author
    row = {
        'username': author.name,
        'userid': author.id,
        'profile_location': author.location,
        # NOTE(review): this is the creation time of the author's account, not of
        # the tweet (that would be tweet.created_at) - confirm which one is intended.
        'created_at': str(author.created_at),
        'text': tweet.text,
        'retweet_count': tweet.retweet_count,
        'source': tweet.source,
        'coordinates': tweet.coordinates
    }
    result.append(row)
    print(row)

# Store the results as a pandas dataframe
df = pd.DataFrame(result, columns=result_columns)

# notify the completion of the crawling in the console.
print("the crawling task is finished.")


{'username': 'Data Science Dojo', 'userid': 1318985240, 'profile_location': 'Seattle, WA', 'created_at': '2013-03-31 19:25:57', 'text': 'Discover your path to a career in data science! Join our free information session on January 26th at 9 am (PT) to l… https://t.co/O5yvwcHEDZ', 'retweet_count': 1, 'source': 'HubSpot', 'coordinates': None}
{'username': 'TMJ - SEA IT Jobs', 'userid': 21676969, 'profile_location': 'USA-WA-Seattle Metro', 'created_at': '2009-02-23 18:26:40', 'text': "Want to work at Starbucks? We're hiring in #Seattle, WA! Click for details: https://t.co/uiLsnwAKl4 #techjobs", 'retweet_count': 0, 'source': 'CareerArc 2.0', 'coordinates': {'type': 'Point', 'coordinates': [-122.3320708, 47.6062095]}}
{'username': 'Dr. Kim Martini 🏳️\u200d🌈', 'userid': 48494794, 'profile_location': 'Seattle, WA', 'created_at': '2009-06-18 21:22:10', 'text': 'How about CTDs on Cormorants? They take some pretty great coastal data. \n\nhttps://t.co/yN4fbbShYu… https://t.co/H7FmqcBUlG', 'retweet

In [5]:
# Save the collected tweets as a csv file on Google Drive.
from google.colab import drive

# The Drive has to be mounted on the Colab VM before anything can be written to it.
drive.mount('/gdrive')

# write the dataframe to the configured path, leaving out the index column
df.to_csv(path_or_buf=output_file, index=False)

Mounted at /gdrive


In [6]:
# Hand the csv over to the browser so it is saved on the local computer.
from google.colab import files

files.download(output_file)

# report completion in the cell output
done_message = "the csv has been downloaded to your local computer. The program has been completed successfully."
print(done_message)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

the csv has been downloaded to your local computer. The program has been completed successfully.
