This notebook is for drafting the twitter.py file.

In [19]:
import json 
import pandas as pd
import numpy as np
import requests
import time
from tqdm import tqdm

# Read API key credentials
api = pd.read_json('~/Coding/APIs/twitter_keys.json')['DSAN_501-project']

# Save credentials
consumer_key=api.loc["key"]
consumer_secret=api.loc["key_secret"]
access_token=api.loc["access_token"]
access_token_secret=api.loc["access_token_secret"]
bearer_token=api.loc["bearer_token"]

# PREPARE AND ORGANIZE DATA
# Read in dataset
incidents = pd.read_csv('../../../../data/02-clean-data/SSDB/incident.csv')
incidents['Date'] = pd.to_datetime(incidents['Date'])

# Filter out incidents that occurred before 2006-03-21 (day of first ever tweet)
incidents_twitter = incidents.copy()[incidents['Date'] > pd.to_datetime('2006-03-21 15:50:00')]

# Add new data column marking one week after the incident
incidents_twitter['Date_Week_Later'] = incidents_twitter['Date'] + pd.Timedelta(days=7)

# Convert date columns to YYYY-MM-DDTHH:mm:ssZ (ISO 8601/RFC 3339) for API requests
incidents_twitter['Query_Date'] = incidents_twitter['Date'].dt.strftime('%Y-%m-%d') + "T00:00:00Z"
incidents_twitter['Query_Date_Week_Later'] = incidents_twitter['Date_Week_Later'].dt.strftime('%Y-%m-%d') + "T00:00:00Z"

# Save incidents_twitter as csv
incidents_twitter.to_csv('./../../../../data/01-modified-data/SSDB/incident_twitter.csv')

# Replace & with and in school names
new_schools = [str(school).replace('&', 'and') for school in incidents_twitter['School']]

# Construct get request parameters, save IDs to list
ids = incidents_twitter['Incident_ID']
queries = "-is:retweet ((school (shooting OR shootings)) OR (" + incidents_twitter['City'] + " (shooting OR shootings)) OR (" + incidents_twitter['School'] + "))"
start_times = incidents_twitter['Query_Date']
end_times = incidents_twitter['Query_Date_Week_Later']

# Specify relative filepath for outputs
rel_filepath = "../../../../data/00-raw-data/Twitter/"


# DEFINE FUNCTIONS FOR API REQUESTS
def generate_url(query, start_time, end_time, max_results=500, next_token=False):
    if (next_token == False):
        url = "https://api.twitter.com/2/tweets/search/all?query={}&start_time={}&end_time={}&tweet.fields=text,author_id,created_at,geo,lang&max_results={}".format(query, start_time, end_time, max_results)
    else:
        url = "https://api.twitter.com/2/tweets/search/all?next_token={}&query={}&start_time={}&end_time={}&tweet.fields=text,author_id,created_at,geo,lang&max_results={}".format(next_token, query, start_time, end_time, max_results)
    return url

headers = {"Authorization": "Bearer {}".format(bearer_token)}

def search_twitter(url):
    response = requests.request("GET", url, headers=headers)

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

num_requests = 0
num_results = 0

max_inc_results = 600 # 7000

# 2 limits:
# 1. 300 requests / 15 min
# 2. 1 request / sec

# BEGIN API REQUESTS
for i, (inc_id, query, start_t, end_t) in enumerate(zip(ids[0:2], queries[0:2], start_times[0:2], end_times[0:2])):
    # Print status
    print("Requesting tweets for incident ID " + inc_id + "...")

    # Initialize tracking variables
    next_page = True # Track pagination
    num_page = 0 # Track pagination numbers
    nt = False # Set nt (next token) as False for first request of query

    # Start "timers" for regulation of limits
    limit1_start = time.time() # Do not exceed 300 requests / 15 min
    limit2_start = time.time() # Do not exceed 1 request / second

    while (next_page == True):
        # Generate URL (nt will be false on first and last requests of the query)
        query_url = generate_url(query=query, start_time=start_t, end_time=end_t, max_results=500, next_token=nt)


        # Make request
        json_response = search_twitter(url=query_url)

        # Track metadata
        num_results = num_results + json_response['meta']['result_count']
        try:
            nt = json_response['meta']['next_token']
            num_page += 1
        except KeyError:
            nt = False
            next_page = False

        # Track progress
        num_requests += 1

        # Save response
        output = json.dumps(json_response, indent=4, sort_keys=True)

        with open(rel_filepath + inc_id + '_' + str(num_page) + '.json', 'w') as outfile:
            outfile.write(output)

        # Check limits
        # Limit 1: 300 requests / 15 min
        if (num_requests >= 300) or ((time.time() - limit1_start) > 14*60):
            # Sleep ten minutes (1 request / sec for 300 requests is 5 min)
            print("10 minute sleep")
            for i in tqdm(range(10*60)):
                time.sleep(1)
            
            # Reset request count and "timer"
            num_requests = 0
            limit1_start = time.time()

        # Limit 2: 1 request / sec
        if (time.time() - limit2_start <= 1):
            time.sleep(1 - (time.time() - limit2_start) )

        limit2_start = time.time() # Reset "timer"

        # Check if tweet count has exceeded max_inc_results
        if (max_inc_results < num_results):
            next_page = False
            num_results = 0
        


print("All requests complete.")


Requesting tweets for incident ID 20220620ILGRC...
Requesting tweets for incident ID 20220613WAMAE...
All requests complete.


In [21]:
len(ids)

1184

In [63]:
ids.tolist().index('20210921OHMIM') # Check this incident at end

246

In [91]:
ids.tolist().index('20140911UTWET')

917

In [66]:
ids[404:]

404     20210104NYPEJ
405     20201229WAROS
406     20201225FLYOP
407     20201216ILDUC
408     20201211VATAY
            ...      
1179    20060522SCBUI
1180    20060505FLPAM
1181    20060424NCEAC
1182    20060418TXWEH
1183    20060405DCROW
Name: Incident_ID, Length: 780, dtype: object

In [82]:
queries[509]

'-is:retweet ((school (shooting OR shootings)) OR (Chicago (shooting OR shootings)) OR (Lindblom Math & Science Academy High School))'

In [85]:
new_schools = [str(school).replace('&', 'and') for school in incidents_twitter['School']]

new_schools[509]

'Lindblom Math and Science Academy High School'