# Twitter developer API: recent search endpoint 

In [1]:
import os
import json
import requests
import csv
import dateutil
import time
import datetime as dt
import pandas as pd

Load the Bearer token from a local credentials file; it authenticates our requests to the Twitter developer API.

In [2]:
# Read the API credentials from a local JSON file so the token itself never
# appears in the notebook. `cred` is read later by `bearer_oauth`, which looks
# up the "BEARER_TOKEN" key.
with open('credentials.json', 'r') as json_file:
    cred = json.load(json_file)

In [3]:
def bearer_oauth(r):
    """Attach the bearer-token headers to an outgoing request.

    Used as the ``auth`` callable for ``requests.get``: it receives the
    request object, sets its ``Authorization`` and ``User-Agent`` headers
    from the module-level ``cred`` mapping, and returns the same object.
    """
    token = cred["BEARER_TOKEN"]
    auth_headers = {
        'Authorization': f'Bearer {token}',
        'User-Agent': "v2RecentSearchPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r

Define the endpoint that we want to connect to and set up the query

In [4]:
# URL of the recent-search endpoint; every request in this notebook goes here.
search_url = "https://api.twitter.com/2/tweets/search/recent"

In [5]:
# Parameters for a single example request against the recent-search endpoint.
query_params = {
    # Search expression: the phrase or the hashtag, with retweet / language filters
    'query': '(climate change OR #climatechange) -is:retweet lang:en',
    # Only tweets created before this UTC timestamp are returned
    'end_time': '2022-08-20T00:00:00Z',
    # Number of tweets requested in this one call
    'max_results': 100,
    # Extra per-tweet fields to include in each result
    'tweet.fields': 'created_at,public_metrics',
}

In [6]:
def connect_to_endpoint(url, params, next_token=None, timeout=30):
    """Send one GET request to the API and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint URL to query.
    params : dict
        Query-string parameters. The dict is left unmodified; any
        'next_token' entry it holds is ignored in favour of the
        ``next_token`` argument.
    next_token : str or None
        Pagination token taken from a previous response. ``None`` sends the
        request without a token.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The parsed JSON response.

    Raises
    ------
    Exception
        If the HTTP status is not 200; the arguments are the status code and
        the response text.
    """
    # Work on a copy so the caller's dict is not changed between calls.
    request_params = {key: value for key, value in params.items() if key != 'next_token'}
    if next_token is not None:
        request_params['next_token'] = next_token

    response = requests.get(url, auth=bearer_oauth, params=request_params, timeout=timeout)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [7]:
# Run the example query once and preview the result as a table.
json_response = connect_to_endpoint(search_url, query_params)
# A response without matches carries no 'data' key, so fall back to an empty
# list instead of failing with a KeyError.
tweet_data = pd.DataFrame(json_response.get('data', []))
tweet_data.head()

200


Unnamed: 0,created_at,id,public_metrics,text
0,2022-08-19T23:59:59.000Z,1560778635304378369,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",This #SydneyMorningHerald article shows the fo...
1,2022-08-19T23:59:54.000Z,1560778615280861186,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",I think the Obama administration was the last ...
2,2022-08-19T23:59:48.000Z,1560778588642762753,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Biden had “three main domestic policy goals: i...
3,2022-08-19T23:59:17.000Z,1560778460184072192,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@ssrpmcmurphy @davidrvetter Only two types of ...
4,2022-08-19T23:59:16.000Z,1560778454265643009,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Is there still opportunity to stem the tide of...


The Twitter developer API only allows us to extract 100 tweets at a time, so in order to build up an expanded dataset, we define a function that creates a query for different end times within the period that we are interested in. This also allows us to make use of pagination: if more than 100 tweets match a query, the Twitter API returns a next token, which allows us to fetch the next 100 tweets for the same query.

In [8]:
def create_query_params(query, end_date, max_results = 10):
    """Build the query-string parameters for one search request.

    Parameters
    ----------
    query : search expression to send.
    end_date : value used for the 'end_time' parameter.
    max_results : number of tweets to request per call (default 10).

    Returns
    -------
    dict
        A new parameter dict. 'next_token' is included as an empty
        placeholder.
    """
    # Adjust these field selections to suit the endpoint being used
    requested_fields = {
        'tweet.fields': 'created_at,public_metrics',
        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
    }

    return {
        'query': query,
        'end_time': end_date,
        'max_results': max_results,
        **requested_fields,
        'next_token': {},
    }

We create a list of datetime objects for the time period we are interested in, at an hourly interval. Note that in order for the API to successfully return data, these dates need to be within the last 7 days.

In [9]:
def calc_dates_list(start_datetime, end_datetime, delta_hours):
    """Return evenly spaced datetimes from ``start_datetime`` up to, but not
    including, ``end_datetime``.

    Parameters
    ----------
    start_datetime : first datetime in the list.
    end_datetime : exclusive upper bound.
    delta_hours : spacing between consecutive entries, in hours.

    Returns
    -------
    list of datetime.datetime
    """
    step = dt.timedelta(hours=delta_hours)
    # Pull the upper bound back by one second so that end_datetime itself is
    # left out when it falls exactly on a step.
    last_allowed = end_datetime - dt.timedelta(seconds=1)
    stamps = pd.date_range(start=start_datetime, end=last_allowed, freq=step)
    return list(stamps.to_pydatetime())

In [10]:
# Time window to collect, written in the timestamp layout that date_format
# describes; date_format is reused later to turn each datetime back into a
# string for the request.
start_date = '2022-08-21T00:00:00Z'
end_date = '2022-08-21T02:00:00Z'
date_format = "%Y-%m-%dT%H:%M:%SZ"

# One entry per hour, from start_date up to (but not including) end_date
dates_list = calc_dates_list(
    dt.datetime.strptime(start_date, date_format),
    dt.datetime.strptime(end_date, date_format), 
    delta_hours=1
)

In [11]:
# Display the generated datetimes to confirm the range and spacing
dates_list

[datetime.datetime(2022, 8, 21, 0, 0), datetime.datetime(2022, 8, 21, 1, 0)]

We define a function to write data to a csv file

In [12]:
def append_to_csv(json_response, fileName):
    """Append the tweets contained in one API response to a CSV file.

    Each tweet becomes one row with the columns, in order: id, created_at
    (parsed to a datetime), text, like_count, quote_count, reply_count,
    retweet_count. The number of rows written is printed.

    Parameters
    ----------
    json_response : dict
        Decoded API response. Tweets are read from its 'data' list; a
        response without a 'data' key writes no rows.
    fileName : str
        Path of the CSV file. It is opened in append mode and created if it
        does not exist.

    Raises
    ------
    KeyError
        If a tweet lacks 'id', 'created_at', 'text' or one of the four
        'public_metrics' counts.
    """
    # Import the submodule explicitly: a bare `import dateutil` does not
    # guarantee that `dateutil.parser` has been loaded.
    from dateutil import parser as date_parser

    tweets = json_response.get('data', [])

    # Number of rows written from this response
    counter = 0

    # The context manager closes the file even if a row fails part-way through
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        for tweet in tweets:
            metrics = tweet['public_metrics']

            # Column order must match the header row of the target file
            csvWriter.writerow([
                tweet['id'],
                date_parser.parse(tweet['created_at']),
                tweet['text'],
                metrics['like_count'],
                metrics['quote_count'],
                metrics['reply_count'],
                metrics['retweet_count'],
            ])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

Create a skeleton CSV file with a header for each column, corresponding to the data that will be returned by the API

In [13]:
# Create the output file with its header row.
# The header is written only when data.csv is missing or still empty, so
# re-running this cell does not insert a second header row into data that
# has already been collected.

# Headers for the data we want to save; the order matches the rows written by append_to_csv
csv_headers = ['tweet_id', 'created_at', 'tweet', 'like_count', 'quote_count', 'reply_count', 'retweet_count']

if not os.path.exists("data.csv") or os.path.getsize("data.csv") == 0:
    with open("data.csv", "a", newline="", encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerow(csv_headers)

Iterate through the dates in the dates list we have created. We define how many tweets we want to extract per hour with the `max_count` variable; here we have set `max_count = 1000` per hour (noting that only 100 can be extracted at a time, so we have to do 10 queries per hour). Each time data is extracted from the API, it is appended to the CSV file we created.

In [None]:
# Inputs for tweets
keyword = '(#climatechange OR climate change) -is:retweet lang:en'
max_results = 100  # tweets requested per API call
max_count = 1000   # stop paging a time period once this many tweets are stored

# Total number of tweets collected across all time periods
total_tweets = 0

# The loop uses its own names (period_end, period_params) so that it does not
# overwrite the notebook-level `end_date` string and `query_params` dict that
# earlier cells define and rely on when re-run.
for period_end in dates_list:

    print(f'extracting data for {period_end}')

    count = 0          # tweets stored for this time period
    next_token = None  # None requests the first page
    period_params = create_query_params(keyword, period_end.strftime(date_format), max_results)

    # Page through the results until the per-period cap is reached or the API
    # stops handing back a next token.
    while count < max_count:
        print("-------------------")
        print("Token: ", next_token)

        json_response = connect_to_endpoint(search_url, period_params, next_token)
        meta = json_response['meta']
        result_count = meta['result_count']

        # A missing (or null) token means this was the final page
        next_token = meta.get('next_token')
        is_last_page = next_token is None

        if not is_last_page:
            print("Next Token: ", next_token)

        if result_count is not None and result_count > 0:
            if is_last_page:
                print("-------------------")
            append_to_csv(json_response, "data.csv")
            count += result_count
            total_tweets += result_count
            print("Total # of Tweets added: ", total_tweets)
            print("-------------------")
            time.sleep(5)

        # Pause between requests (presumably to stay within the API rate limit)
        time.sleep(30)

        # Final request for this time period: move on to the next one
        if is_last_page:
            break

print("Total number of results: ", total_tweets)

extracting data for 2022-08-21 00:00:00
-------------------
Token:  None
200
Next Token:  b26v89c19zqg8o3fpz8kb67c81kyi3aiucm0yxfucef7h
# of Tweets added from this response:  99
Total # of Tweets added:  99
-------------------
-------------------
Token:  b26v89c19zqg8o3fpz8kb67c81kyi3aiucm0yxfucef7h
200
Next Token:  b26v89c19zqg8o3fpz8kb67c6hoecn1izsuwkl84lezy5
# of Tweets added from this response:  100
Total # of Tweets added:  199
-------------------
-------------------
Token:  b26v89c19zqg8o3fpz8kb67c6hoecn1izsuwkl84lezy5
200
Next Token:  b26v89c19zqg8o3fpz8kb67c1zq7cl2f87vzqte49i35p
# of Tweets added from this response:  100
Total # of Tweets added:  299
-------------------
-------------------
Token:  b26v89c19zqg8o3fpz8kb67c1zq7cl2f87vzqte49i35p
200
Next Token:  b26v89c19zqg8o3fpz8kb67byy9o2avurpx0e8utootbx
# of Tweets added from this response:  100
Total # of Tweets added:  399
-------------------
-------------------
Token:  b26v89c19zqg8o3fpz8kb67byy9o2avurpx0e8utootbx
200
Next 