In [13]:
import tweepy
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

from dotenv import load_dotenv
from __future__ import annotations

# Presumably loads variables from a local .env file into os.environ so that
# BEARER_TOKEN can be read from the environment afterwards — TODO confirm.
load_dotenv()

True

In [14]:
# Read the API bearer token from the environment (None when BEARER_TOKEN is unset).
bearer_token = os.getenv('BEARER_TOKEN')

# Tweepy client authenticated with that bearer token.
client = tweepy.Client(
    bearer_token=bearer_token,
)


The query helper functions below follow this guide: https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

In [15]:
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Returns a dict with a single "Authorization" entry of the form
    "Bearer <bearer_token>".
    """
    return {"Authorization": f"Bearer {bearer_token}"}

In [16]:
def create_url(keyword, start_date=None, end_date=None, max_results = 100):
    """Build the endpoint URL and query parameters for a recent-tweet search.

    Args:
        keyword: Search query string, sent as the 'query' parameter.
        start_date: Optional lower time bound; sent as 'start_time' only when
            given. Presumably an ISO 8601 timestamp, as the API expects —
            see the endpoint reference.
        end_date: Optional upper time bound; sent as 'end_time' only when given.
        max_results: Value sent as 'max_results' (results per page).

    Returns:
        A (search_url, query_params) tuple. query_params always contains a
        'next_token' key (None here) that the request helper overwrites with
        the pagination token for the page being fetched.
    """
    search_url = "https://api.twitter.com/2/tweets/search/recent" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': None} # placeholder; set per request to access the next page of results

    # Only send time bounds the caller actually supplied.
    if start_date is not None:
        query_params['start_time'] = start_date
    if end_date is not None:
        query_params['end_time'] = end_date

    return (search_url, query_params)

TODO: Determine whether `next_token` pagination actually works, and how to prevent duplicate tweets from being pulled in.


In [17]:
def connect_to_endpoint(url, headers, params, next_token = None):
    """Send a GET request to the endpoint and return the decoded JSON body.

    Args:
        url: Endpoint URL to request.
        headers: HTTP headers to send with the request.
        params: Query parameters (the dict produced by create_url). It is
            copied, not modified.
        next_token: Pagination token for the page to fetch; None for the
            first page.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: with (status_code, response_text) as arguments when the
            response status is anything other than 200.
    """
    # Work on a copy so a token from one call never leaks into the caller's
    # dict and, through it, into later calls.
    query_params = dict(params)
    # Always overwrite: the token for this page wins over whatever was in params.
    # NOTE: requests is documented to leave None-valued params out of the URL.
    query_params['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = query_params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [18]:
# Inputs for the request
# Authorization headers built from the bearer token read from the environment.
headers = create_headers(bearer_token)
# Search query string passed to create_url.
keyword = "Roe v Wade lang:en"


In [19]:
# create_url returns a (search_url, query_params) pair; use it to fetch the first page.
url = create_url(keyword)
json_response = connect_to_endpoint(
    url=url[0],
    headers=headers,
    params=url[1],
)

Endpoint Response Code: 200


In [20]:
# Pretty-print the whole response with sorted keys so it is easier to scan.
print(
    json.dumps(json_response, sort_keys=True, indent=4)
)

{
    "data": [
        {
            "author_id": "66443782",
            "conversation_id": "1534182200710729731",
            "created_at": "2022-06-07T14:35:15.000Z",
            "id": "1534182200710729731",
            "lang": "en",
            "public_metrics": {
                "like_count": 0,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 23
            },
            "referenced_tweets": [
                {
                    "id": "1534179877108195331",
                    "type": "retweeted"
                }
            ],
            "reply_settings": "everyone",
            "source": "Twitter Web App",
            "text": "RT @PramilaJayapal: An extremist Supreme Court is poised to strip away abortion rights \u2014 and the rights of LGBTQ+ folks could be next if we\u2026"
        },
        {
            "author_id": "60312823",
            "conversation_id": "1534182196868620288",
            "created_at": "2022-06-

In [21]:
# Number of results retrieved in this response, read from its 'meta' object.
json_response['meta']['result_count']

100

At this point, there are two ways to turn the existing data into a CSV file. One is to load it into pandas and call `df.to_csv`. The article instead uses a "custom method" that takes the response dictionary and creates a separate column for each additional field collected, such as "like_count" and "retweet_count".

In [31]:
# Create data.csv if needed and write the header row exactly once.
# The `with` block closes the file even if writing fails.
with open('data.csv', 'a', newline='', encoding='utf8') as csvFile:
    csvWriter = csv.writer(csvFile)

    # In append mode the stream starts at end-of-file, so position 0 means the
    # file is empty. Skipping the header otherwise keeps re-runs of this cell
    # from inserting duplicate header rows in the middle of the data.
    if csvFile.tell() == 0:
        csvWriter.writerow(['author id', 'conversation_id', 'created_at', 'tweet_id', 'tweet', 'source', 'geo'])

In [27]:
# Print each raw tweet object from the response's 'data' list, one per line.
for item in json_response['data']:
    print(item)

{'id': '1534182200710729731', 'public_metrics': {'retweet_count': 23, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': 'RT @PramilaJayapal: An extremist Supreme Court is poised to strip away abortion rights — and the rights of LGBTQ+ folks could be next if we…', 'source': 'Twitter Web App', 'lang': 'en', 'author_id': '66443782', 'created_at': '2022-06-07T14:35:15.000Z', 'reply_settings': 'everyone', 'referenced_tweets': [{'type': 'retweeted', 'id': '1534179877108195331'}], 'conversation_id': '1534182200710729731'}
{'id': '1534182196868620288', 'public_metrics': {'retweet_count': 364, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': 'RT @KarenAttiah: Between the NYT op-ed endorsing suffering in marriage, to Roe V. Wade being overturned, the glee over the supposed end of…', 'source': 'Twitter for Android', 'lang': 'en', 'author_id': '60312823', 'created_at': '2022-06-07T14:35:14.000Z', 'reply_settings': 'everyone', 'referenced_tweets': [{'type': 'retweeted', 'id

In [32]:
def append_to_csv(json_response, fileName):
    """Append the tweets in one API response to a CSV file.

    One row is written per tweet, in the column order
    author id, conversation_id, created_at, tweet_id, tweet, source, geo.

    Args:
        json_response: Parsed search response (dict). Tweets are read from its
            'data' list; a response without a 'data' key is treated as
            containing no tweets.
        fileName: Path of the target CSV file. It is opened in append mode, so
            it is created when missing and existing rows are kept.

    Returns:
        None. The number of rows written is printed.

    Raises:
        KeyError: if a tweet lacks one of the required fields
            ('author_id', 'created_at', 'id', 'conversation_id', 'text').
    """
    # Placeholder written for optional fields a tweet does not carry.
    missing = " "
    counter = 0

    # Open OR create the target CSV file; `with` guarantees it is closed even
    # if a malformed tweet raises part-way through.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        # Loop through each tweet (none when the response has no 'data' key)
        for tweet in json_response.get('data', []):

            # Required fields
            author_id = tweet['author_id']
            created_at = dateutil.parser.parse(tweet['created_at'])
            tweet_id = tweet['id']
            conversation_id = tweet['conversation_id']
            text = tweet['text']

            # Optional fields: 'geo' may be absent, or present without a
            # 'place_id'; 'source' may be absent as well.
            geo = (tweet.get('geo') or {}).get('place_id', missing)
            source = tweet.get('source', missing)

            # Assemble all data in a list - ordered like the CSV header
            res = [author_id, conversation_id, created_at, tweet_id, text,
                   source, geo]

            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
    

In [33]:
# Defining append_to_csv does nothing by itself: call it with the fetched
# response so the tweets are actually written to the CSV file.
append_to_csv(json_response, fileName='data.csv')

# of Tweets added from this response:  100


To actually run everything, I basically just need to condense the steps above into two: get the `json_response`, then run `append_to_csv` on it.