In [256]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

In [257]:
os.environ['TOKEN'] = ''

In [258]:
def auth():
    return os.getenv('TOKEN')

In [259]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [260]:
def create_url(keyword, start_date, end_date, max_results = 10):
    
    search_url = "https://api.twitter.com/2/tweets/search/all" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions':  'author_id,in_reply_to_user_id,geo.place_id,attachments.media_keys,entities.mentions.username',
                    'tweet.fields': 'attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'media.fields': 'media_key,preview_image_url,public_metrics,type,url',
                    'place.fields':'country,full_name,name,place_type',
                    'next_token': {}}
    return (search_url, query_params)

In [261]:
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [262]:
#Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = 'AzimioLaUmoja OR azimio OR azimiostrong OR UDAkwanza OR elections22 OR Elections2022 OR IEBC OR kenyakwanza has:media has:mentions'
start_time = "2020-01-01T00:00:00.000Z"
end_time = "2020-12-31T00:00:00.000Z"
max_results = 500

In [263]:
url = create_url(keyword, start_time,end_time, max_results)
json_response = connect_to_endpoint(url[0], headers, url[1])

Endpoint Response Code: 200


In [264]:
print(json.dumps(json_response, indent=4, sort_keys=True))

{
    "data": [
        {
            "author_id": "814949250941546496",
            "conversation_id": "1344431631118049284",
            "created_at": "2020-12-30T23:54:13.000Z",
            "entities": {
                "annotations": [
                    {
                        "end": 81,
                        "normalized_text": "IEBC",
                        "probability": 0.7527,
                        "start": 78,
                        "type": "Organization"
                    }
                ],
                "mentions": [
                    {
                        "end": 17,
                        "id": "167464191",
                        "start": 3,
                        "username": "WaihigaMwaura"
                    }
                ]
            },
            "id": "1344431631118049284",
            "lang": "en",
            "possibly_sensitive": false,
            "public_metrics": {
                "like_count": 0,
                "quote_count": 0,


In [265]:
json_response['data'][0]['created_at']

'2020-12-30T23:54:13.000Z'

In [266]:
json_response['meta']['result_count']

440

In [267]:
# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

#Create headers for the data you want to save, in this example, we only want save these columns in our dataset
headers=['created at','id','author id','Author name','text','lang','like_count', 'quote_count', 'reply_count','retweet_count','attachments','geo','source']
#csvWriter.writerow(['author id', 'created_at', 'geo','Location' 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
csvWriter.writerow(headers)
csvFile.close()

In [268]:
json_response['data']

[{'created_at': '2020-12-30T23:54:13.000Z',
  'public_metrics': {'retweet_count': 369,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0},
  'reply_settings': 'everyone',
  'entities': {'mentions': [{'start': 3,
     'end': 17,
     'username': 'WaihigaMwaura',
     'id': '167464191'}],
   'annotations': [{'start': 78,
     'end': 81,
     'probability': 0.7527,
     'type': 'Organization',
     'normalized_text': 'IEBC'}]},
  'possibly_sensitive': False,
  'referenced_tweets': [{'type': 'retweeted', 'id': '1344196178968240128'}],
  'lang': 'en',
  'conversation_id': '1344431631118049284',
  'author_id': '814949250941546496',
  'text': 'RT @WaihigaMwaura: 139,475 Kenyans applied for the position of clerks for the IEBC verification process, 400 were selected.\n#StateOfTheNati…',
  'source': 'Twitter for Android',
  'id': '1344431631118049284'},
 {'attachments': {'media_keys': ['3_1344430621955862530']},
  'created_at': '2020-12-30T23:50:13.000Z',
  'public_metrics': {'retweet

In [253]:
def append_to_csv(json_response, fileName):

    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    
    includes=json_response['includes']

    users=includes['users']
    users = {user["id"]: user for user in users}

    #medias = includes['media']
    #mediass = {media['media_key']: media for media in medias}
    media = {m["media_key"]: m for m in includes['media']}

    #Loop through each tweet
    for tweet in json_response['data']:
        
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # Author 
        author_id = tweet['author_id']
        author_name=users[author_id]['name']

        #  Time created
        created_at = dateutil.parser.parse(tweet['created_at'])
        
        #attachments
        if('attachments' in tweet):
            attachment=tweet['attachments']
#             media_keys = attachment['media_keys']
        
#             media_key=media[media_keys[0]].media_key
#             media_url=media[media_keys[0]].url or media[media_keys[0]].preview_image_url
#             media_type=media[media_keys[0]].type
        else:
            attachment=" "
#             media_keys=" "

        # Geolocation
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
#             name=geo['full_name']
        else:
            geo = " "

        # Tweet ID
        tweet_id = tweet['id']
        

        # Language
        lang = tweet['lang']

        # Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        # source
        source = tweet['source']

        # Tweet text
        text = tweet['text']
        
        # Assemble all data in a list
#         headers=['created at','id','author id','Author name','text','lang','like_count', 'quote_count', 
#                  'reply_count','retweet_count','attachments',
#                  'Media key','Media url','geo','Location','source']
        res = [created_at,tweet_id,author_id,author_name,text,lang,like_count, quote_count, reply_count,
               retweet_count,attachment,geo,source]
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1

    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

In [269]:
append_to_csv(json_response, "data.csv")

# of Tweets added from this response:  440


In [270]:
#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
#hashtags related to 2022 general elections-AzimioLaUmoja, azimio, azimiostrong, UDAkwanza, 
#elections22, Elections2022,IEBC,kenyakwanza,
keyword = 'AzimioLaUmoja OR azimio OR azimiostrong OR UDAkwanza OR elections22 OR Elections2022 OR IEBC OR kenyakwanza'
#elections22, Elections2022,IEBC,kenyakwanza"
start_list =    ['2021-01-01T00:00:00.000Z',
                 '2021-01-15T00:00:00.000Z',
                 '2021-02-01T00:00:00.000Z',
                 '2021-02-15T00:00:00.000Z',
                 '2021-03-01T00:00:00.000Z',
                 '2021-03-15T00:00:00.000Z',
                 '2021-04-01T00:00:00.000Z',
                 '2021-04-15T00:00:00.000Z',
                 '2021-05-01T00:00:00.000Z',
                 '2021-05-15T00:00:00.000Z',
                 '2021-06-01T00:00:00.000Z',
                 '2021-06-15T00:00:00.000Z',
                 '2021-07-01T00:00:00.000Z',
                 '2021-07-15T00:00:00.000Z',
                 '2021-08-01T00:00:00.000Z',
                 '2021-08-15T00:00:00.000Z',
                 '2021-09-01T00:00:00.000Z',
                 '2021-09-15T00:00:00.000Z',
                 '2021-10-01T00:00:00.000Z',
                 '2021-10-15T00:00:00.000Z',
                 '2021-11-01T00:00:00.000Z',
                 '2021-11-15T00:00:00.000Z',
                 '2021-12-01T00:00:00.000Z',
                 '2021-12-15T00:00:00.000Z',
                 '2022-01-01T00:00:00.000Z',
                 '2022-01-15T00:00:00.000Z',
                 '2022-02-01T00:00:00.000Z',
                 '2022-02-15T00:00:00.000Z',
                 '2022-03-01T00:00:00.000Z',
                 '2022-03-15T00:00:00.000Z',
                 '2022-04-01T00:00:00.000Z',
                 '2022-04-15T00:00:00.000Z',
                 '2022-05-01T00:00:00.000Z'
                
                ]

end_list =      [
                '2021-01-14T23:59:45.000Z',
                '2021-01-31T00:00:00.000Z',
                '2021-02-14T23:59:45.000Z',
                '2021-02-28T00:00:00.000Z',
                '2021-03-14T23:59:45.000Z',
                '2021-03-31T00:00:00.000Z',
                '2021-04-14T23:59:45.000Z',
                '2021-04-30T00:00:00.000Z',
                '2021-05-14T23:59:45.000Z',
                '2021-05-31T00:00:00.000Z',
                '2021-06-14T23:59:45.000Z',
                '2021-06-30T00:00:00.000Z',
                 '2021-07-14T23:59:45.000Z',
                 '2021-07-31T00:00:00.000Z',
                '2021-08-14T23:59:45.000Z',
                 '2021-08-31T00:00:00.000Z',
                '2021-09-14T23:59:45.000Z',
                 '2021-09-30T00:00:00.000Z',
                 '2021-10-14T23:59:45.000Z',
                 '2021-10-31T00:00:00.000Z',
                '2021-11-14T23:59:45.000Z',
                 '2021-11-30T00:00:00.000Z',
                '2021-12-14T23:59:45.000Z',
                 '2021-12-31T00:00:00.000Z',
                '2022-01-14T23:59:45.000Z',
                 '2022-01-31T00:00:00.000Z',
                 '2022-02-14T23:59:45.000Z',
                 '2022-02-28T00:00:00.000Z',
                '2022-03-14T23:59:45.000Z',
                 '2022-03-31T00:00:00.000Z',
                '2022-04-14T23:59:45.000Z',
                 '2022-04-30T00:00:00.000Z',
                 '2022-05-14T00:00:00.000Z'
                ]
max_results = 500

#Total number of tweets we collected from the loop
total_tweets = 0

# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

csvWriter.writerow(['created at','id','author id','Author name','text','lang','like_count', 'quote_count', 'reply_count','retweet_count','attachments','geo','source'])
#csvWriter.writerow(['author id', 'created_at', 'geo','Location' 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
#csvWriter.writerow(headers)

#Create headers for the data you want to save, in this example, we only want save these columns in our dataset
#csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
csvFile.close()

for i in range(0,len(start_list)):

    # Inputs
    count = 0 # Counting tweets per time period
    max_count = 10000 # Max tweets per time period
    flag = True
    next_token = None
    
    # Check if flag is true
    while flag:
        # Check if max_count reached
        if count >= max_count:
            break
        print("-------------------")
        print("Token: ", next_token)
        url = create_url(keyword, start_list[i],end_list[i], max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        result_count = json_response['meta']['result_count']

        if 'next_token' in json_response['meta']:
            # Save the token to use for next call
            next_token = json_response['meta']['next_token']
            print("Next Token: ", next_token)
            if result_count is not None and result_count > 0 and next_token is not None:
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5)                
        # If no next token exists
        else:
            if result_count is not None and result_count > 0:
                print("-------------------")
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(15)
            
            #Since this is the final request, turn flag to false to move to the next time period.
            flag = False
            next_token = None
        time.sleep(5)
print("Total number of results: ", total_tweets)

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshvhgw8el1e4oos0oet1gepzfwjh
Start Date:  2021-01-01T00:00:00.000Z
# of Tweets added from this response:  480
Total # of Tweets added:  480
-------------------
-------------------
Token:  b26v89c19zqg8o3foshvhgw8el1e4oos0oet1gepzfwjh
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshvhetaybrvoez07thvb6u3nd9j1
Start Date:  2021-01-01T00:00:00.000Z
# of Tweets added from this response:  419
Total # of Tweets added:  899
-------------------
-------------------
Token:  b26v89c19zqg8o3foshvhetaybrvoez07thvb6u3nd9j1
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshvhcp6fdeec3rm65tsxrki5zuyl
Start Date:  2021-01-01T00:00:00.000Z
# of Tweets added from this response:  465
Total # of Tweets added:  1364
-------------------
-------------------
Token:  b26v89c19zqg8o3foshvhcp6fdeec3rm65tsxrki5zuyl
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3foshv24weqykjjh58ep84fh3vnkb5p
Sta