## **Colab Notebook to extract the data using Twitter API**

### **Installing required libraries**

In [1]:
!pip install python-dotenv
!pip install colab-env

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colab-env
  Downloading colab-env-0.2.0.tar.gz (4.7 kB)
Building wheels for collected packages: colab-env
  Building wheel for colab-env (setup.py) ... [?25l[?25hdone
  Created wheel for colab-env: filename=colab_env-0.2.0-py3-none-any.whl size=3838 sha256=b487807d08a629390446fa05d608d65e137b6fbbb887848e06b24aceeb7c0a0d
  Stored in directory: /root/.cache/pip/wheels/bb/ca/e8/3d25b6abb4ac719ecb9e837bb75f2a9b980430005fb12a9107
Successfully built colab-env
Installing collected packages: colab-env
Successfully installed colab-env-0.2.0


### **Importing the required libraries**

In [41]:
# importing the required libraries
import os
import time
import requests
import dateutil
import pandas as pd

import colab_env

### **Extracting the tweets using Twitter API**<br/>
Since I required the data from the past 10 years, I needed to use the Search All API which was only available to users with academic access. I applied for the access and was approved for it quickly.<br/>
API Documentation can be found here: https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all 

In [42]:
# function for creating the request header
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [67]:
# function for creating the URL with the request query
def create_url(keyword, start_date, end_date, max_results = 500):  
    search_url = "https://api.twitter.com/2/tweets/search/all"
    # setting the parameters for the request query
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,geo.place_id',
                    'tweet.fields': 'author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,referenced_tweets,source,text,withheld',
                    'user.fields': 'created_at,description,id,location,name,protected,username,verified,withheld',
                    'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
                    'next_token': {}}
    return (search_url, query_params)

In [68]:
# function for connecting to the API endpoint
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [1]:
# function to get the required details from the JSON response and add it to the CSV file
def add_response_to_csv(json_response, file_name):
    counter = 0
    csv_file = open(file_name, "a", newline="", encoding='utf-8')
    csv_writer = csv.writer(csv_file)

    # looping through each extracted tweet
    for tweet in json_response['data']:
        author_id = tweet['author_id'] # getting author id
        created_at = dateutil.parser.parse(tweet['created_at']) # gives date of creation of tweet
        # getting the geo location id from where the tweet was made
        if ('geo' in tweet):
            if 'place_id' in tweet['geo']: 
                geo = tweet['geo']['place_id']
            else:
                geo = " "
        else:
            geo = " "
        tweet_id = tweet['id'] # getting the unique identifier for the tweet
        lang = tweet['lang']   # getting the language of the tweet

        # getting the tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']
        followers_count = None
        if 'followers_count' in tweet['public_metrics']:
            followers_count = tweet['public_metrics']['followers_count']

        source = ''
        if 'source' in tweet:
            source = tweet['source'] # getting the source of the tweet
        text = tweet['text']         # getting the text of the tweet

        values = [author_id, created_at, geo, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text, followers_count]
        
        # Append the result to the CSV file
        csv_writer.writerow(values)
        counter += 1

    csv_file.close()
    print("Number of tweets added: ", counter) 

In [70]:
import csv

headers = create_headers(os.getenv("BEARER_TOKEN"))
# preparing the query keyword using the words: climate change and global warming
# and the hashtags: #climatechange and #globalwarming
keyword = "climate change OR global warming OR #climatechange OR #globalwarming lang:en"

# collecting tweets for the past 10 years so creating a list of start and end 
# dates in the format required for the Twitter API request
start_list =    ['2013-01-01T00:00:00.000Z',
                 '2014-01-01T00:00:00.000Z',
                 '2015-01-01T00:00:00.000Z',
                 '2016-01-01T00:00:00.000Z',
                 '2017-01-01T00:00:00.000Z',
                 '2018-01-01T00:00:00.000Z',
                 '2019-01-01T00:00:00.000Z',
                 '2020-01-01T00:00:00.000Z',
                 '2021-01-01T00:00:00.000Z',
                 '2022-01-01T00:00:00.000Z',
                 ]

end_list =    ['2013-12-31T00:00:00.000Z',
                 '2014-12-31T00:00:00.000Z',
                 '2015-12-31T00:00:00.000Z',
                 '2016-12-31T00:00:00.000Z',
                 '2017-12-31T00:00:00.000Z',
                 '2018-12-31T00:00:00.000Z',
                 '2019-12-31T00:00:00.000Z',
                 '2020-12-31T00:00:00.000Z',
                 '2021-12-31T00:00:00.000Z',
                 '2022-10-15T00:00:00.000Z',
                 ]
max_results = 500
#Total number of tweets we collected from the loop
total_tweets = 0

# creating the final CSV file
csv_file = open("data.csv", "a", newline="", encoding='utf-8')
csv_writer = csv.writer(csv_file)

# creating the headers for the columns we want to save in the CSV file
csv_writer.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet', 'followers_count'])
csv_file.close()
authors = []

for i in range(0,len(start_list)):
    count = 0 # counting number of tweets per time period
    max_count = 5000 # maximum number of tweets per time period
    flag = True
    next_token = None
    
    # checking if flag is true
    while flag:
        # checking if max_count has been reached
        if count >= max_count:
            break
        print("Token: ", next_token)
        url = create_url(keyword, start_list[i], end_list[i], max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        result_count = json_response['meta']['result_count']
        authors.extend(json_response['includes']['users'])
        if 'next_token' in json_response['meta']:
            # saving the token to use for next call
            next_token = json_response['meta']['next_token']
            print("Next token: ", next_token)
            if result_count is not None and result_count > 0 and next_token is not None:
                print("Start date: ", start_list[i])
                add_response_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Number of tweets added: ", total_tweets)
                time.sleep(5)                
        # if no next token exists
        else:
            if result_count is not None and result_count > 0:
                print("Start date: ", start_list[i])
                add_response_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Number of tweets added: ", total_tweets)
                time.sleep(5)
            
            # changing flag to false to begin processing for the next time period
            flag = False
            next_token = None
        time.sleep(5)

pd.DataFrame(authors).to_csv("users.csv")
print("Total number of results: ", total_tweets)

Token:  None
Endpoint Response Code: 200
Next token:  1jzu9lk96gu5npw15vhyfo4gx32jynits43icn8fmrgd
Start date:  2013-01-01T00:00:00.000Z
Number of tweets added:  460
Number of tweets added:  460
Token:  1jzu9lk96gu5npw15vhyfo4gx32jynits43icn8fmrgd
Endpoint Response Code: 200
Next token:  1jzu9lk96gu5npw15vhyfntpz9hp82tf6xmlq6kaqzjx
Start date:  2013-01-01T00:00:00.000Z
Number of tweets added:  467
Number of tweets added:  927
Token:  1jzu9lk96gu5npw15vhyfntpz9hp82tf6xmlq6kaqzjx
Endpoint Response Code: 200
Next token:  1jzu9lk96gu5npw15vhyfniw1osp5tonlk9pvgtr18xp
Start date:  2013-01-01T00:00:00.000Z
Number of tweets added:  473
Number of tweets added:  1400
Token:  1jzu9lk96gu5npw15vhyfniw1osp5tonlk9pvgtr18xp
Endpoint Response Code: 200
Next token:  1jzu9lk96gu5npw15vhyfn83mbbzh404ju6acpt5z0jh
Start date:  2013-01-01T00:00:00.000Z
Number of tweets added:  480
Number of tweets added:  1880
Token:  1jzu9lk96gu5npw15vhyfn83mbbzh404ju6acpt5z0jh
Endpoint Response Code: 200
Next token:  1jzu

In [84]:
extracted_tweets_df = pd.read_csv('/content/data.csv')
extracted_tweets_df = extracted_tweets_df.astype({"author id": str})
print(extracted_tweets_df.head(10))
authors_df = pd.read_csv('/content/users.csv')
print(authors_df.head(10))
authors_df = authors_df.astype({"id": str})

    author id                 created_at geo                  id lang  \
0  1242448184  2013-12-30 23:59:40+00:00      417807229534756864   en   
1  1618546020  2013-12-30 23:59:38+00:00      417807221356232705   en   
2    62341471  2013-12-30 23:59:37+00:00      417807218063736832   en   
3    27413507  2013-12-30 23:59:34+00:00      417807202951241729   en   
4    38755317  2013-12-30 23:59:22+00:00      417807153911439360   en   
5  1470065498  2013-12-30 23:59:18+00:00      417807137515900928   en   
6  1004344742  2013-12-30 23:59:08+00:00      417807092653649920   en   
7  1514058516  2013-12-30 23:59:04+00:00      417807076052975616   en   
8   451324442  2013-12-30 23:58:40+00:00      417806978518245376   nl   
9   507874515  2013-12-30 23:58:35+00:00      417806955868987392   en   

   like_count  quote_count  reply_count  retweet_count                source  \
0           0            0            0              0  Twitter for Websites   
1           1            0          

In [85]:
# mapping the data from the authors_df to the extracted_tweets_df to get 
# additional details such as username, name and location of the tweet author.
import numpy as np

extracted_tweets_df['username'] = np.nan
extracted_tweets_df['name'] = np.nan
extracted_tweets_df['location'] = np.nan

for i, row in extracted_tweets_df.iterrows():
    author_id = row['author id']
    author_info = authors_df[authors_df['id']==author_id]
    if author_info.shape[0] > 0:
        print(i)
        author_info = author_info.iloc[0]
        extracted_tweets_df.loc[(extracted_tweets_df['author id']==author_id), ['username', 'name', 'location']] = [author_info['username'], author_info['name'], author_info['location']]

extracted_tweets_df.to_csv("twitter_climate_keyword_extracted_data.csv")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
47845
47846
47847
47848
47849
47850
47851
47852
47853
47854
47855
47856
47857
47858
47859
47860
47861
47862
47863
47864
47865
47866
47867
47868
47869
47870
47871
47872
47873
47874
47875
47876
47877
47878
47879
47880
47881
47882
47883
47884
47885
47886
47887
47888
47889
47890
47891
47892
47893
47894
47895
47896
47897
47898
47899
47900
47901
47902
47903
47904
47905
47906
47907
47908
47909
47910
47911
47912
47913
47914
47915
47916
47917
47918
47919
47920
47921
47922
47923
47924
47925
47926
47927
47928
47929
47930
47931
47932
47933
47934
47935
47936
47937
47938
47939
47940
47941
47942
47943
47944
47945
47946
47947
47948
47949
47950
47951
47952
47953
47954
47955
47956
47957
47958
47959
47960
47961
47962
47963
47964
47965
47966
47967
47968
47969
47970
47971
47972
47973
47974
47975
47976
47977
47978
47979
47980
47981
47982
47983
47984
47985
47986
47987
47988
47989
47990
47991
47992
47993
47994
47995
47996
47997
47998
47999
48000