## Collect daily keyword counts from Twitter

- Code based on: https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a
- Note: can only do 300 requests per 15-minute window (https://developer.twitter.com/en/docs/twitter-api/rate-limits). Given that each requests finds 31 days this means that I can only request one full timeperiod (2006-2022) per 15 minutes. Time to grab a coffee

In [2]:
import requests
import json
from time import perf_counter
from datetime import datetime
import regex as re
import pandas as pd

# I created a .py file where I saved the Twitter bearer token as a string
from twitter_keys import bearer_token
from params import count_file

In [3]:
# save files
metoo_file = 'data/metoo_daily_count.json'
blm_file = 'data/blm_daily_count.json'
black_file = 'data/black_daily_count.json'
oscars_not_blm_file = 'data/oscars_not_blm_daily_count.json'
oscars_file = 'data/oscars_daily_count.json'
metoo2_file = 'data/metoo2_daily_count.json'

In [4]:
# Search parameters
# See: https://developer.twitter.com/en/docs/twitter-api/tweets/counts/api-reference/get-tweets-counts-all

search_url="https://api.twitter.com/2/tweets/counts/all"

keyword_metoo = "#metoo OR metoo" # API isn't case sensitive, 'OR' means any
keyword_metoo2 = "#metoo"
keyword_blm = "#blm OR blm OR #blacklivesmatter OR blacklivesmatter"
keyword_black = "#blacklivesmatter OR blacklivesmatter"
keyword_oscars_not_blm = "(#oscarssowhite OR oscarssowhite) -blm -#blm -blacklivesmatter -#blacklivesmatter"
keyword_oscars = "#oscarssowhite OR oscarssowhite"

start_time = "2006-03-21T00:00:00.000Z" # Day of the firt ever tweet.
end_time = "2022-02-24T00:00:00.000Z"
granularity = "day"

In [5]:
# Functions

def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def create_url(keyword, start_time, end_time, granularity, search_url="https://api.twitter.com/2/tweets/counts/all"):

    #change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'start_time': start_time,
                    'end_time': end_time,
                    'granularity': granularity,
                    'next_token': {}}
    return (search_url, query_params)

def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

headers = create_headers(bearer_token)

In [6]:
# Testing a single use
url = create_url(keyword_metoo, start_time,end_time, granularity)
json_response = connect_to_endpoint(url[0], headers, url[1])

In [7]:
# Looping through full pagination

perf_start = perf_counter()
flag = True
next_token = None
counts = []
url = create_url(keyword_metoo2, start_time,end_time, granularity)
total_tweet_count = 0

while flag:
    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
    result_count = json_response['meta']['total_tweet_count']
    
    counts.extend(json_response['data'])
    total_tweet_count += result_count

    if 'next_token' in json_response['meta']:
        next_token = json_response['meta']['next_token']

    else:            
        flag = False
        next_token = None
        
print(f"Collected {total_tweet_count} tweets in {int(perf_counter()-perf_start)} seconds. Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# #metoo or metoo: collected 27,599,446  tweets in 44 seconds
# #blm OR blm OR #blacklivesmatter OR blacklivesmatter collected 307,992,432  tweets in 57 seconds
# blacklivesmatter OR #blacklivesmatter Collected 67,752,610 tweets in 44 seconds. Finished at 2022-05-11 12:54:27
# oscars_not_blm Collected 769,166 tweets in 42 seconds
# oscarssowhite Collected 776,538 tweets in 40 seconds

Collected 21887375 tweets in 40 seconds. Finished at 2022-05-11 13:21:39


In [8]:
with open(metoo2_file, 'w') as outfile:
    json.dump(counts, outfile)

## Combine into dataframe

In [9]:
metoo = pd.read_json(metoo_file)
metoo2 = pd.read_json(metoo2_file)
blm = pd.read_json(blm_file)
black = pd.read_json(black_file)
oscars_not_blm = pd.read_json(oscars_not_blm_file)
oscars = pd.read_json(oscars_file)

In [10]:
metoo.rename(columns={"tweet_count": "MeToo"}, inplace=True)
metoo2.rename(columns={"tweet_count": "MeTooX"}, inplace=True)
blm.rename(columns={"tweet_count": "BLM"}, inplace=True)
black.rename(columns={"tweet_count": "BlackLivesMatter"}, inplace=True)
oscars_not_blm.rename(columns={"tweet_count": "OscarsSoWhite -BLM"}, inplace=True)
oscars.rename(columns={"tweet_count": "OscarsSoWhite"}, inplace=True)

In [11]:
df = metoo.merge(blm, how='outer', on=['start', 'end'])
df = df.merge(metoo2, how='outer', on=['start', 'end'])
df = df.merge(black, how='outer', on=['start', 'end'])
#df.rename(columns={"tweet_count_x": "MeToo", "tweet_count_y": "BLM"}, inplace=True)

df = df.merge(oscars_not_blm, how='outer', on=['start', 'end'])
df = df.merge(oscars, how='outer', on=['start', 'end'])

df['Date'] = pd.to_datetime(df['start'])
df.drop(columns=['end', 'start'], inplace=True)
#df = df.loc[:, ['Date', 'MeToo', 'BLM', 'Oscars Not BLM']]
df = df.loc[:, ['Date', 'MeToo', 'MeTooX', 'BLM', 'BlackLivesMatter','OscarsSoWhite', 'OscarsSoWhite -BLM']]
#df = dates.merge(df, how='outer', on='Date')
df['Date'] = df['Date'].dt.date
df.fillna(0, inplace=True)
df['MeToo'] = df['MeToo'].astype(int)
df['BLM'] = df['BLM'].astype(int)
df['BlackLivesMatter'] = df['BlackLivesMatter'].astype(int)
df['OscarsSoWhite -BLM'] = df['OscarsSoWhite -BLM'].astype(int)
df['OscarsSoWhite'] = df['OscarsSoWhite'].astype(int)


df.sort_values('Date', ascending=False, inplace=True)

In [12]:
df.head()

Unnamed: 0,Date,MeToo,MeTooX,BLM,BlackLivesMatter,OscarsSoWhite,OscarsSoWhite -BLM
30,2022-02-23,3918,2712,80346,6271,8,8
29,2022-02-22,3586,2508,97034,5930,4,3
28,2022-02-21,4214,2951,82759,6419,13,12
27,2022-02-20,4796,3099,97238,5018,2,2
26,2022-02-19,4919,3265,120806,6588,7,7


In [14]:
df.sort_values('OscarsSoWhite', ascending=False).head(10)

Unnamed: 0,Date,MeToo,MeTooX,BLM,BlackLivesMatter,OscarsSoWhite,OscarsSoWhite -BLM
2184,2016-02-29,476,123,48040,36082,127398,126510
2262,2016-01-14,551,62,14095,3654,50350,50320
2580,2015-01-15,732,54,33449,14061,28246,27402
2201,2016-01-15,459,66,14010,4744,25577,25542
2557,2015-02-23,554,71,25459,8317,23791,22134
2581,2015-01-16,639,77,30987,12582,22720,22451
2205,2016-01-19,204,43,22807,13113,22265,22145
2208,2016-01-22,235,43,20282,7526,22022,21922
2183,2016-02-28,269,48,21740,9654,19159,19073
2207,2016-01-21,419,54,19934,10458,17514,17421


In [13]:
df.corr()

Unnamed: 0,MeToo,MeTooX,BLM,BlackLivesMatter,OscarsSoWhite,OscarsSoWhite -BLM
MeToo,1.0,0.995833,0.005642,0.014798,-0.005485,-0.005481
MeTooX,0.995833,1.0,0.000308,0.011891,-0.005156,-0.00515
BLM,0.005642,0.000308,1.0,0.944746,-0.007761,-0.007923
BlackLivesMatter,0.014798,0.011891,0.944746,1.0,0.001568,0.001425
OscarsSoWhite,-0.005485,-0.005156,-0.007761,0.001568,1.0,0.999942
OscarsSoWhite -BLM,-0.005481,-0.00515,-0.007923,0.001425,0.999942,1.0


In [15]:
df.drop(columns=['MeToo', 'BLM', 'OscarsSoWhite -BLM'], inplace=True)

In [21]:
df.head()

Unnamed: 0,Date,MeToo,BLM,OscarsSoWhite
30,2022-02-23,2712,6271,8
29,2022-02-22,2508,5930,4
28,2022-02-21,2951,6419,13
27,2022-02-20,3099,5018,2
26,2022-02-19,3265,6588,7


In [19]:
df.rename(columns={"MeTooX": "MeToo", "BlackLivesMatter": "BLM"}, inplace=True)

In [23]:
df.to_csv(count_file, index=False)

## Testing keyword functionality
Conclusion: they work as expected.

In [43]:
# Search parameters
search_url = "https://api.twitter.com/2/tweets/search/all"
max_results = 500

params = {
    'query': keyword_metoo,
    'end_time': end_time,
    'max_results': max_results,
    'tweet.fields':'geo,public_metrics,lang'
}


In [44]:
# Testing a single use
json_response = connect_to_endpoint(search_url, headers, params)

In [45]:
print(json.dumps(json_response, indent=4, sort_keys=True))

{
    "data": [
        {
            "id": "1496635900113166343",
            "lang": "und",
            "public_metrics": {
                "like_count": 0,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 0
            },
            "text": "@AdmiralMazenja #metoo"
        },
        {
            "id": "1496635838884724739",
            "lang": "und",
            "public_metrics": {
                "like_count": 1,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 0
            },
            "text": "@mikefreemanNFL #metoo"
        },
        {
            "id": "1496635687474507777",
            "lang": "en",
            "public_metrics": {
                "like_count": 0,
                "quote_count": 0,
                "reply_count": 0,
                "retweet_count": 163
            },
            "text": "RT @JustAMomNamedMP: When I was 13, I told him to stop and he didn\

In [60]:
# Checking that the 'OR' operator works as aspected (i.e., as any 'if any is true'). It does.

for i in json_response['data']:
    j = i['text'].lower()
    if "#metoo" in j:
        if re.search("(?<!#)metoo", j):
            print(i['text'])

RT @freewiseguy: Napišeš dva tvita, ju opremiš z lojtrco #MeToo in obvestiš @ObamaFoundation, da vodiš nacionalno MeToo kampanjo. Nekateri…
RT @freewiseguy: Napišeš dva tvita, ju opremiš z lojtrco #MeToo in obvestiš @ObamaFoundation, da vodiš nacionalno MeToo kampanjo. Nekateri…
RT @remetoo_2018: 季節が変わっていくね。もし、あなたが置いていかれる気分で嫌な感じだったら、私もその気分を知っていると知っていてほしい。それは、世界であなただけが感じる気分じゃない。
そして、それ、いつかちゃんと終わるやつ。

#remetoo #metoo…
RT @remetoo_2018: 季節が変わっていくね。もし、あなたが置いていかれる気分で嫌な感じだったら、私もその気分を知っていると知っていてほしい。それは、世界であなただけが感じる気分じゃない。
そして、それ、いつかちゃんと終わるやつ。

#remetoo #metoo…
RT @freewiseguy: Napišeš dva tvita, ju opremiš z lojtrco #MeToo in obvestiš @ObamaFoundation, da vodiš nacionalno MeToo kampanjo. Nekateri…
RT @freewiseguy: Napišeš dva tvita, ju opremiš z lojtrco #MeToo in obvestiš @ObamaFoundation, da vodiš nacionalno MeToo kampanjo. Nekateri…
RT @freewiseguy: Napišeš dva tvita, ju opremiš z lojtrco #MeToo in obvestiš @ObamaFoundation, da vodiš nacionalno MeToo kampanjo. Nekateri…
RT @freewiseguy: Nap