In [3]:
import pandas as pd
import requests
import os
import json

# Google trends

This is done by using the *pytrends* package. This simply scrapes the google trends website.

In [4]:
#!pip install pytrends
from pytrends.request import TrendReq
# documentation: https://github.com/GeneralMills/pytrends

In [31]:
def get_trends(keyword, time=0):
    connection = TrendReq(hl='en-US', tz=360)
    connection.build_payload(keyword, cat=0, timeframe=time, geo='US', gprop='')
    # which data?
    # DMA = metropolitan, REGION = state
    result = connection.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=False)
    return result

In [128]:
# try to handle exception for:
# ReadTimeout: HTTPSConnectionPool(host='trends.google.com', port=443): Read timed out. (read timeout=5)
def get_time(keyword):
    # startdate = "1/1/2022"
    startdate = "15/4/2022"
    range = pd.date_range(start=startdate,end=pd.to_datetime("today"))
    timeframe = range.astype("str") + " " + range.shift(periods=1).astype("str")
    # this is faster than filling up empty dataframes
    # sources (1): https://gist.github.com/Coldsp33d/ea080f580ab3a3b75c2f40c201d50164
    # sources (2): https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.DataFrame.append.html
    data = []
    for i in timeframe:
        result = get_trends([keyword], i)[keyword].values.tolist()
        data.append(result)
        print(f"Appended {i}")
    dataframe = pd.DataFrame(data).T
    states = get_trends([keyword], "all")[keyword].index
    dataframe.set_index(states, inplace=True)
    dataframe.columns = range
    return dataframe

In [139]:
# Keywords are "subjects" constructed by the Google team
# These consolidate the results of different search terms
# COVID
# this returns results about searches such as "covid", "covid19", "covid variants", etc
print("Getting the COVID dataframe")
covid_df = get_time("Coronavirus disease 2019")
# VACCINES
# this returns results about searches such as "covid vaccine", "booster shot near me", "Pfizer-BioNTech COVID-19 vaccine"
print("\nGetting the Vaccines dataframe")
vaccine_df = get_time("COVID-19-vaccine")

Getting the COVID dataframe
Appended 2022-04-15 2022-04-16
Appended 2022-04-16 2022-04-17
Appended 2022-04-17 2022-04-18
Appended 2022-04-18 2022-04-19
Appended 2022-04-19 2022-04-20
Appended 2022-04-20 2022-04-21
Appended 2022-04-21 2022-04-22
Appended 2022-04-22 2022-04-23
Appended 2022-04-23 2022-04-24

 Getting the Vaccines dataframe
Appended 2022-04-15 2022-04-16
Appended 2022-04-16 2022-04-17
Appended 2022-04-17 2022-04-18
Appended 2022-04-18 2022-04-19
Appended 2022-04-19 2022-04-20
Appended 2022-04-20 2022-04-21
Appended 2022-04-21 2022-04-22
Appended 2022-04-22 2022-04-23
Appended 2022-04-23 2022-04-24


In [None]:
#covid_df
#covid_df.T.stack()
#vaccine_df
#vaccine_df.T.stack()

# Twitter 

Access to tweet counts is restricted for academic research. Therefore, we do not directly use the Twitter API.

In [5]:
datasource = "./data/tweets/"

In [6]:
arr = os.listdir(datasource)

In [8]:
arr[0]

'2021_02.zip'

In [9]:
from zipfile import ZipFile

In [11]:
with ZipFile(datasource+arr[0], 'r') as zipobject:
    filelist = zipobject.namelist()

In [20]:
zipobject = ZipFile(datasource+arr[0], 'r')
filelist = zipobject.namelist()


In [21]:
filelist

['2021_02/',
 '2021_02/2021_february28_march1.csv',
 '__MACOSX/',
 '__MACOSX/2021_02/',
 '__MACOSX/2021_02/._2021_february28_march1.csv',
 '2021_02/2021_february27_february28.csv',
 '__MACOSX/2021_02/._2021_february27_february28.csv',
 '2021_02/2021_february9_february10.csv',
 '__MACOSX/2021_02/._2021_february9_february10.csv',
 '2021_02/2021_february6_february7.csv',
 '__MACOSX/2021_02/._2021_february6_february7.csv',
 '2021_02/2021_february7_february8.csv',
 '__MACOSX/2021_02/._2021_february7_february8.csv',
 '2021_02/2021_february1_february2.csv',
 '__MACOSX/2021_02/._2021_february1_february2.csv',
 '2021_02/2021_february2_february3.csv',
 '__MACOSX/2021_02/._2021_february2_february3.csv',
 '2021_02/2021_february17_february18.csv',
 '__MACOSX/2021_02/._2021_february17_february18.csv',
 '2021_02/2021_february5_february6.csv',
 '__MACOSX/2021_02/._2021_february5_february6.csv',
 '2021_02/2021_february18_february19.csv',
 '__MACOSX/2021_02/._2021_february18_february19.csv',
 '2021_02/2

In [22]:
csv_files = list(filter(lambda f: f.endswith('.csv'), filelist))


In [15]:
import pandas as pd

In [23]:
pd.read_csv(zipobject.open(csv_files[0]))

Unnamed: 0,1365881633195360258,0.02121212121212121
0,1365882679640477696,0.000000
1,1365882680743657473,0.000000
2,1365882681855074317,-0.200000
3,1365882684984201227,0.000000
4,1365882685038727168,0.333333
...,...,...
575,1366240150414573571,0.500000
576,1366240657711407106,-0.151852
577,1366240688556437506,0.095833
578,1366241468004921348,0.000000


In [30]:
!pip install twarc
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.20.0


In [31]:
from twarc import Twarc
from dotenv import load_dotenv

In [33]:
# load your environment variables 
#BASEDIR = os.path.abspath(os.path.dirname(__file__))
#load_dotenv(os.path.join(BASEDIR, '.env'))
load_dotenv('credentials.env')

True

In [36]:
t_inst = Twarc(os.getenv("consumer_key"), 
               os.getenv("consumer_secret"), 
               os.getenv("access_token"), 
               os.getenv("access_token_secret"))

In [38]:
!pip install wget
import wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9680 sha256=9c9dc31f049064f34a2226c058490951538f7481c5accd006853ebbccffed95c
  Stored in directory: /Users/tristanvandevelde/Library/Caches/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [39]:
dataset_URL = "https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2021-01-20/2021-01-20_clean-dataset.tsv.gz?raw=true" #@param {type:"string"}

In [40]:
wget.download(dataset_URL, out='clean-dataset.tsv.gz')

'clean-dataset.tsv.gz'

In [43]:
import shutil
import gzip

In [44]:
#Unzips the dataset and gets the TSV dataset
with gzip.open('clean-dataset.tsv.gz', 'rb') as f_in:
    with open('clean-dataset.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [45]:
#Deletes the compressed GZ file
os.unlink("clean-dataset.tsv.gz")

In [46]:
df = pd.read_csv('clean-dataset.tsv',sep="\t")

In [61]:
df.head()
df["country_code"].unique() # US is in there
df_tweets = df.loc[df["country_code"] == "US"]

In [71]:
df_tweets.head()

Unnamed: 0,tweet_id,date,time,lang,country_code
537,1351758157547249664,2021-01-20,05:07:13,en,US
827,1351758525475688448,2021-01-20,05:08:41,en,US
875,1351758616303263746,2021-01-20,05:09:03,en,US
959,1351758726995169281,2021-01-20,05:09:29,en,US
977,1351758761099194369,2021-01-20,05:09:37,en,US


In [73]:
df_tweets.sort_values(by=["date", "time"], axis=0)
# check timeframe

Unnamed: 0,tweet_id,date,time,lang,country_code
537,1351758157547249664,2021-01-20,05:07:13,en,US
827,1351758525475688448,2021-01-20,05:08:41,en,US
875,1351758616303263746,2021-01-20,05:09:03,en,US
959,1351758726995169281,2021-01-20,05:09:29,en,US
977,1351758761099194369,2021-01-20,05:09:37,en,US
...,...,...,...,...,...
423609,1352118387602718720,2021-01-21,04:58:39,en,US
423678,1352118504263094272,2021-01-21,04:59:07,en,US
424181,1352119098042322944,2021-01-21,05:01:28,und,US
424398,1352119477694042121,2021-01-21,05:02:59,en,US


In [70]:
# this is what we'll put through the API
tweet_IDs = df_tweets["tweet_id"].values
tweet_IDs.size

2183

In [None]:
# fuck this is just today