# Tutorial Outline

### Twitter Data Collection
1. twarc2 for data collection
2. Scraping Twitter with Selenium

### Reddit Data Collection
1. Reddit Data Using Selenium
2. Reddit Data Using PRAW 
- Is PRAW access changing?
3. Other options for Reddit (Pushshift files for past data)

### Mastodon Data Collection
1. Mastodon Data using httpx + Mastodon API
2. Mastodon Data using tweepy-mastodon wrapper for Mastodon API

### Twitter Data Collection using twarc2

#### What you'll need? 
1. Twitter API credentials (at least the Bearer token)
2. Academic API credentials (only for full archive access)

#### Setp 1: Setting up twarc: https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/#configure

#### Step 2: Define functions to use twarc2 with custom parameters for tweet, user, and search queries

In [18]:
import os
import pandas as pd

In [31]:
def search(par_directory, query, counts=True, archive=True):
    start_date = '2023-01-01'
    end_date = '2023-06-01'
    out_file = par_directory + query + '.jsonl'
    print(query)
    if counts:
        out_file = '../counts/{}_counts.jsonl'.format(query)
        if archive:
            !twarc2 counts --archive --start-time {start_date} --end-time {end_date} {query} {out_file}
        else:
            !twarc2 counts --start-time {start_date} --end-time {end_date} {query} {out_file}
    else:
        if archive:
            !twarc2 search --archive --start-time {start_date} --end-time {end_date} --user-fields "created_at,description,id,location,protected,public_metrics,url,username,verified" {query} {out_file}
        else:
            !twarc2 search --start-time {start_date} --end-time {end_date} --user-fields "created_at,description,id,location,protected,public_metrics,url,username,verified" {query} {out_file}

def get_tweets(par_directory, screen_name, counts=True):
    start_date = '2022-01-01'
    end_date = '2023-06-01'
    out_file = par_directory + screen_name + '.jsonl'
    query = 'from:'+screen_name
    print(query)
    if counts:
        out_file = '../counts/{}_counts.jsonl'.format(screen_name)
        !twarc2 counts --archive --start-time {start_date} --end-time {end_date} {query} {out_file}
    else:
        !twarc2 search --archive --start-time {start_date} --end-time {end_date} --user-fields "created_at,description,id,location,protected,public_metrics,url,username,verified" {query} {out_file}
    
def get_tweets_from_userid(par_directory, user_id):
    start_date = '2022-01-01'
    end_date = '2022-11-10'
    out_file = par_directory + user_id + '.jsonl'
    query = 'from:'+user_id
#     print(query)
    !twarc2 search --archive --start-time {start_date} --end-time {end_date} --user-fields "created_at,description,id,location,protected,public_metrics,url,username,verified" {query} {out_file}
    
    
def get_tweets_that_mention_user(par_directory, screen_name, counts=True):
    start_date = '2023-02-21'
    end_date = '2023-02-28'
    out_file = par_directory + screen_name + '.jsonl'
    query = """'@{} -from:{}'""".format(screen_name, screen_name)
    if counts:
        out_file = '../counts/{}_counts.jsonl'.format(screen_name)
        !twarc2 counts --archive --start-time {start_date} --end-time {end_date} {query} {out_file}
    else:
        !twarc2 search --archive --start-time {start_date} --end-time {end_date} --user-fields "created_at,description,id,location,protected,public_metrics,url,username,verified" {query} {out_file}
    
def get_conversation(par_directory, screen_name, conversation_id):
    out_dir = par_directory + screen_name + '/'
    if not screen_name in os.listdir(par_directory):
        os.mkdir(out_dir)
    out_file = out_dir + conversation_id + '.jsonl'
    if conversation_id + '.jsonl' in os.listdir(out_dir):
        return
    !twarc2 conversation --archive {conversation_id} {out_file}    
    
def get_retweeters_of_tweet(par_directory, tweet_id):
    out_file = par_directory + tweet_id + '.jsonl'
    !twarc2 retweeted-by --archive {tweet_id} {out_file}
    
def get_likers_of_tweet(par_directory, tweet_id):
    out_file = par_directory + tweet_id + '.jsonl'
    !twarc2 liking-users --archive {tweet_id} {out_file}
    
def get_likes_by_user(par_directory, userid):
    out_file = par_directory + userid + '.jsonl'
    !twarc2 liked-tweets --archive {userid} {out_file}
    
def get_friends_of_user(par_directory, screen_name):
    out_file = par_directory + screen_name + '.jsonl'
    !twarc2 following {screen_name} {out_file}

def get_followers_of_user(par_directory, screen_name):
    out_file = par_directory + screen_name + '.jsonl'
    !twarc2 followers {screen_name} {out_file}
    
def get_followersf_user(par_directory, tweet_id):
    out_file = par_directory + tweet_id + '.jsonl'
    !twarc2 quotes {tweet_id} {out_file}
    
def extract_tweets(df):
    L = df.data.tolist()
    tweets = []
    for l in L:
        tweets.extend(l)
    return pd.DataFrame(tweets)

In [10]:
par_dir = '../counts/'
# os.mkdir(par_dir)
get_tweets(par_dir, 'RahulGandhi')

from:RahulGandhi
100%|█████████████| Processed 4 months/4 months [00:06<00:00, 215 tweets total ]


In [11]:
query = 'from:RahulGandhi'
out_file = '../results/RahulGandhi.json'
!twarc2 search --user-fields "created_at,description,id,location,protected,public_metrics,url,username,verified" {query} {out_file}

100%|██████████████████| Processed 6 days/6 days [00:00<00:00, 16 tweets total ]


In [33]:
par_dir = '../results/'
get_tweets(par_dir, 'JoeBiden', counts=False)

from:JoeBiden
100%|█| Processed 1 year, 4 months/1 year, 4 months [00:28<00:00, 1362 tweets to


In [14]:
search(par_dir, 'from:RahulGandhi')

from:RahulGandhi
100%|█████████████| Processed 4 months/4 months [00:06<00:00, 215 tweets total ]


In [15]:
search(par_dir, 'from:RahulGandhi', counts=False)

from:RahulGandhi
100%|█████████████| Processed 4 months/4 months [00:05<00:00, 215 tweets total ]


In [16]:
os.listdir(par_dir)

['anmolpanda_.jsonl',
 'counts',
 'from:RahulGandhi.jsonl',
 'JoeBiden.jsonl',
 'RahulGandhi.json']

In [21]:
df_data = pd.read_json(par_dir + 'from:RahulGandhi.jsonl', lines=True)
df = extract_tweets(df_data)
df

Unnamed: 0,public_metrics,lang,edit_history_tweet_ids,entities,author_id,context_annotations,conversation_id,created_at,id,text,attachments,possibly_sensitive,reply_settings,referenced_tweets,in_reply_to_user_id
0,"{'retweet_count': 13760, 'reply_count': 3072, ...",hi,[1663945499379122179],"{'urls': [{'start': 125, 'end': 148, 'url': 'h...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1663945499379122179,2023-05-31T16:28:16.000Z,1663945499379122179,"कुछ लोग मानते हैं उन्हें 'सब' पता है।\n\nमगर, ...",{'media_keys': ['13_1663944757775826944']},False,everyone,,
1,"{'retweet_count': 13615, 'reply_count': 2630, ...",hi,[1663793888883261440],"{'urls': [{'start': 93, 'end': 116, 'url': 'ht...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1663793888883261440,2023-05-31T06:25:49.000Z,1663793888883261440,भारत जोड़ो यात्रा का संदेश - साथ चलो और खोलते ...,{'media_keys': ['13_1663792796032196609']},False,everyone,,
2,"{'retweet_count': 4252, 'reply_count': 0, 'lik...",en,[1663743998907826180],"{'annotations': [{'start': 89, 'end': 112, 'pr...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1663743998907826180,2023-05-31T03:07:34.000Z,1663743998907826180,RT @INCIndia: Shri @RahulGandhi Interacts with...,,False,everyone,"[{'type': 'retweeted', 'id': '1663724307212824...",
3,"{'retweet_count': 12780, 'reply_count': 2059, ...",hi,[1663456870852608002],"{'urls': [{'start': 103, 'end': 126, 'url': 'h...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1663456870852608002,2023-05-30T08:06:38.000Z,1663456870852608002,"दिवाली पर भी न बोनस पाते हैं, न घर जा पाते हैं...",{'media_keys': ['13_1663456294144196609']},False,everyone,,
4,"{'retweet_count': 3882, 'reply_count': 280, 'l...",en,[1663402066457563136],"{'urls': [{'start': 198, 'end': 221, 'url': 'h...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1663402066457563136,2023-05-30T04:28:51.000Z,1663402066457563136,"Blessed with natural beauty, rich history and ...",{'media_keys': ['3_1663402059033464835']},False,everyone,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,"{'retweet_count': 4115, 'reply_count': 0, 'lik...",hi,[1610269329203036161],"{'urls': [{'start': 112, 'end': 135, 'url': 'h...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1610269329203036161,2023-01-03T13:38:20.000Z,1610269329203036161,RT @bharatjodo: पूर्व रॉ चीफ ए एस दुलत ने आज भ...,{'media_keys': ['3_1610228283974766592']},False,everyone,"[{'type': 'retweeted', 'id': '1610237220195303...",
211,"{'retweet_count': 8952, 'reply_count': 0, 'lik...",qme,[1610269143156297731],"{'urls': [{'start': 19, 'end': 42, 'url': 'htt...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1610269143156297731,2023-01-03T13:37:35.000Z,1610269143156297731,RT @INCIndia: ❤️❤️ https://t.co/9MIQKMIdAQ,{'media_keys': ['7_1610222126799417344']},False,everyone,"[{'type': 'retweeted', 'id': '1610222430043406...",
212,"{'retweet_count': 9867, 'reply_count': 1285, '...",hi,[1610245934595846149],"{'urls': [{'start': 175, 'end': 198, 'url': 'h...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1610245934595846149,2023-01-03T12:05:22.000Z,1610245934595846149,"गंगा-जमुनी तहज़ीब की जन्मभूमि, जिसका इतिहास और...",{'media_keys': ['3_1610245914748387328']},False,everyone,,
213,"{'retweet_count': 6574, 'reply_count': 944, 'l...",en,[1609772217906393088],"{'annotations': [{'start': 5, 'end': 7, 'proba...",3171712086,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1609772217906393088,2023-01-02T04:42:59.000Z,1609772217906393088,"‘Hey Ram’, Khadi, Films and how only India, no...",{'media_keys': ['13_1609770596837240837']},False,everyone,,


In [22]:
df.text.tolist()

["कुछ लोग मानते हैं उन्हें 'सब' पता है।\n\nमगर, देश को आगे बढ़ाने के लिए सुनना, समझना और सीखना ज़रूरी है - यही भारतीय सभ्यता है। https://t.co/q3LCnsEfr3",
 'भारत जोड़ो यात्रा का संदेश - साथ चलो और खोलते जाओ, ‘नफ़रत के बाज़ार में मोहब्बत की दुकानें’। https://t.co/gVLD8ERUkX',
 'RT @INCIndia: Shri @RahulGandhi Interacts with activists, academics and civil society at University of California, Santa Cruz. https://t.co…',
 'दिवाली पर भी न बोनस पाते हैं, न घर जा पाते हैं - त्याग और तपस्या से भरी है ट्रक ड्राइवरों की ज़िंदगी।\n\nhttps://t.co/2O2eYxuj0P https://t.co/8DIr2o0TTK',
 "Blessed with natural beauty, rich history and a vibrant culture, Goa is a precious jewel of India's treasured land of diversity.\n\nMy heartfelt greetings to the people of Goa on their statehood day. https://t.co/3XMnCKomVc",
 '6 घंटो की दिल्ली-चंडीगढ़ यात्रा में ट्रक ड्राइवरों के साथ दिलचस्प बातचीत!\n\n24 घंटे सड़कों पर बिताकर, वो भारत के हर कोने को जोड़ते हैं।\n\nपूरा वीडियो यूट्यूब पर:\nhttps://t.co/2O2eYxuj0P htt

In [24]:
df.columns

Index(['public_metrics', 'lang', 'edit_history_tweet_ids', 'entities',
       'author_id', 'context_annotations', 'conversation_id', 'created_at',
       'id', 'text', 'attachments', 'possibly_sensitive', 'reply_settings',
       'referenced_tweets', 'in_reply_to_user_id'],
      dtype='object')

In [25]:
df.id.tolist()

['1663945499379122179',
 '1663793888883261440',
 '1663743998907826180',
 '1663456870852608002',
 '1663402066457563136',
 '1663151329240567808',
 '1662759636200722433',
 '1662711995660107778',
 '1662321352811114496',
 '1662035737725009920',
 '1661709615636029442',
 '1661283364803080192',
 '1661004220810797062',
 '1660161465154953217',
 '1660119248318996482',
 '1659907997710131204',
 '1659853343639093252',
 '1659829852160118784',
 '1659130390865670145',
 '1658324912262766593',
 '1657323367416963079',
 '1657311092593545216',
 '1656686299783258114',
 '1656641386329047041',
 '1656641255512883200',
 '1656290379749289985',
 '1656134831200165888',
 '1656113102817992706',
 '1655943674805747713',
 '1655518519906168833',
 '1655411026421071873',
 '1655284609859620864',
 '1655235945409437697',
 '1655186423396589568',
 '1654868595875393537',
 '1654840771059810305',
 '1654486309979803650',
 '1654383684424704000',
 '1654332457741606915',
 '1654093726980112386',
 '1654039719620247556',
 '16540022422555

In [28]:
get_likes_by_user(par_dir, tweet_id='1663945499379122179')

TypeError: get_likes_by_user() got an unexpected keyword argument 'tweet_id'