In [1]:
import csv
import snscrape.modules.twitter as sntwitter
import itertools
import pandas as pd
import json

In [55]:
pd.options.display.max_columns = None

In [2]:
OUTPUT_FILE = "../data/results.csv"

In [3]:
def save_tweets(query, n_tweets):
    """
    Finds the latest tweets according to a query and saves them in a csv file
    :param query: the query to look for
    :param n_tweets: number of tweets to scrape
    """
    with open(OUTPUT_FILE, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['id', 'date', 'username', 'content', 'url'])
        
        tweets_gen = sntwitter.TwitterSearchScraper(query).get_items()
        top_tweets = itertools.islice(tweets_gen, n_tweets)
        
        for tweet in top_tweets:
            writer.writerow([tweet.id, tweet.date, tweet.username, tweet.content, tweet.url])
        
        print('Done!')

In [4]:
query = "#foreignaid"

In [5]:
save_tweets(query=query, n_tweets=10)

Done!


  writer.writerow([tweet.id, tweet.date, tweet.username, tweet.content, tweet.url])
  writer.writerow([tweet.id, tweet.date, tweet.username, tweet.content, tweet.url])


In [6]:
tweets_gen = sntwitter.TwitterSearchScraper(query).get_items()

## Advanced

Advanced Search reference: https://github.com/igorbrigadir/twitter-advanced-search

In [19]:
def advancedTweetSearch(query, loc, start, end, lang="en"):
    TWEET_STOR = []
    LOC_STOR = []
    
    scraped_tweets = sntwitter.TwitterSearchScraper(f'{query} geocode:"{loc}"since:{start} until:{end}').get_items()
    
    sliced_scraped_tweets = itertools.islice(scraped_tweets, 10000)
    
    
    for tweet in sliced_scraped_tweets:
        try:
            LOC_STOR.append(json.loads(tweet.json())['place']['country'])
            TWEET_STOR.append(json.loads(tweet.json()))
        except TypeError:
            continue
        
    return TWEET_STOR, LOC_STOR

In [20]:
query = "economic development OR #economicdevelopment"
start = '2019-01-01'
end = '2023-01-01'
loc = "-3.337954,119.882813, 4000km"

In [21]:
%time southest_asia, location_details = advancedTweetSearch(query=query, start=start, end=end, loc=loc)

CPU times: user 12.1 s, sys: 143 ms, total: 12.2 s
Wall time: 6min 15s


In [22]:
pd.Series(location_details).value_counts()

India                         3066
Pakistan                       453
Australia                      311
Bangladesh                     113
Maldives                        98
Sri Lanka                       72
Nepal                           61
People's Republic of China      47
Indonesia                       27
Taiwan                          20
East Timor                      10
Papua New Guinea                 7
Bhutan                           6
Northern Mariana Islands         3
Guam                             3
Japan                            3
                                 2
United States                    1
dtype: int64

In [23]:
set([(tweet['lang']) for tweet in southest_asia])

{'ca',
 'da',
 'en',
 'es',
 'et',
 'fr',
 'hi',
 'ht',
 'in',
 'mr',
 'nl',
 'pt',
 'ro',
 'tl',
 'und',
 'zh'}

In [24]:
# brazil

loc = "-8.581021,-51.328125, 5000km"

In [25]:
%time lat_amrica, location_details = advancedTweetSearch(query=query, start=start, end=end, loc=loc)

CPU times: user 16.2 s, sys: 192 ms, total: 16.3 s
Wall time: 7min 44s


In [26]:
pd.Series(location_details).value_counts()

United States    4302
Ghana             495
Mexico             58
Sierra Leone       46
Morocco            31
Gambia             25
Fiji               24
Spain              19
Ivory Coast        19
Senegal            18
Brazil             16
Liberia            15
Bahamas            10
Guatemala           7
Bermuda             6
Cape Verde          4
Guinea              3
Mali                2
Portugal            1
Gibraltar           1
Burkina Faso        1
Ecuador             1
Argentina           1
Mauritania          1
Guinea Bissau       1
dtype: int64

In [28]:
set([tweet['lang'] for tweet in lat_amrica])

{'ca', 'en', 'es', 'fr', 'pt', 'ro'}

# store tweets

In [34]:
lat_amrica[0].keys()

dict_keys(['_type', 'url', 'date', 'rawContent', 'renderedContent', 'id', 'user', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel', 'links', 'media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers', 'coordinates', 'place', 'hashtags', 'cashtags', 'card', 'viewCount', 'vibe', 'content', 'outlinks', 'outlinksss', 'tcooutlinks', 'tcooutlinksss', 'username'])

In [51]:
MAIN_COLS = ['url', 'date', 'rawContent', 'renderedContent', 'id', 'user', 'replyCount', 'retweetCount',
             'likeCount', 'quoteCount', 'lang', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId',
             'inReplyToUser', 'mentionedUsers', 'coordinates', 'place', 'hashtags', 'cashtags', 
            'viewCount', 'vibe', 'content', 'card', 'siteUser']
USER_COLS = ['username', 'id', 'rawDescription', 'renderedDescription', 'created', 'followersCount', 'statusesCount', 
            'favouritesCount', 'listerCount']
CARD_COLS = ['title', 'description', 'siteUser']

SITE_USER_COLS = ['id', 'displayname', 'rawDescription', 'renderedDescription', 'verified', 'created', 
 'followersCount', 'friendsCount', 'statusesCount', 'favouritesCount', 'listedCount', 'mediaCount', 'location']

In [59]:
lat_amr_df = pd.json_normalize(lat_amrica)

In [62]:
lat_amr_df.columns = lat_amr_df.columns.str.replace('.', '_')

  lat_amr_df.columns = lat_amr_df.columns.str.replace('.', '_')


In [70]:
['user_' + col for col in USER_COLS if 'user_' + col not in lat_amr_df.columns]

['user_listerCount']

In [69]:
['card_' + col for col in CARD_COLS if 'card_' + col not in lat_amr_df.columns]

[]

In [75]:
['card_siteUser__' + col for col in SITE_USER_COLS if 'card_siteUser__' + col not in lat_amr_df.columns]

['card_siteUser__id',
 'card_siteUser__displayname',
 'card_siteUser__rawDescription',
 'card_siteUser__renderedDescription',
 'card_siteUser__verified',
 'card_siteUser__created',
 'card_siteUser__followersCount',
 'card_siteUser__friendsCount',
 'card_siteUser__statusesCount',
 'card_siteUser__favouritesCount',
 'card_siteUser__listedCount',
 'card_siteUser__mediaCount',
 'card_siteUser__location']