In [1]:
import csv
import snscrape.modules.twitter as sntwitter
import itertools
import pandas as pd
import json

In [2]:
OUTPUT_FILE = "../data/results.csv"

In [3]:
def save_tweets(query, n_tweets):
    """
    Finds the latest tweets according to a query and saves them in a csv file
    :param query: the query to look for
    :param n_tweets: number of tweets to scrape
    """
    with open(OUTPUT_FILE, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['id', 'date', 'username', 'content', 'url'])
        
        tweets_gen = sntwitter.TwitterSearchScraper(query).get_items()
        top_tweets = itertools.islice(tweets_gen, n_tweets)
        
        for tweet in top_tweets:
            writer.writerow([tweet.id, tweet.date, tweet.username, tweet.content, tweet.url])
        
        print('Done!')

In [4]:
query = "#foreignaid"

In [5]:
save_tweets(query=query, n_tweets=10)

Done!


  writer.writerow([tweet.id, tweet.date, tweet.username, tweet.content, tweet.url])
  writer.writerow([tweet.id, tweet.date, tweet.username, tweet.content, tweet.url])


In [6]:
tweets_gen = sntwitter.TwitterSearchScraper(query).get_items()

## Advanced

Advanced Search reference: https://github.com/igorbrigadir/twitter-advanced-search

In [55]:
def advancedTweetSearch(query, loc, start, end, lang="en"):
    TWEET_STOR = []
    LOC_STOR = []
    
    scraped_tweets = sntwitter.TwitterSearchScraper(f'{query} geocode:"{loc}"since:{start} until:{end}').get_items()
    
    sliced_scraped_tweets = itertools.islice(scraped_tweets, 5000)
    
    
    for tweet in sliced_scraped_tweets:
        try:
            LOC_STOR.append(json.loads(tweet.json())['place']['country'])
            TWEET_STOR.append(json.loads(tweet.json()))
        except TypeError:
            continue
        
    return TWEET_STOR, LOC_STOR

In [56]:
query = "economic development OR #economicdevelopment"
start = '2019-01-01'
end = '2023-01-01'
loc = "-3.337954,119.882813, 3500km"

In [57]:
%time tweet_stor, location_details = advancedTweetSearch(query=query, start=start, end=end, loc=loc)

CPU times: user 9.01 s, sys: 145 ms, total: 9.15 s
Wall time: 5min 33s


In [58]:
pd.Series(location_details).value_counts()

India                         2915
Australia                      178
Bangladesh                     113
Maldives                        98
Sri Lanka                       72
Pakistan                        67
Nepal                           61
Indonesia                       27
Taiwan                          19
People's Republic of China      14
East Timor                      10
Papua New Guinea                 7
Bhutan                           6
Northern Mariana Islands         3
Guam                             3
United States                    1
                                 1
dtype: int64

In [59]:
set([(tweet['lang']) for tweet in tweet_stor])

{'ca',
 'da',
 'en',
 'es',
 'et',
 'fr',
 'hi',
 'ht',
 'in',
 'mr',
 'pt',
 'ro',
 'tl',
 'und',
 'zh'}

In [60]:
# brazil

loc = "-8.581021,-51.328125, 4500km"

In [61]:
%time tweet_stor, location_details = advancedTweetSearch(query=query, start=start, end=end, loc=loc)

CPU times: user 7.82 s, sys: 104 ms, total: 7.92 s
Wall time: 4min 59s


In [62]:
pd.Series(location_details).value_counts()

United States    2215
Ghana             496
Mexico             58
Sierra Leone       46
Gambia             25
Fiji               24
Morocco            20
Ivory Coast        19
Senegal            18
Brazil             16
Liberia            15
Bahamas            10
Guatemala           7
Cape Verde          4
Guinea              3
Spain               2
Mali                2
Burkina Faso        1
Ecuador             1
Mauritania          1
Guinea Bissau       1
dtype: int64

In [64]:
set([tweet['lang'] for tweet in tweet_stor])

{'ca', 'en', 'es', 'fr', 'pt', 'ro'}

In [65]:
tweet_stor[0]

{'_type': 'snscrape.modules.twitter.Tweet',
 'url': 'https://twitter.com/blackraiden76T/status/1608253696089161729',
 'date': '2022-12-29T00:08:55+00:00',
 'rawContent': 'Sustainable community development must be a national priority at some point— the burden on the individual — the family — The community—The economic burden on states over the next several years will be too great to bear—Public health policy—upholding our civil liberties. #life',
 'renderedContent': 'Sustainable community development must be a national priority at some point— the burden on the individual — the family — The community—The economic burden on states over the next several years will be too great to bear—Public health policy—upholding our civil liberties. #life',
 'id': 1608253696089161729,
 'user': {'_type': 'snscrape.modules.twitter.User',
  'username': 'blackraiden76T',
  'id': 177520363,
  'displayname': 'Dr. Richard Broussard',
  'rawDescription': 'Humanitarian. Music for Total Wellness/ Health Equity. P

In [69]:
non_eng = [non_eng_tweet for non_eng_tweet in tweet_stor if non_eng_tweet['lang'] != 'en']

In [70]:
non_eng[0]

{'_type': 'snscrape.modules.twitter.Tweet',
 'url': 'https://twitter.com/toddahunter/status/1597618235892981762',
 'date': '2022-11-29T15:47:24+00:00',
 'rawContent': 'Veterans Roundtable (@ Del Mar Center For Economic Development) https://t.co/xjkx3st0dH',
 'renderedContent': 'Veterans Roundtable (@ Del Mar Center For Economic Development) swarmapp.com/c/cpEzItpgiKA',
 'id': 1597618235892981762,
 'user': {'_type': 'snscrape.modules.twitter.User',
  'username': 'toddahunter',
  'id': 133165344,
  'displayname': 'Todd Hunter',
  'rawDescription': 'Personal Twitter feed of Representative Todd Hunter - Dist. 32. Follow @Dist32StateRep for updates from Team Todd.',
  'renderedDescription': 'Personal Twitter feed of Representative Todd Hunter - Dist. 32. Follow @Dist32StateRep for updates from Team Todd.',
  'descriptionLinks': None,
  'verified': False,
  'created': '2010-04-15T05:02:51+00:00',
  'followersCount': 8056,
  'friendsCount': 1476,
  'statusesCount': 17243,
  'favouritesCount':

# Language translation