<a href="https://colab.research.google.com/github/ksnugroho/feel-in/blob/main/twitter-scraper/01_main_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Main Twitter Scraper**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
# (force_remount=True is passed to request a fresh mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Switch the working directory to the Google Drive root so that relative
# paths used later in the notebook resolve inside Drive
%cd /content/drive/MyDrive/

In [None]:
# Install snscrape quietly (-q). Package page: https://pypi.org/project/snscrape/0.2.0/
# NOTE(review): the install is not pinned, so the version pulled may differ
# from the one on the page linked above -- confirm which version is intended.
!pip install -q snscrape

In [None]:
# function to search for tweets matching a keyword query within a date range
def tweet_scraper(keywords, start_date, end_date, num_of_tweets=500, lang='id'):
    """
    Scrape tweets matching a search query and return them as a dataframe.

    The search query combines `keywords` with the `since:`, `until:` and
    `lang:` operators and excludes retweets and replies. At most
    `num_of_tweets` tweets are fetched; rows containing NaN values and
    duplicate rows are then dropped, so the returned dataframe may hold fewer
    rows than were fetched. The elapsed time is printed before returning.

    Parameters:
    ----------------
    keywords: str
    start_date: str (format yyyy-mm-dd)
    end_date: str (format yyyy-mm-dd)
    num_of_tweets: int (default 500), maximum number of tweets to fetch;
        values <= 0 fetch nothing
    lang: str (default 'id')

    Return:
    ----------------
    df: dataframe with columns 'datetime' and 'tweet'

    Example:
    ----------------
    tweet_scraper('marah', '2019-01-01', '2021-11-01', num_of_tweets=500, lang='id')
    """

    from itertools import islice
    from timeit import default_timer as timer

    import pandas as pd
    import snscrape.modules.twitter as sntwitter

    start = timer()

    # using TwitterSearchScraper to scrape data lazily
    criteria = f'{keywords} since:{start_date} until:{end_date} lang:{lang} exclude:retweets exclude:replies'
    scraped_items = sntwitter.TwitterSearchScraper(criteria).get_items()

    # islice stops after exactly `limit` items (the previous `i > num_of_tweets`
    # check let one extra tweet through) and never pulls an item it will not
    # keep; a negative limit is clamped to 0 because islice rejects it.
    limit = max(int(num_of_tweets), 0)
    tweets_list = [[tweet.date, tweet.content] for tweet in islice(scraped_items, limit)]

    # creating a dataframe from the tweets list above
    df = pd.DataFrame(tweets_list, columns=['datetime', 'tweet'])
    df = df.dropna()            # dropping rows with NaN values
    df = df.drop_duplicates()   # dropping duplicates

    end = timer()
    print('Time taken:', end - start, 'seconds')

    return df

In [None]:
import os

# Search keywords grouped by emotion label. Each keyword is scraped on its own
# and saved to a CSV file inside a sub-folder named after its emotion.
emotion_keywords = {
    'anger'     : ['kesal', 'murka', 'benci', 'marah', 'tersinggung'],
    'disgust'   : ['muak', 'risih', 'penat', 'jijik', 'enek'],
    'fear'      : ['takut', 'ngeri', 'cemas', 'gugup', 'tersiksa'],
    'joy'       : ['senang', 'bangga', 'bahagia', 'puas', 'riang'],
    'sadness'   : ['kecewa', 'sedih', 'berduka', 'sengsara', 'kesepian'],
    'surprise'  : ['heran', 'terkejut', 'terpesona', 'tertipu', 'kaget']
}

# Scraping configuration
start_date  = '2020-01-01'
end_date    = '2020-12-31'
folder_path = 'twitter-scraper/data-raw'   # relative to the current working directory
num_of_tweets = 100000                     # upper bound of tweets requested per keyword

result = {}  # creating a dictionary to store data (keyword -> dataframe)

for emotion, keywords in emotion_keywords.items():
    # to_csv does not create missing folders, so make sure the per-emotion
    # output folder exists before any file is written into it.
    os.makedirs(f'{folder_path}/{emotion}', exist_ok=True)

    for key in keywords:
        result[key] = tweet_scraper(key, start_date=start_date, end_date=end_date, num_of_tweets=num_of_tweets)
        # Save right after scraping so finished keywords survive a later failure
        result[key].to_csv(f'{folder_path}/{emotion}/{key}-{start_date}-{end_date}.csv')

print('Done!')