# Data download

This notebook contains the code for downloading the Twitter data of candidates for the Constitutional Convention.

- Input: candidates Twitter IDs
- Output: downloaded tweets (in MongoDB)

In [None]:
import pandas as pd
from dotenv import load_dotenv
import tweepy
from tweepy import TweepError
import os
import pymongo
from pymongo.errors import BulkWriteError
import logging
import time
import tqdm
import pyathena

In [None]:
# Configure logging once for the notebook: every record shows timestamp,
# logger name, calling function and level.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] - %(name)s - %(funcName)s - %(levelname)s : %(message)s',
)
log = logging.getLogger(__name__)

# Take environment variables from .env.
load_dotenv()

# Twitter client: OAuth credentials are read from the environment.
auth = tweepy.OAuthHandler(
    os.environ["TWITTER_API_KEY"],
    os.environ["TWITTER_API_SECRET_KEY"],
)
auth.set_access_token(
    os.environ["TWITTER_ACCESS_TOKEN"],
    os.environ["TWITTER_ACCESS_TOKEN_SECRET"],
)

# Both rate-limit options are switched on (tweepy 3.x keyword names).
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# MongoDB handles: the project database and a scratch database.
mongo_client = pymongo.MongoClient(os.environ["MONGODB_URL"])
twitter_db = mongo_client["TwitterConstituyenteDB"]
test_db = mongo_client["testdb"]

In [None]:
# Athena connection; staging directory and region come from the environment.
conn = pyathena.connect(
    s3_staging_dir=os.environ["AWS_ATHENA_S3_STAGING_DIR"],
    region_name=os.environ["AWS_REGION"],
)

# Load the full candidates table into a DataFrame.
query = """
SELECT * FROM "twitter-constituyente"."constituyentes_full";
"""
candidates_df = pd.read_sql(query, conn)

# Store the electoral district as text.
candidates_df["electoral_district"] = candidates_df["electoral_district"].astype(str)

# Twitter user IDs of the candidates, skipping rows without one.
candidates_ids = (
    candidates_df["user__id_str"]
    .dropna()
    .tolist()
)

In [None]:
def bulk_write_to_mongo(collection, data):
    """Insert ``data`` into ``collection`` with a single unordered bulk write.

    Args:
        collection: Object exposing ``insert_many(documents, ordered=...)``
            (a pymongo collection in this notebook).
        data: Sized sequence of documents to insert.

    Returns:
        A ``(written, not_written)`` tuple of document counts. When the
        write raises ``BulkWriteError``, ``written`` is the ``nInserted``
        value reported in the error details and ``not_written`` is the
        remainder of ``data``.
    """
    to_insert = len(data)
    if not to_insert:
        # Nothing to send: skip the database call entirely.
        return 0, 0

    try:
        # ordered=False is passed so the write is requested as unordered.
        collection.insert_many(data, ordered=False)
    except BulkWriteError as e:
        inserted = e.details["nInserted"]
        failed = to_insert - inserted
        # Report how much of the batch was lost instead of a bare marker.
        log.error(
            "BulkWriteError: %d of %d documents were not written",
            failed,
            to_insert,
        )
        return inserted, failed
    return to_insert, 0

def download_timeline(user_id: str, n: int = 3200, count: int = 200, trim_user=True, tweet_mode="extended", **kwargs):
    """Collect up to ``n`` statuses from a user's timeline.

    Pages through ``api.user_timeline`` with a ``tweepy.Cursor`` and shows a
    progress bar while the statuses are gathered.

    Args:
        user_id: Twitter user ID whose timeline is requested.
        n: Maximum number of statuses taken from the cursor.
        count: Forwarded to ``api.user_timeline`` as ``count``.
        trim_user: Forwarded to ``api.user_timeline`` as ``trim_user``.
        tweet_mode: Forwarded to ``api.user_timeline`` as ``tweet_mode``.
        **kwargs: Extra keyword arguments forwarded to ``api.user_timeline``
            (the notebook passes ``since_id`` and ``max_id``).

    Returns:
        List with the statuses yielded by the cursor.
    """
    log.info(f'Downloading timeline from user id: {user_id}')
    started_at = time.time()

    cursor = tweepy.Cursor(
        api.user_timeline,
        user_id=user_id,
        count=count,
        trim_user=trim_user,
        tweet_mode=tweet_mode,
        **kwargs
    )
    # total=n is only the progress-bar scale; fewer items may be yielded.
    tweets = list(tqdm.tqdm(cursor.items(n), total=n))

    total_time = time.time() - started_at
    log.info(f"Downloaded finished: {len(tweets)} tweets in {total_time:.4f} seconds.")
    return tweets


def download_retweets(tweet_id: str, trim_user=True, tweet_mode="extended"):
    """Request the retweets of ``tweet_id`` through ``api.retweets``.

    Args:
        tweet_id: ID of the tweet whose retweets are requested.
        trim_user: Forwarded to ``api.retweets`` as ``trim_user``.
        tweet_mode: Forwarded to ``api.retweets`` as ``tweet_mode``.

    Returns:
        Whatever ``api.retweets`` returns for a request with ``count=100``.
    """
    request_options = {
        "count": 100,
        "trim_user": trim_user,
        "tweet_mode": tweet_mode,
    }
    return api.retweets(tweet_id, **request_options)

def download_users(user_ids):
    """Placeholder that is not implemented yet.

    The argument is currently ignored and ``None`` is returned.
    """
    return None


In [None]:
# Tweet IDs used as the `since_id` (lower) and `max_id` (upper) bounds when
# downloading timelines. The names point to Dec 31, 2020 and May 15, 2021.
dic31_2020_id = 1_344_795_494_015_528_970
may15_2021_id = 1_393_748_045_909_667_840


In [None]:
# Download the timeline of 10 randomly sampled candidates and store the raw
# tweets in the `tweetstest` collection.
for user_id_str in tqdm.tqdm(candidates_df["user__id_str"].dropna().sample(10)):
    try:
        timeline = download_timeline(
            user_id_str,
            since_id=dic31_2020_id,
            max_id=may15_2021_id,
        )
        # Keep only the raw JSON payload of each status.
        raw_tweets = [status._json for status in timeline]
        writed, not_writed = bulk_write_to_mongo(twitter_db.tweetstest, raw_tweets)
        log.info(f"Writed: {writed}. Not writed: {not_writed}")
    except Exception as e:
        # Best effort: a failure for one user is logged and the loop goes on.
        log.error(f"Error from user id: {user_id_str}")
        log.error(e)

In [None]:
# Download the retweets of every original (non-retweet) candidate tweet that
# has at least one retweet, and store them in the `tweets` collection.

filter_original_tweets = {
    "user.id_str": {"$in": candidates_ids},
    "retweeted_status": {"$exists": False},
    "retweet_count": {"$gt": 0},
}

# The count is only used to size the progress bar.
count_originals = twitter_db.tweets.count_documents(filter_original_tweets)
tweets_constituyentes_originales = twitter_db.tweets.find(filter_original_tweets)

# NOTE(review): the cursor stays open while the API client may pause on rate
# limits; confirm the server-side cursor timeout is not hit on long runs.
for tweet in tqdm.tqdm(tweets_constituyentes_originales, total=count_originals):
    try:
        retweets = download_retweets(tweet["id_str"])
    except TweepError as e:
        # A failure for one tweet must not abort the whole run: log it and
        # move on, as the retweeter-timeline loop does.
        log.error("Error downloading retweets of tweet id: %s", tweet["id_str"])
        log.error(e)
        continue
    writed, not_writed = bulk_write_to_mongo(
        twitter_db.tweets,
        [status._json for status in retweets],
    )
    if writed:
        log.info(f"Writed: {writed}. Not writed: {not_writed}")

In [None]:
# Download the timelines of non-candidate users who retweeted a candidate.

retweeters_filter = {
    "user.id_str": {"$nin": candidates_ids},
    "retweeted_status.user.id_str": {"$in": candidates_ids},
}
# Only the author ID of each matching retweet is projected.
retweeters = twitter_db.tweets.find(retweeters_filter, ["user.id_str"])

# The query yields one document per retweet, so the same user can show up
# many times; remember who was already processed to avoid downloading the
# same timeline (and spending rate-limited API calls) more than once.
seen_user_ids = set()
for retweet in tqdm.tqdm(retweeters):
    user_id_str = retweet["user"]["id_str"]
    if user_id_str in seen_user_ids:
        continue
    seen_user_ids.add(user_id_str)

    try:
        timeline = download_timeline(
            user_id_str,
            since_id=dic31_2020_id,
            max_id=may15_2021_id,
        )
    except TweepError as e:
        # Skip users whose timeline cannot be fetched and keep going.
        log.error("Error from user id: %s", user_id_str)
        log.error(e)
        continue
    writed, not_writed = bulk_write_to_mongo(
        twitter_db.tweets,
        [status._json for status in timeline],
    )
    log.info(f"Writed: {writed}. Not writed: {not_writed}")