In [None]:
# default_exp scrapper

# Tweet Scrapper

> Use Twitter API to get tweets

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
#%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression of a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#export
import datetime
import logging
import random
import sys
import time
from os import getenv

import pandas as pd
import tweepy
from dotenv import load_dotenv

In [None]:
#export
class Scrapper():
    """Download tweets and hashtags of Twitter accounts with tweepy.

    The Twitter credentials are read from the ``TWITTER_CONSUMER_KEY`` and
    ``TWITTER_CONSUMER_SECRET`` environment variables. When the key is missing,
    the constructor tries to load them from a ``../.env`` file.
    """

    logger = None
    # tweepy.API client, created on first use and reused by later calls
    _api = None

    def __init__(self):
        """Load the credentials if needed and configure the ``tweet-archiveur`` logger.

        Raises:
            SystemExit: with code 4 when ``TWITTER_CONSUMER_KEY`` is not set and
                no ``../.env`` file exists.
        """
        # TODO : accept a logger in constructor
        # TODO : accept the tweet config in constructor
        if not getenv('TWITTER_CONSUMER_KEY'):
            # Load .env only in Notebook, it will be populated at runtime by docker
            from pathlib import Path
            env_path = Path('..') / '.env'
            if env_path.is_file():
                load_dotenv(dotenv_path=env_path)
            else:
                print('ERROR : no env !')
                sys.exit(4)
        # Logging
        self.logger = logging.getLogger("tweet-archiveur")
        log_formatter = logging.Formatter("%(asctime)s -  %(name)-12s %(levelname)-8s %(message)s")
        self.logger.setLevel(logging.DEBUG)
        # The logger is shared by name: only attach a console handler once,
        # otherwise every new Scrapper would duplicate each message
        if not self.logger.handlers:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(log_formatter)
            self.logger.addHandler(console_handler)
        self.logger.info('Scrapper ready')

    @staticmethod
    def _percent(done, total):
        """Return ``done / total`` as a percentage truncated to two decimals (0.0 when ``total`` is 0)."""
        if not total:
            return 0.0
        # Scale by 100 * 100 before the integer division so two decimals survive it
        return (done * 100 * 100 // total) / 100

    def _get_api(self):
        """Return the tweepy API client, creating it on first use."""
        if self._api is None:
            auth = tweepy.AppAuthHandler(getenv("TWITTER_CONSUMER_KEY"), getenv("TWITTER_CONSUMER_SECRET"))
            self._api = tweepy.API(auth, wait_on_rate_limit=True)
        return self._api

    def get_users_accounts(self, csv = None):
        """Read the accounts to follow from a CSV file.

        Args:
            csv: path of the CSV file; defaults to the ``USERS_CSV`` environment variable.

        Returns:
            A pandas DataFrame with the content of the file.

        Raises:
            ValueError: when no path is given and ``USERS_CSV`` is not set.
        """
        csv = getenv("USERS_CSV") if csv is None else csv
        if csv is None:
            raise ValueError('No CSV given and the USERS_CSV environment variable is not set')
        return pd.read_csv(csv)

    def datetime_from_utc_to_local(self, utc_datetime):
        """Convert a UTC datetime to the local time of the machine.

        The offset is the one in force at ``utc_datetime`` itself, so dates on
        the other side of a summer/winter time change are converted correctly.

        Args:
            utc_datetime: a datetime; a naive value is interpreted as UTC.

        Returns:
            A naive datetime expressed in the system local time zone.
        """
        if utc_datetime.tzinfo is None:
            utc_datetime = utc_datetime.replace(tzinfo=datetime.timezone.utc)
        # astimezone() without argument targets the system local time zone
        return utc_datetime.astimezone().replace(tzinfo=None)

    def get_user_tweets(self, user_id, max_tweets=100):
        """Fetch tweets of one account together with the hashtags they contain.

        Args:
            user_id: Twitter id of the account.
            max_tweets: maximum number of tweets requested from the timeline.

        Returns:
            A tuple ``(tweets, hashtags)`` of two lists of dictionaries; each
            hashtag entry repeats the ids and local date of its tweet.
        """
        api = self._get_api()
        tweets = []
        hashtags = []
        for tweet in tweepy.Cursor(api.user_timeline, id=user_id, tweet_mode='extended').items(max_tweets):
            tweet_tmp = {
                'twitter_id' : user_id,
                'tweet_id' : tweet.id,
                'datetime_utc' : tweet.created_at,
                'datetime_local' : self.datetime_from_utc_to_local(tweet.created_at),
                'text' : tweet.full_text,
                'retweet' : tweet.retweet_count,
                'favorite' : tweet.favorite_count
            }
            tweets.append(tweet_tmp)
            # The 'hashtags' entry may be missing or None: treat both as "no hashtag"
            for h in tweet.entities.get('hashtags') or []:
                hashtags.append({
                    'tweet_id' : tweet_tmp['tweet_id'],
                    'twitter_id' : tweet_tmp['twitter_id'],
                    'datetime_local' : tweet_tmp['datetime_local'],
                    'hashtag' : h['text']
                })
        return tweets, hashtags

    def get_all_tweet_as_array(self, users_id):
        """Get the tweets of all the given users.

        Args:
            users_id: a list of twitter_id.

        Returns:
            A tuple ``(tweets, hashtags)`` of two flat lists covering every user.
        """
        tweets = []
        hashtags = []
        total_users = len(users_id)
        for i, user_id in enumerate(users_id):
            tweets_tmp, hashtags_tmp = self.get_user_tweets(user_id)
            tweets += tweets_tmp
            hashtags += hashtags_tmp
            if i % 10 == 0:
                info_str = f'Processing user {i} / {total_users} ({self._percent(i, total_users)}%)'
                if self.logger is not None:
                    self.logger.debug(info_str)
                else:
                    print(info_str)
        return tweets, hashtags

    def get_all_tweet_and_store_them(self, database, users_id_to_process):
        """Fetch the tweets of every user and store them, retrying on API errors.

        Args:
            database: object exposing ``insert_tweets(df)`` and ``insert_hashtags(df)``.
            users_id_to_process: iterable of twitter_id; it is copied, so the
                caller's object is left untouched.

        Raises:
            SystemExit: code 3 after more than 3 consecutive API failures, code 2
                on an unexpected error while fetching, code 1 on an error while storing.
        """
        # Work on a private copy: ids are popped and shuffled while processing
        remaining = list(users_id_to_process)
        total_tweet = 0
        total_users = len(remaining)
        consecutive_fail = 0
        while remaining:
            user_id = remaining[0]
            try:
                # Get tweets
                tweets, hashtags = self.get_user_tweets(user_id)
            except tweepy.TweepError as e:
                if '401' in str(e.reason):
                    self.logger.warning(f'Error processing {user_id} : The user have a private account, skipping.')
                    remaining.pop(0)
                    continue
                self.logger.warning(f'Error processing {user_id} : tweepy.TweepError={e.reason} We will retry in 16 minutes to respect twitter API Rate Limit')
                consecutive_fail += 1
                if consecutive_fail > 3:
                    self.logger.error(f'We fail {consecutive_fail} consecutive times, exiting.')
                    sys.exit(3)
                # Wait longer after each consecutive failure
                time.sleep(16*60*consecutive_fail)
                # We shuffle the list to try another user next time
                random.shuffle(remaining)
                continue
            except Exception as e:
                # Exception (not a bare except) so Ctrl-C still interrupts the run
                self.logger.error(f'UNKNOW ERROR processing {user_id} STOPPING : Error={e!r}')
                sys.exit(2)
            try:
                # Save them to database
                database.insert_tweets(pd.DataFrame(tweets))
                database.insert_hashtags(pd.DataFrame(hashtags))
            except Exception as e:
                self.logger.error(f'UNKNOW ERROR processing {user_id} STOPPING : Error={e!r}')
                sys.exit(1)
            # Yes, we succeded
            consecutive_fail = 0
            # Remove user from the list to process
            remaining.pop(0)
            total_tweet += len(tweets)
            i = total_users - len(remaining)
            if i % 10 == 0:
                self.logger.debug(f'We got {total_tweet} tweets for now. Processing user {i} / {total_users} ({self._percent(i, total_users)}%) {user_id=} ...')
        self.logger.info(f'Done scrapping, we got {total_tweet} tweets from {total_users} tweetos.')

In [None]:
#export
# NOTE(review): this cell carries the export flag, so this instance is presumably
# created when the generated module is imported - confirm that this is intended.
scrapper = Scrapper()

2021-03-22 10:16:08,109 -  tweet-archiveur INFO     Scrapper ready


In [None]:
#hide
# No csv argument: the file path is read from the USERS_CSV environment variable
df_users = scrapper.get_users_accounts()

In [None]:
#hide
# List the columns available in the accounts file
df_users.columns

Index(['twitter', 'nom', 'nom_de_famille', 'prenom', 'sexe', 'twitter_tweets',
       'twitter_followers', 'twitter_following', 'twitter_listed',
       'twitter_favourites', 'twitter_verified', 'twitter_protected',
       'twitter_id', 'twitter_name', 'twitter_description',
       'twitter_created_at', 'sites_web', 'url_institution', 'slug',
       'url_nosdeputes_api'],
      dtype='object')

In [None]:
#hide
# Keep the id of the first account for the single-user examples
user_id = df_users.twitter_id[0]
# Build a small anonymised sample. `.assign` returns a new frame, so the name
# column is replaced without writing into a slice of the loaded frame
# (avoids pandas' SettingWithCopyWarning).
df_users = (
    df_users[['twitter_id', 'nom', 'twitter_followers', 'twitter_tweets']]
    .head(3)
    .assign(nom='Person name')
)
df_users.to_csv('../tests/sample-users.csv')
df_users

Unnamed: 0,twitter_id,nom,twitter_followers,twitter_tweets
0,76584619,Person name,23085,6461
1,507168683,Person name,5717,1750
2,314472161,Person name,19323,2789


In [None]:
#hide
# Fetch the tweets of the sample account together with their hashtags
tweets, hashtags = scrapper.get_user_tweets(user_id)
# Show one tweet and two hashtags as an example of the returned dictionaries
tweets[0:1]
hashtags[0:2]

[{'twitter_id': 76584619,
  'tweet_id': 1373697810391044099,
  'datetime_utc': datetime.datetime(2021, 3, 21, 18, 7, 34),
  'datetime_local': datetime.datetime(2021, 3, 21, 19, 7, 34),
  'text': 'Superbe victoire du @nimesolympique face au leader Lille. Tous les Nîmois sont fiers de vous. Nous croyons au maintien. Le Nîmes olympique mérite sa place en @Ligue1UberEats https://t.co/PgbEk0xkOB',
  'retweet': 5,
  'favorite': 13}]

[{'tweet_id': 1373576438469775361,
  'twitter_id': 76584619,
  'datetime_local': datetime.datetime(2021, 3, 21, 11, 5, 17),
  'hashtag': 'Trisomie21'},
 {'tweet_id': 1373576438469775361,
  'twitter_id': 76584619,
  'datetime_local': datetime.datetime(2021, 3, 21, 11, 5, 17),
  'hashtag': 'JourneeMondialeDeLaTrisomie21'}]

In [None]:
#hide
# Compare the UTC creation date of the first tweet with its conversion to local time
tweets[0]['datetime_utc']
created_date_local = scrapper.datetime_from_utc_to_local(tweets[0]['datetime_utc'])
created_date_local

datetime.datetime(2021, 3, 21, 18, 7, 34)

datetime.datetime(2021, 3, 21, 19, 7, 34)

## Tweets

In [None]:
#hide
# One row per tweet dictionary
df_tweets = pd.DataFrame(tweets)

In [None]:
#hide
# Do not truncate column content, so the full tweet text is displayed
pd.set_option('display.max_colwidth', None)
df_tweets.head(2)

Unnamed: 0,twitter_id,tweet_id,datetime_utc,datetime_local,text,retweet,favorite
0,76584619,1373697810391044099,2021-03-21 18:07:34,2021-03-21 19:07:34,Superbe victoire du @nimesolympique face au leader Lille. Tous les Nîmois sont fiers de vous. Nous croyons au maintien. Le Nîmes olympique mérite sa place en @Ligue1UberEats https://t.co/PgbEk0xkOB,5,13
1,76584619,1373576438469775361,2021-03-21 10:05:17,2021-03-21 11:05:17,Aujourd'hui c'est la journée mondiale de la trisomie 21. Ne les oublions pas et n'oublions pas que la différence est une richesse. #Trisomie21 #JourneeMondialeDeLaTrisomie21 https://t.co/Pfp1zQckSx,89,262


In [None]:
#hide
# Save the fetched tweets as a sample file in the tests folder
df_tweets.to_csv('../tests/sample-tweets.csv')

## Hashtags

In [None]:
#hide
# One row per hashtag dictionary
df_hashtags = pd.DataFrame(hashtags)
df_hashtags.head(3)
# Save the hashtags as a sample file in the tests folder
df_hashtags.to_csv('../tests/sample-hashtags.csv')

Unnamed: 0,tweet_id,twitter_id,datetime_local,hashtag
0,1373576438469775361,76584619,2021-03-21 11:05:17,Trisomie21
1,1373576438469775361,76584619,2021-03-21 11:05:17,JourneeMondialeDeLaTrisomie21
2,1373395871736463361,76584619,2021-03-20 23:07:46,Bleus


## Loop over users to get all tweets

In [None]:
# Only the first two accounts of the sample are used
users_id = df_users.twitter_id.tolist()[0:2]
# Two flat lists are returned: the tweets and the hashtags of every requested account
tweets, hashtag = scrapper.get_all_tweet_as_array(users_id)
print("------ TWEETS   ---------------------------")
tweets[0:2]
print("------ HASHTAGS ---------------------------")
hashtag[0:2]

2021-03-22 10:16:11,898 -  tweet-archiveur DEBUG    Processing user 0 / 2 (0.0%)


------ TWEETS   ---------------------------


[{'twitter_id': 76584619,
  'tweet_id': 1373697810391044099,
  'datetime_utc': datetime.datetime(2021, 3, 21, 18, 7, 34),
  'datetime_local': datetime.datetime(2021, 3, 21, 19, 7, 34),
  'text': 'Superbe victoire du @nimesolympique face au leader Lille. Tous les Nîmois sont fiers de vous. Nous croyons au maintien. Le Nîmes olympique mérite sa place en @Ligue1UberEats https://t.co/PgbEk0xkOB',
  'retweet': 5,
  'favorite': 13},
 {'twitter_id': 76584619,
  'tweet_id': 1373576438469775361,
  'datetime_utc': datetime.datetime(2021, 3, 21, 10, 5, 17),
  'datetime_local': datetime.datetime(2021, 3, 21, 11, 5, 17),
  'text': "Aujourd'hui c'est la journée mondiale de la trisomie 21. Ne les oublions pas et n'oublions pas que la différence est une richesse. #Trisomie21 #JourneeMondialeDeLaTrisomie21 https://t.co/Pfp1zQckSx",
  'retweet': 89,
  'favorite': 262}]

------ HASHTAGS ---------------------------


[{'tweet_id': 1373576438469775361,
  'twitter_id': 76584619,
  'datetime_local': datetime.datetime(2021, 3, 21, 11, 5, 17),
  'hashtag': 'Trisomie21'},
 {'tweet_id': 1373576438469775361,
  'twitter_id': 76584619,
  'datetime_local': datetime.datetime(2021, 3, 21, 11, 5, 17),
  'hashtag': 'JourneeMondialeDeLaTrisomie21'}]

In [None]:
# Force some variable outside Docker
from os import environ
environ["DATABASE_PORT"] = '8479'
environ["DATABASE_HOST"] = 'localhost'
environ["DATABASE_USER"] = 'tweet_archiveur_user'
environ["DATABASE_NAME"] = 'tweet_archiveur'
# The password is a secret: it has to come from the environment, never from the notebook
if not environ.get("DATABASE_PASS"):
    raise RuntimeError('DATABASE_PASS is not set : define it in the environment before running this cell')

from tweet_archiveur.database import Database
database = Database()
try:
    users_id = df_users.twitter_id.tolist()[0:2]
    scrapper.get_all_tweet_and_store_them(database, users_id)
finally:
    # Drop the reference to the database object even when the scrapping fails
    del database

2021-03-22 10:17:13,060 -  tweet-archiveur INFO     Loading database module...
2021-03-22 10:17:13,061 -  tweet-archiveur DEBUG    DEBUG : connect(user=tweet_archiveur_user, password=XXXX, host=localhost, port=8479, database=tweet_archiveur, url=None)
2021-03-22 10:17:17,076 -  tweet-archiveur INFO     Done scrapping, we got 400 tweets from 2 tweetos.


In [None]:
# Display the list of ids that was passed to get_all_tweet_and_store_them
users_id

In [None]:
#hide
tweets[3]
# Scratch check of the progress percentage expression used in the log messages
i=20
total_users=603
# 20*100//603 is 3, so this prints 3.0: the value is truncated to a whole percent
print((i*100//total_users*100)/100)