In [30]:
import pandas as pd
import os
from datetime import datetime, timedelta
from twarc.client2 import Twarc2
from dotenv import load_dotenv
from pprint import pprint

# Setup environment
load_dotenv()
BEARER_TOKEN = os.getenv('BEARER_TOKEN')
API_KEY = os.getenv('API_KEY')  # cosumer_key
API_KEY_SECRET = os.getenv('API_KEY_SECRET')  # consumer_secret
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')
MY_ID = os.getenv('MY_ID')

# Setup client
t = Twarc2(consumer_key=API_KEY, consumer_secret=API_KEY_SECRET,
           access_token=ACCESS_TOKEN, access_token_secret=ACCESS_TOKEN_SECRET,
           bearer_token=BEARER_TOKEN, )


# Main heavy lifting guy:
def fetch_twitter_data() -> list:
    users_i_follow = []
    data_generator = t.following(user=MY_ID)

    # Use generator to repeatedly get the next list of users:
    for obj in data_generator:
        users_i_follow.extend(obj['data']) if 'data' in obj.keys() \
            else users_i_follow.extend(obj['errors'])

    return users_i_follow


def get_dataframes() -> tuple:

    # Initialize dates we will use:
    today = datetime.strftime(datetime.today(), '%b-%-d-%Y')
    yesterday = datetime.strftime((datetime.today() + timedelta(days=-1)), '%b-%-d-%Y')
    two_days_ago = datetime.strftime((datetime.today() + timedelta(days=-2)), '%b-%-d-%Y')

    # Get twitter data and save to csv, this is today's df
    following_today = fetch_twitter_data()
    todays_data = pd.json_normalize(following_today)
    todays_data.to_csv(f'static/{today}.csv')

    # Read yesterday's df from file:
    yesterdays_data = pd.read_csv(f'static/{yesterday}.csv')

    # Remove file from two days ago if it exists:
    try:
        os.remove(f'static/{two_days_ago}.csv')
    except FileNotFoundError:
        pass

    return todays_data, yesterdays_data



def plant_bad_seed(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function was created as a sanity check. I wanted to
    write a function to plant a difference and see what
    happens to the behavior of the `is_following_list_identical()` function.
    """
    bad_seed = {
        'description': ['some test person'],
        'name': ['john doe'],
        'location': ['somewhere idk'],
        'profile_image_url': ['https://picsum.photos/id/237/200/300'],
        'url': [''],
        'verified': [False],
        'protected': [False],
        'id': ['666'],
        'username': ['jd'],
        'created_at': [datetime.now()],
    }
    bad_df = pd.DataFrame(bad_seed)
    return pd.concat([df, bad_df], ignore_index=True, axis=0)


In [37]:
def is_following_list_identical(todays_df: pd.DataFrame, yesterdays_df: pd.DataFrame) -> tuple:
    # Default data contains a lot of extra columns, so they are dropped:
    extra_columns = ['entities.url.urls', 'entities.description.urls',
                     'public_metrics.followers_count', 'public_metrics.following_count',
                     'public_metrics.tweet_count', 'public_metrics.listed_count',
                     'entities.description.hashtags', 'entities.description.mentions',
                     'pinned_tweet_id', 'entities.description.cashtags']
    todays_df.drop(columns=extra_columns, inplace=True)
    yesterdays_df.drop(columns=['Unnamed: 0', *extra_columns], inplace=True)

    # We can assume each user id is unique, so it can be added to a set for easy comparison
    todays_ids = set(todays_df['id'])
    yesterdays_ids = set(yesterdays_df['id'])

    # To find differences that exist in either set, we use the `.symmetric_difference()` method.
    # https://betterprogramming.pub/a-visual-guide-to-set-comparisons-in-python-6ab7edb9ec41
    sym_diff = todays_ids.symmetric_difference(yesterdays_ids)
    no_diff = len(sym_diff) == 0  # if empty, there are no differences


    if no_diff:
        return False, pd.DataFrame()
    else:
        return True, pd.DataFrame()


# td_df = plant_bad_seed(td_df)  # use to test
td_df, yd_df = get_dataframes()
print(is_following_list_identical(todays_df=td_df, yesterdays_df=yd_df))

(True, {1019796804936712192, 1090152391196868608, 869295527392837632, 1318731086508195843, 1362537194834034691, 1064296043855167493, 715578798284996613, 925457674963124230, 1289725608558198790, 39780359, 1370479173597282313, 1411541627206729739, 1349149096909668363, 948609064484864006, 1108561725563912197, '1083368094314188800', '1192454216', 33783830, 596312093, '1674440275', 757891106, 190726179, 1630896181, '30354991', 569720894, '14222536', '6129732', 127090765, 480624719, 386056280, '394417223', 3384541280, 2996502625, '1083130198042656768', '4651814174', '3306693429', '3316533172', '4696012939', '870642460585754624', 14696588, '747851730', 15384720, '15904411', 82018451, '270982936', 3348693154, 247316650, '30188594', 70320323, '829388413', 537968846, '18923480', '24636272', 19628262, '1098034116342231040', '36686040', 373620985, '1181932215447498758', 42467583, 788065764255211520, 1122857919551119360, 876448351268380672, 1115133840995917824, 1150426271018393601, 7184937763065118