In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta
from twarc.client2 import Twarc2
from dotenv import load_dotenv
from pprint import pprint

# Setup environment
load_dotenv()
BEARER_TOKEN = os.getenv('BEARER_TOKEN')
API_KEY = os.getenv('API_KEY')  # cosumer_key
API_KEY_SECRET = os.getenv('API_KEY_SECRET')  # consumer_secret
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')
MY_ID = os.getenv('MY_ID')

# Setup client
t = Twarc2(consumer_key=API_KEY, consumer_secret=API_KEY_SECRET,
           access_token=ACCESS_TOKEN, access_token_secret=ACCESS_TOKEN_SECRET,
           bearer_token=BEARER_TOKEN, )


# Main heavy lifting guy:
def fetch_data() -> list:
    users_i_follow = []
    data_generator = t.following(user=MY_ID)

    # Use generator to repeatedly get the next list of users:
    for obj in data_generator:
        users_i_follow.extend(obj['data']) if 'data' in obj.keys() \
            else users_i_follow.extend(obj['errors'])

    return users_i_follow

In [27]:
def is_following_list_identical(todays_df: pd.DataFrame, yesterdays_df: pd.DataFrame) -> tuple:
    # Default data contains a lot of extra columns, so they are dropped:
    extra_columns = ['Unnamed: 0', 'entities.url.urls', 'entities.description.urls',
                     'public_metrics.followers_count', 'public_metrics.following_count',
                     'public_metrics.tweet_count', 'public_metrics.listed_count',
                     'entities.description.hashtags', 'entities.description.mentions',
                     'pinned_tweet_id', 'entities.description.cashtags']
    todays_df.drop(columns=extra_columns, inplace=True)
    yesterdays_df.drop(columns=extra_columns, inplace=True)

    # We can assume each user id is unique, so it can be added to a set for easy comparison
    todays_ids = set(todays_df['id'])
    yesterdays_ids = set(yesterdays_df['id'])

    # To find differences that exist in either set, we use the `.symmetric_difference()` method.
    # https://betterprogramming.pub/a-visual-guide-to-set-comparisons-in-python-6ab7edb9ec41

    sym_diff = todays_ids.symmetric_difference(yesterdays_ids)
    no_diff = len(sym_diff) == 0  # if empty, there are no differences

    if no_diff:
        return False, sym_diff
    else:
        return True, sym_diff


def plant_bad_seed(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function was created as a sanity check. I wanted to
    write a function to plant a difference and see what
    happens to the behavior of the `is_following_list_identical()` function.
    """
    bad_seed = {
        'description': ['some test person'],
        'name': ['john doe'],
        'location': ['somewhere idk'],
        'profile_image_url': ['https://picsum.photos/id/237/200/300'],
        'url': [''],
        'verified': [False],
        'protected': [False],
        'id': ['666'],
        'username': ['jd'],
        'created_at': [datetime.now()],
    }
    bad_df = pd.DataFrame(bad_seed)
    return pd.concat([df, bad_df], ignore_index=True, axis=0)

# following_today = fetch_data()
# df = pd.json_normalize(following_today)
# df.to_csv(f'static/{today}.csv')
# df.to_csv(f'static/{yesterday}.csv')

today = datetime.strftime(datetime.today(), '%b-%-d-%Y')
yesterday = datetime.strftime((datetime.today() + timedelta(days=-1)), '%b-%-d-%Y')
td_df = pd.read_csv(f'static/{today}.csv')
yd_df = pd.read_csv(f'static/{yesterday}.csv')

# td_df = plant_bad_seed(td_df)
print(is_following_list_identical(todays_df=td_df, yesterdays_df=yd_df))

(False, set())
