The purpose of this module is to handle interactions with a database.

Reading, writing, creating, deleting, connecting and more.

Goal 1: connect to google

In [32]:
import os
from pathlib import Path

import datetime as dt
import regex as re

import yaml

import pandas as pd

import requests
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import pygsheets

In [27]:
DB_CONFIG_PATH = "../db_config.yaml"

In [28]:
class Timers():
    """A class for  Timing-stamping cell calls."""
    import datetime as dt

    def exec_time(msg="Completed task"):
        """
        Runtime message tracking cell progress. Prints an message and a timestamp.
        
        Parameters
        -------
        msg (str): User provided message. Defaults to a generic statement.
        
        Returns
        -------
        None
        """
        try:
            now = dt.datetime.now().strftime("%H:%M:%S - %Y-%m-%d")
            print(
                "{msg} Timestamp: {now}".format(msg=msg, now=now)
            )
        except Exception as e:
            print("Warning: unable to Run exec_time.\nRawmessage: {msg}.\n{error}".format(msg=msg, error=e))


class Google():
    """"A class to connect to Google services."""

    import pickle
    import pandas as pd
    SERVICE_ACCOUNT = None

    def google_connect(self, credentials_path=None, service_account_env_var=None) -> (any, pygsheets.client.Client):
        """
        Connects to google drive and spreadsheets. Requires '[...]/client_secrets[...].json" and or 
        a service account variable in the form of a name (str).
        Will create a token in '.' to track authentication. 
        Returns a service object to allow connections to google drive files.
        Warning: do not share your token or anyone will have access to all content on your drive.

        Parameters
        -------
        credentials_path (str): Path to client secrets json.
        service_account_env_var (str): Name of environment variable for google connection.
        
        Returns
        -------
        gdrive (googleapiclient.discovery.Resource): Resource object with connection to google drive.
        gsheets (pygsheets.client.Client): pygsheets client object to manipulate gsheets.
        """
        
        SCOPES = ["https://www.googleapis.com/auth/drive"]
        gdrive, gsheets = None, None

        if credentials_path != None: 
            creds = None 

            # Authentication flow.
            if Path("token.pickle").exists():
                with open("token.pickle", "rb") as token:
                    creds = self.pickle.load(token)
            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    creds.refresh(Request())
                else:
                    flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
                    creds = flow.run_local_server(port=0)
                    # Save access token for future use.
                    with open("token.pickle", "wb") as token:
                        self.pickle.dump(creds, token)

            gdrive = build("drive", "v3", credentials=creds)
            gsheets = pygsheets.authorize(custom_credentials=creds)

        elif service_account_env_var != None:
            # dev note: not getting gdrive in this case yet.
            gsheets = pygsheets.authorize(service_account_env_var=service_account_env_var)

        return gdrive, gsheets
    

    def write_to_googlesheets(
        self, 
        data: pd.DataFrame, 
        gsheetkey: str,
        gsheets: pygsheets.client.Client,
        wks_title: str, 
        row_start="A1",
        
    ) -> None:
        """
        Push DataFrame to Googlesheet via key.

        Parameters
        -------
        data (pd.DataFrame): Dataframe with data to push.
        gsheetkey (str): Key to google sheet.
        gsheets (pygsheets client object): Google sheets connection object.
        data (pd.DataFrame): Dataframe with data to push.
        wks_title (str): Worksheet title.
        row_start (str): Set where the dataframe starting cell will write. Use A1 formatting.
    
        Returns
        -------
        (None)
        """
        df0 = data.copy(deep=True) 

        sh = gsheets.open_by_key(gsheetkey)

        wks = sh.worksheet("title", wks_title)
        wks.clear(start=row_start, end=None)

        if wks.rows < len(df0):
            msg = "Warning: Data rows exceeds worksheet rows available. Expanding worksheet."
            # logger.warning(msg)
            Timers.exec_time(msg)

            wks.resize(rows=len(df0))

        wks.set_dataframe(df0, start=row_start, copy_head=True)

        log_msg = f"Pushed data to gsheet with key:{gsheetkey}"
        # logger.info(log_msg)
        Timers.exec_time(log_msg)


def clean_df(df, drop_columns, pattern, permuted_columns=None) -> pd.DataFrame:
    """
    Takes columns to drop from frame. 
    Cleans up columns using regex pattern matching. 
    If permuted_columns is provided, will re-arrange dataframe columns accordingly.

    Parameters
    -------
    df (pd.DataFrame): Dataframe to process.
    drop_columns ([any]): List of columns to drop.
    pattern (str): Regex expression for column-renaming.
    permuted_columns ([any]): Column re-arrangement list. 

    Returns
    -------
    df (pd.DataFrame): DataFrame.
    """
    df.drop(columns=drop_columns, inplace=True)
    original_columns = df.columns.to_list()
    new_columns = list(map(lambda x: re.sub(pattern, "", x), original_columns))
    columns_dict = dict(zip(original_columns, new_columns))
    df.rename(columns=columns_dict, inplace=True)
    if permuted_columns != None:
        df = df[permuted_columns]
    return df


def update_dataframe_conditionally(
    new_df: pd.DataFrame, 
    original_df: pd.DataFrame,
    merge_columns: list,
    fixed_columns: list,
    update_columns: list,
    sort_by=None,
    ascending=False,
) -> pd.DataFrame:
    """
    Update using provided conditions as lists.

    Parameters
    -------
    new_df (pd.DataFrame): Dataframe with new data.
    original_df (pd.DataFrame): Dataframe with original data.
    merge_columns (['str']): Columns to merge data on. 
    fixed_columns (['str']): Columns to preserve data.
    update_columns (['str']): Columns to update data.
    sort_by (any): Column(s) to sort returning DataFrame. 
    ascending (bool): Direction to sort. Defaults to descending order.

    Returns
    -------
    updated_df (pd.DataFrame): Returns updated DataFrame
    """
    SUFFIXES = ("_new", "_old")
    updated_df = None

    # Reg ex pattern.
    pattern = r"|".join(SUFFIXES)

    # Double-check that update_columns does not conflict with merge columns.
    update_columns = [col for col in update_columns if not col in merge_columns]

    # Start flow.
    original_columns = original_df.columns.to_list()
    merged_df = new_df.merge(original_df, how="outer", on=merge_columns, suffixes=SUFFIXES)
    update_data = new_df.merge(original_df, how="inner", on=merge_columns, suffixes=SUFFIXES)

    # New data.
    drop_columns = [col + SUFFIXES[1] for col in fixed_columns + update_columns]
    new_data = merged_df[merged_df[drop_columns].isna().all(axis=1)]
    new_data = clean_df(new_data, drop_columns, pattern, permuted_columns=original_columns)

    # No-change data.
    drop_columns = [col + SUFFIXES[0] for col in fixed_columns + update_columns]
    no_change_data = merged_df[merged_df[drop_columns].isna().all(axis=1)]
    no_change_data = clean_df(no_change_data, drop_columns, pattern, permuted_columns=original_columns)

    # Update data.
    drop_columns = [col + SUFFIXES[0] for col in fixed_columns] + [col + SUFFIXES[1] for col in update_columns]
    update_data = clean_df(update_data, drop_columns, pattern, permuted_columns=original_columns)

    updated_df = pd.concat([new_data, no_change_data, update_data])
    if sort_by != None:
        updated_df.sort_values(sort_by, ascending=ascending)

    return updated_df

In [29]:
# args
credentials_path = r"/Users/jaimemerizalde/Desktop/JOBS 2023/software/jmailer/secrets/db_secret.json"

# gkey
db_identifier = "1t1wGAQvZuwEWOOgcgtBaqbZoafG_ZCfTV5QGyMfYHTg"

table_identifier = "contacts"

#filepath or list.
recipients = [
    "marco.starger@getgarner.com", 
    "austin.lovell@getgarner.com", 
    "evelyn.siu@getgarner.com",
]



In [12]:
db_configs = yaml.safe_load(open(DB_CONFIG_PATH))

In [13]:
db_configs

# now parse them. 

# if you are going to have settings, make sure you HAVE to fetch these.
merge_columns = db_configs["column_configs"]["merge_columns"]
fixed_columns = db_configs["column_configs"]["fixed_columns"]
update_columns = db_configs["column_configs"]["update_columns"]
sort_by = db_configs["column_configs"].get("sort_by", None)

In [14]:
db_configs

{'column_configs': {'merge_columns': ['FIRST_NAME',
   'LAST_NAME',
   'EMAIL',
   'COMPANY'],
  'fixed_columns': ['FIRST_OUTREACH', 'CREATEDATETIME'],
  'update_columns': ['LAST_OUTREACH'],
  'sort_by': 'CREATEDATETIME'}}

In [151]:
# STart the db work flow

In [15]:

# db connectivity
# to be specified in an argparse config file for convenience

# Google connectivity
gg = Google()
_, gsheets = gg.google_connect(credentials_path=credentials_path)

In [16]:
# database fetcher
sh = gsheets.open_by_key(db_identifier)


In [17]:
# schema fetcher
worksheets = sh.worksheets()
# titles = [wk.title for wk in worksheets]
#wks_dict = dict(zip(titles, worksheets))
#wks_dfs = dict(zip(titles, [wk.get_as_df() for wk in worksheets]))

In [18]:
# table fetcher
wks = sh.worksheet("title", table_identifier)

In [None]:
# completes the connection steps.

In [19]:
#  table data  to dataframe
wks.get_as_df()


Unnamed: 0,CREATEDATETIME,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH,FIRST_OUTREACH
0,2023-09-08,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923


In [229]:
# drive



In [22]:
# the next step is to....

# collect recipient data using cla

# basically, we should have a table-updateer "script" 

# this class or method (originally considered writing a script) is responsible for updating a table provided the data we need.
# credentials as well. 

original_df = wks.get_as_df()


In [23]:
# delete me
# save this data just in case bad connection
# import pickle
# original_df.to_pickle("/Users/jaimemerizalde/Desktop/tempdata.pkl")

What do we want next?  

participant data...  

In [35]:
# so given:

recipients

# give me one of these two 
config_path = "/Users/jaimemerizalde/Desktop/JOBS 2023/software/jmailer/config.yaml"
clearbit_api_key = None 

if config_path != None:
    config = yaml.safe_load(open(config_path))
    credentials = config["credentials"]
    clearbit_api_key = credentials["clearbit"]["api_key"]

# take the config 
recipient_data = {}
for recipient in recipients:
    url = f"https://person.clearbit.com/v2/combined/find?email=:{recipient}"
    # what the hell is giong on?

    clearbit_response = requests.get(url, auth=(clearbit_api_key, None))
    recipient_data[recipient] = clearbit_response

# JSONIFY the data
# Now put it together in a dictionary.
recipient_push_data = {}
for recipient, response in recipient_data.items():
    response_json = response.json()
    recipient_push_data[recipient] = {
        "CREATEDATETIME":  dt.datetime.today().strftime('%Y-%m-%d'), # IF DOES NOT EXIST: dt.datetime.today().strftime('%Y-%m-%d')
        "FIRST_NAME": response_json["person"]["name"]["givenName"],
        "LAST_NAME": response_json["person"]["name"]["familyName"],
        "EMAIL": recipient,
        "COMPANY": response_json["company"]["name"],
        "LAST_OUTREACH":  dt.datetime.today().strftime('%Y-%m-%d'),
        "FIRST_OUTREACH": dt.datetime.today().strftime('%Y-%m-%d'), # IF DOES NOT EXIST: dt.datetime.today().strftime('%Y-%m-%d')
    }

recipient_push_data = pd.DataFrame.from_dict(recipient_push_data, orient="index")
recipient_push_data.reset_index(drop=True, inplace=True)


NameError: name 'requests' is not defined

In [176]:
recipient_push_data

Unnamed: 0,CREATEDATETIME,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH,FIRST_OUTREACH
0,2023-09-08,Marco,Starger,marco.starger@getgarner.com,Garner Health,2023-09-08,2023-09-08
1,2023-09-08,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,2023-09-08,2023-09-08
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08


In [None]:

# complete data fetching flows.

Now that you have the data fetching work, you can do the data blending work.

In [171]:
original_df

Unnamed: 0,CREATEDATETIME,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH,FIRST_OUTREACH
0,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923
1,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923


In [167]:
# merged_columns = []

df_update = update_dataframe_conditionally(
    recipient_push_data,
    original_df, 
    merge_columns,
    fixed_columns,
    update_columns,
    sort_by="CREATEDATETIME",
    ascending=False,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=columns_dict, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=columns_dict, in

I'D SAY that if you have a config file, then you can't really have it mess up. 

Now you have to push the data out.

In [274]:
gg.write_to_googlesheets(df_update, db_identifier, gsheets, table_identifier, row_start="A1")

Pushed data to gsheet with key:1t1wGAQvZuwEWOOgcgtBaqbZoafG_ZCfTV5QGyMfYHTg Timestamp: 22:52:42 - 2023-09-08


Looks like it was successful but now we need to test and see if it can be called AGAIN with "mildly skwed data" and see if 
it's going to mess up what's already in the table. 

In [372]:
df_original = wks.get_as_df()
df_original


row1 = df_original.iloc[[0]].copy(deep=True)
row1["CREATEDATETIME"] = "X-Y-Z"

push_data_2 = df_original.copy(deep=True)

push_data_2.iloc[[0]] = row1
push_data_2


# df_update_2 = update_dataframe_conditionally(
#     push_data_2,
#     df_original, 
#     merge_columns,
#     fixed_columns,
#     update_columns,
#     sort_by="CREATEDATETIME",
#     ascending=False,
# )

Unnamed: 0,CREATEDATETIME,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH,FIRST_OUTREACH
0,X-Y-Z,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923


In [None]:
#SO i think just one arg was out of pocket adn we can quickly resolve this.

# 

In [322]:
# this isn't good.

Unnamed: 0,CREATEDATETIME,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH,FIRST_OUTREACH
0,X-Y-Z,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923


In [338]:
# def update_dataframe_conditionally(
#     new_df: pd.DataFrame, 
#     original_df: pd.DataFrame,
#     merge_columns: list,
#     fixed_columns: list,
#     update_columns: list,
#     sort_by=None,
#     ascending=False,
# ) -> pd.DataFrame:
#     """
#     Update using provided conditions as lists.

#     Parameters
#     -------
#     new_df (pd.DataFrame): Dataframe with new data.
#     original_df (pd.DataFrame): Dataframe with original data.
#     merge_columns (['str']): Columns to merge data on. 
#     fixed_columns (['str']): Columns to preserve data.
#     update_columns (['str']): Columns to update data.
#     sort_by (any): Column(s) to sort returning DataFrame. 
#     ascending (bool): Direction to sort. Defaults to descending order.

#     Returns
#     -------
#     updated_df (pd.DataFrame): Returns updated DataFrame
#     """

new_df = push_data_2.copy(deep=True)
original_df = df_original.copy(deep=True)


SUFFIXES = ("_new", "_old")
updated_df = None

# Reg ex pattern.
pattern = r"|".join(SUFFIXES)

# Double-check that update_columns does not conflict with merge columns.
update_columns = [col for col in update_columns if not col in merge_columns]

# Start flow.
original_columns = original_df.columns.to_list()
merged_df = new_df.merge(original_df, how="outer", on=merge_columns, suffixes=SUFFIXES)
update_data = new_df.merge(original_df, how="inner", on=merge_columns, suffixes=SUFFIXES)


# New data.
drop_columns = [col + SUFFIXES[1] for col in fixed_columns + update_columns]
new_data = merged_df[merged_df[drop_columns].isna().all(axis=1)]
new_data = clean_df(new_data, drop_columns, pattern, permuted_columns=original_columns)

# No-change data.
drop_columns = [col + SUFFIXES[0] for col in fixed_columns + update_columns]
no_change_data = merged_df[merged_df[drop_columns].isna().all(axis=1)]
no_change_data = clean_df(no_change_data, drop_columns, pattern, permuted_columns=original_columns)

# Update data.
drop_columns = [col + SUFFIXES[1] for col in update_columns] + [col + SUFFIXES[1] for col in fixed_columns]
update_data = clean_df(update_data, drop_columns, pattern, permuted_columns=original_columns)

# # updated_df = pd.concat([new_data, no_change_data, update_data])
# #     if sort_by != None:
# #         updated_df.sort_values(sort_by, ascending=ascending)

# #     return updated_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=columns_dict, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=columns_dict, in

In [334]:
merged_df # makes sense they have everything in common so outer merge wont' do a thing 

Unnamed: 0,CREATEDATETIME_new,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH_new,FIRST_OUTREACH_new,CREATEDATETIME_old,LAST_OUTREACH_old,FIRST_OUTREACH_old
0,X-Y-Z,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023,2023-09-08,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023,09/08/2023,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08,2023-09-08,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923


In [351]:
update_data # makes sens, buecase we are saying all data potentially can exist!

Unnamed: 0,CREATEDATETIME_new,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH_new,FIRST_OUTREACH_new,CREATEDATETIME_old,LAST_OUTREACH_old,FIRST_OUTREACH_old
0,X-Y-Z,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023,2023-09-08,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023,09/08/2023,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08,2023-09-08,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923


In [350]:
new_data 

Unnamed: 0,CREATEDATETIME,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH,FIRST_OUTREACH


In [348]:
no_change_data # there isn't any no-change data. this is a problem no? # actually I think this makes sense

# reason being that all the data is present in a potential update. The no_change_data is literally stuff that didn't appear in the new 
# data. But since all of it appears there is no, no_change_data

Unnamed: 0,CREATEDATETIME_new,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH_new,FIRST_OUTREACH_new,CREATEDATETIME_old,LAST_OUTREACH_old,FIRST_OUTREACH_old


In [353]:
drop_columns = [col + SUFFIXES[0] for col in update_columns] + [col + SUFFIXES[1] for col in fixed_columns]
drop_columns
# update_data = clean_df(update_data, drop_columns, pattern, permuted_columns=original_columns)

['LAST_OUTREACH_new', 'FIRST_OUTREACH_old', 'CREATEDATETIME_old']

In [None]:
# basically this broke the code.

# what's a good and fast solution to this problem? Just

In [None]:
# work shopping this problem

In [354]:
new_df = push_data_2.copy(deep=True)
original_df = df_original.copy(deep=True)


SUFFIXES = ("_new", "_old")
updated_df = None

# Reg ex pattern.
pattern = r"|".join(SUFFIXES)

# Double-check that update_columns does not conflict with merge columns.
update_columns = [col for col in update_columns if not col in merge_columns]

# Start flow.
original_columns = original_df.columns.to_list()
merged_df = new_df.merge(original_df, how="outer", on=merge_columns, suffixes=SUFFIXES)

# update_data = new_df.merge(original_df, how="inner", on=merge_columns, suffixes=SUFFIXES)



In [355]:
merged_df # this is what it's like to have a lot of it together.

# outer merge has EVERYTHING


Unnamed: 0,CREATEDATETIME_new,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH_new,FIRST_OUTREACH_new,CREATEDATETIME_old,LAST_OUTREACH_old,FIRST_OUTREACH_old
0,X-Y-Z,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023,2023-09-08,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023,09/08/2023,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08,2023-09-08,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923


In [362]:
update_data = new_df.merge(original_df, how="inner", on=merge_columns, suffixes=SUFFIXES)
update_data

# this is what they both have in common. That's okay to have every piece of data be touched.

# it's the update scheme that matters. 


# # Update data.
# drop_columns = [col + SUFFIXES[0] for col in update_columns] + [col + SUFFIXES[1] for col in fixed_columns]
# # drop_columns # this one which is the last outreach data, first outreach data
# # update_data = clean_df(update_data, drop_columns, pattern, permuted_columns=original_columns)

# # update_data

Unnamed: 0,CREATEDATETIME_new,FIRST_NAME,LAST_NAME,EMAIL,COMPANY,LAST_OUTREACH_new,FIRST_OUTREACH_new,CREATEDATETIME_old,LAST_OUTREACH_old,FIRST_OUTREACH_old
0,X-Y-Z,Marco,Starger,marco.starger@getgarner.com,Garner Health,09/08/2023,09/08/2023,2023-09-08,09/08/2023,09/08/2023
1,09/08/2023,Austin,Lovell,austin.lovell@getgarner.com,Garner Health,09/08/2023,09/08/2023,09/08/2023,09/08/2023,09/08/2023
2,2023-09-08,Evelyn,Siu,evelyn.siu@getgarner.com,Garner Health,2023-09-08,2023-09-08,2023-09-08,2023-09-08,2023-09-08
3,09/04/2023,KEVIN,,kevin@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923
4,09/04/2023,JUSTIN,BANYS,justinas.banys@getgarner.com,GARNER HEALTH,09/02/2923,09/02/2923,09/04/2023,09/02/2923,09/02/2923


['LAST_OUTREACH']

In [373]:
# For update columns what you'd like to do is 
# DEFINITELy don't take the new columns that are supposed to be in 

fixed_columns #< make sure to add "new" to this and drop it 


drop_columns = [col + SUFFIXES[0] for col in fixed_columns] + [col + SUFFIXES[1] for col in update_columns]
drop_columns
 # you definitely want to drop this

['FIRST_OUTREACH_new', 'CREATEDATETIME_new', 'LAST_OUTREACH_old']

What were you working on? t fixing up the last update to the fix. 

In [None]:
# Eventually you'll end up down here and putting together all the tools still in sketch mode  from above. 