In [5]:
import pandas as pd
# Widen the console representation so wide frames are not wrapped.
pd.options.display.width = 1000

In [6]:
# Load the three raw event exports (paths are relative to the notebook's working directory).
visit_factory, factory_product_click, product_view = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Event - Visit Factory.csv",
        "Event - Factory Product Click.csv",
        "Event - Product View.csv",
    )
)

In [7]:
def _parse_event_timestamps(raw):
    """Convert a column of 'YYYY/MM/DD/HH/MM/SS' strings to pandas datetimes.

    An hour field of "24" is rewritten to "00" on the same calendar date before
    parsing, because ``%H`` only accepts 00-23.
    NOTE(review): the date is left unchanged for hour-24 values; confirm that
    this matches how the upstream export writes midnight.

    A column that already has a datetime dtype is returned as is, so frames
    whose timestamps were parsed earlier can be passed in again.
    """
    if pd.api.types.is_datetime64_any_dtype(raw):
        return raw
    hour_is_24 = raw.str[11:13] == "24"
    rewritten = raw.str[:11] + "00" + raw.str[13:]
    return pd.to_datetime(raw.where(~hour_is_24, rewritten), format="%Y/%m/%d/%H/%M/%S")


def _link_clicks_to_visits(visits, clicks):
    """Attach to each product click the latest earlier visit of the same session.

    Both frames must already carry parsed ``event_timestamp`` columns. The
    result has one row per (click cookie_id, click timestamp) pair with the
    columns customer_id, ga_session_id, event_timestamp (of the visit),
    event_timestamp_click and factory_product_click.
    """
    session_keys = ['customer_id', 'ga_session_id']

    # Null session keys never satisfy an equality test, but merge() would pair
    # null with null, so they are removed from the click side up front.
    click_side = (
        clicks[['cookie_id', *session_keys, 'event_timestamp', 'event_value']]
        .dropna(subset=session_keys)
        .rename(columns={'event_timestamp': 'event_timestamp_click',
                         'event_value': 'factory_product_click'})
    )
    click_side = click_side.assign(_click_pos=list(range(len(click_side))))

    visit_side = visits[[*session_keys, 'event_timestamp']].rename(
        columns={'event_timestamp': 'event_timestamp_visit'}
    )
    visit_side = visit_side.assign(_visit_pos=list(range(len(visit_side))))

    # Equi-join on the session keys instead of a full cross join: only rows of
    # the same customer and session are ever materialised.
    pairs = click_side.merge(visit_side, on=session_keys, how='inner')
    pairs = (
        pairs[pairs['event_timestamp_click'] > pairs['event_timestamp_visit']]
        # Click-major, visit-minor order makes tie-breaking below independent
        # of how merge() happens to order its output.
        .sort_values(['_click_pos', '_visit_pos'])
        .reset_index(drop=True)
    )

    if not pairs.empty:
        # The closest earlier visit is the one with the greatest timestamp;
        # idxmax keeps the first row on ties. Rows with a null cookie_id form
        # no group and therefore produce no link.
        latest_visit = pairs.groupby(
            ['cookie_id', 'event_timestamp_click']
        )['event_timestamp_visit'].idxmax()
        pairs = pairs.loc[latest_visit.tolist()]

    return (
        pairs[[*session_keys, 'event_timestamp_visit',
               'event_timestamp_click', 'factory_product_click']]
        .rename(columns={'event_timestamp_visit': 'event_timestamp'})
        .reset_index(drop=True)
    )


def _distinct_products(product_view):
    """Build the product lookup (product_name, product_id) from product views.

    ``event_value`` is split once on the literal ``|||`` into name and id. The
    name is stripped *before* de-duplication so whitespace variants of the same
    product collapse into a single lookup row.
    """
    parts = (
        product_view['event_value']
        .str.split(r'\|\|\|', n=1, expand=True)
        # Guarantee both columns even when no value contains the separator.
        .reindex(columns=[0, 1])
    )
    return (
        product_view
        # The event metadata of a product view is not part of the lookup.
        .drop(columns=['report_date', 'cookie_id', 'customer_id', 'ga_session_id',
                       'event_timestamp', 'event_name', 'event_value'],
              errors='ignore')
        .assign(product_name=parts[0].astype(object).str.strip(),
                product_id=parts[1])
        .drop_duplicates(subset=['product_name', 'product_id'])
    )


def factory_linkage(visit_factory,factory_product_click,product_view):
    """Link factory visits to the product clicks that followed them and to product ids.

    Parameters
    ----------
    visit_factory : pandas.DataFrame
        Visit events. Columns used: customer_id, ga_session_id,
        event_timestamp, event_name, event_value.
    factory_product_click : pandas.DataFrame
        Click events. Columns used: cookie_id, customer_id, ga_session_id,
        event_timestamp, event_value.
    product_view : pandas.DataFrame
        Product view events whose ``event_value`` holds ``name|||id``.

    Returns
    -------
    pandas.DataFrame
        The visit rows (``event_value`` renamed to ``visit_factory``,
        ``event_name`` removed) left-joined with ``event_timestamp_click`` and
        ``factory_product_click`` of every click whose closest earlier visit in
        the same customer/session is that row, plus the ``product_id`` looked
        up by click value. Visits without a click keep missing values there.

    The three input frames are not modified, so the function can be called
    repeatedly on the same objects.
    """
    # assign() returns new frames; the caller's raw string timestamps survive.
    visits = visit_factory.assign(
        event_timestamp=_parse_event_timestamps(visit_factory['event_timestamp'])
    )
    clicks = factory_product_click.assign(
        event_timestamp=_parse_event_timestamps(factory_product_click['event_timestamp'])
    )

    links = _link_clicks_to_visits(visits, clicks)
    result_df = visits.merge(
        links,
        on=['customer_id', 'ga_session_id', 'event_timestamp'],
        how='left'
    ).rename(columns={'event_value': 'visit_factory'})

    merged_df = result_df.merge(
        _distinct_products(product_view),
        how='left',
        left_on='factory_product_click',
        right_on='product_name',
        # Only a colliding product-side column is suffixed; visit-side names stay as they are.
        suffixes=('', '_y')
    ).drop(columns=['product_name', 'event_name'])

    return merged_df

In [8]:
# Build the linked visit/click/product table and write it next to the notebook.
full_join = factory_linkage(
    visit_factory=visit_factory,
    factory_product_click=factory_product_click,
    product_view=product_view,
)
full_join.to_csv("full_join.csv")