# Npath

## Sequence and convertor review

In [None]:
import pandas as pd
from data import get_data
from feature import plot_important_features_prefixspan
from conversion import convertor_review
from divergence import analyze_divergence

###################################################################
# Parameters
###################################################################
# Edit the values in this section before running the notebook; the "Main"
# section below reads them by name.

# Fully-qualified BigQuery table of the GA4 export.
# Replace the placeholders with your own: <project_id>.<dataset>.events_*
table_name = "<project_id>.<dataset>.events_*"
# Number of days to look back
historical_days = 30
# Name of the goal event (passed to get_data together with the page title below)
goal_event = "marketo_form_submit"
# Conversion page title (only the part before any | or - characters)
conversion_page_title = "Thank You"
# Brand name, written as a regex pattern
brand = "(Brand)"
# Whether to balance the classes (rows with converted == 1 vs. converted == 0)
# before running the analyses
balanced = False
# Minimum frequency of a sequence (used by the PrefixSpan plot and the divergence analysis)
min_freq = 5
# Minimum length of a sequence (used by the PrefixSpan plot and the divergence analysis)
min_seq_len = 4

###################################################################
# Notes
###################################################################
# Place service_account.json in the same directory as this file.
# The service account must have access to the BigQuery table above.

###################################################################
# Main
###################################################################
# Load the data for the configured table, look-back window, goal and brand.
data = get_data(table_name, historical_days, goal_event, conversion_page_title, brand)

# Anything other than a non-empty DataFrame is treated as "no data".
# SystemExit is raised directly rather than calling exit(): exit() is a
# convenience injected by the `site` module for interactive shells and is not
# meant to be used in programs. The message is carried by the exception.
if not isinstance(data, pd.DataFrame) or len(data) == 0:
    raise SystemExit('No data found. Please check your parameters.')

# Optionally balance the two classes (converted == 1 vs. converted == 0).
if balanced:
    convertors = data[data.converted == 1]
    non_convertors = data[data.converted == 0]

    # Cap both classes at the size of the smaller one. Sampling more rows than
    # a class contains (without replacement) raises ValueError, which used to
    # happen whenever converters outnumbered non-converters.
    n_per_class = min(len(convertors), len(non_convertors))
    if len(convertors) > n_per_class:
        convertors = convertors.sample(n_per_class)
    non_convertors = non_convertors.sample(n_per_class)

    data = pd.concat([convertors, non_convertors], axis=0).reset_index(drop=True)


# Each analysis below receives its own copy of `data`, so `data` itself stays
# untouched for the later cells.

# Prefix Span: Graph of Sequence Importance
print('Sequence Importance in Conversions')
plot_important_features_prefixspan(data.copy(), top_n=30, min_freq=min_freq, min_seq_len=min_seq_len)

# Convertor Review: Sequence Patterns of Similarity and Anomalies in Non-Convertors that are clustered with Convertors
print('Sequence patterns for non-converting users that are similar to or unique from converting users')
convertor_review(data.copy(), top_n=10, reducer="pca")

# Divergence: Sequence Patterns where Non-Convertors Diverge from Convertors
print('Top Sequences where non-converting users followed a path that diverged from converting users')
results_df = analyze_divergence(data.copy(), top_n=15, min_freq=min_freq, min_seq_len=min_seq_len)

# Print one short report per row of the divergence results; the row index is not needed.
for _, row in results_df.iterrows():

    print(f"Conversion Sequence: {row['conversion_seq']}")
    print(f"Non-Conversion Sequence: {row['non_conversion_seq']}")
    print(f"Diversion Node: {row['diversion']}")
    print(f"Diversion Score: {round(row['divergence_score'], 2)} Frequency: {row['non_conv_freq']}")
    print()


## Review input data

In [None]:
# Preview the first rows of the `data` DataFrame built in the first cell.
data.head()


## Cluster user paths into categories


In [None]:
import os

from cluster import analyze_clusters


###################################################################
# Parameters
###################################################################

# OpenAI API key. It is read from the OPENAI_API_KEY environment variable so
# the real secret never has to be saved inside the notebook. The placeholder
# below is only used when that variable is not set.
openai_api_key = os.environ.get(
    "OPENAI_API_KEY",
    "sk-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
)
# OpenAI model
model = "gpt-3.5-turbo"
# Minimum number of sequences in a topic
min_topic_size = 50

###################################################################
# Main
###################################################################

# Analyze clusters on a copy of `data` (the DataFrame built in the first cell)
# and keep the returned object for further inspection.
topic_model = analyze_clusters(
    data.copy(),
    model=model,
    min_topic_size=min_topic_size,
    api_key=openai_api_key,
)
