# Extracting common event sequences from user behaviour data

In [None]:
import pandas as pd

# Load the raw export and give its columns explicit names.
df = pd.read_csv('original.csv')
df.columns = [
    'user_id',
    'date',
    'event_name',
    'total_users',
    'user_engagement_duration',
    'average_session_duration',
    'screen_page_views',
    'screen_page_views_per_user',
    'event_count',
]

# Only the user, the event and its average session duration are used below.
selected_columns = ['user_id', 'event_name', 'average_session_duration']
user_events = df[selected_columns]

# One group of rows per user.
grouped = user_events.groupby('user_id')

# Map each user_id to its list of (event_name, average_session_duration) pairs.
# NOTE(review): the pairs keep the CSV row order within each user; confirm that
# this is the chronological order the sequence analysis expects.
user_event_dict = {
    uid: list(zip(rows['event_name'], rows['average_session_duration']))
    for uid, rows in grouped
}

# Persist one row per user so the result can be inspected outside the notebook.
output_df = pd.DataFrame([
    {'user_id': uid, 'events_and_durations': pairs}
    for uid, pairs in user_event_dict.items()
])
output_df.to_csv('user_events_and_durations.csv', index=False)
output_df

Unnamed: 0,user_id,events_and_durations
0,100122192,"[(home_page, 1e-06), (login, 0.852997), (notif..."
1,100160846,"[(confirm_order_button, 0.001001), (home_page,..."
2,100376756,"[(app_remove, 0.0)]"
3,100405016,"[(confirm_order_button, 0.002001), (home_deliv..."
4,100412287,"[(confirm_order_button, 0.001001), (home_page,..."
...,...,...
988,799238995,"[(screen_view, 153.7212555), (session_start, 0..."
989,799440267,"[(home_page, 0.0055015), (login, 1.4219895), (..."
990,799528525,"[(login, 1.368989), (screen_view, 691.0530153)..."
991,799800370,"[(login, 1.38399), (notification_receive, 434...."


In [None]:
# A user counts as "successful" when any of their events is
# 'confirm_order_button'; everyone else is "unsuccessful".
successful_users = []
unsuccessful_users = []

for _, row in output_df.iterrows():
    record = {'user_id': row['user_id'], 'events_and_durations': row['events_and_durations']}
    placed_order = any(
        event == 'confirm_order_button' for event, _ in row['events_and_durations']
    )
    bucket = successful_users if placed_order else unsuccessful_users
    bucket.append(record)

# One DataFrame per outcome, each row being a user with their event list.
df_successful = pd.DataFrame(successful_users)
df_unsuccessful = pd.DataFrame(unsuccessful_users)

df_successful

Unnamed: 0,user_id,events_and_durations
0,100160846,"[(confirm_order_button, 0.001001), (home_page,..."
1,100405016,"[(confirm_order_button, 0.002001), (home_deliv..."
2,100412287,"[(confirm_order_button, 0.001001), (home_page,..."
3,100428671,"[(confirm_order_button, 0.001001), (login, 2.0..."
4,100448405,"[(confirm_order_button, 0.001001), (login, 1.0..."
...,...,...
238,745064767,"[(confirm_order_button, 0.001501), (home_page,..."
239,746059301,"[(confirm_order_button, 0.000501), (login, 2.0..."
240,772746004,"[(confirm_order_button, 0.000501), (home_page,..."
241,791728308,"[(confirm_order_button, 0.002001), (login, 2.0..."


In [None]:
# Event names that are kept when building page sequences.
desired_page_names = [
    "home_page",
    "login",
    "order_page",
    "otp_screen_reached",
    "product_category_clicked",
    "repeat_order_clicked",
    "reward_page",
    "session_start",
    "track_page",
]


def filter_pages(events_and_durations):
    """Return the (event, duration) pairs whose event is in desired_page_names.

    The relative order of the kept pairs is unchanged; every other pair is
    dropped.
    """
    return [
        (name, value)
        for name, value in events_and_durations
        if name in desired_page_names
    ]

# Add a 'Filtered Pages' column to both frames, holding only the desired pages.
for frame in (df_successful, df_unsuccessful):
    frame['Filtered Pages'] = frame['events_and_durations'].apply(filter_pages)

# The unfiltered event lists are no longer needed once the filtered column exists.
for frame in (df_successful, df_unsuccessful):
    frame.drop(columns=['events_and_durations'], inplace=True)

# Show both frames, separated by a blank line.
print("Successful Users:")
print(df_successful)
print()

print("Unsuccessful Users:")
print(df_unsuccessful)

Successful Users:
       user_id                                     Filtered Pages
0    100160846  [(home_page, 0.001001), (login, 2.006991), (ot...
1    100405016  [(login, 0.894998), (session_start, 0.0), (tra...
2    100412287  [(home_page, 0.003002), (login, 0.739989), (ot...
3    100428671  [(login, 2.009991), (otp_screen_reached, 0.001...
4    100448405  [(login, 1.064979), (otp_screen_reached, 0.007...
..         ...                                                ...
238  745064767  [(home_page, 0.0006676666667), (login, 2.00532...
239  746059301  [(login, 2.00348), (session_start, 0.0), (trac...
240  772746004  [(home_page, 1.5e-06), (login, 1.2389685), (ot...
241  791728308     [(login, 2.0049955), (track_page, 893.447493)]
242  792066937  [(session_start, 0.0), (home_page, 0.010501), ...

[243 rows x 2 columns]

Unsuccessful Users:
        user_id                                     Filtered Pages
0     100122192  [(home_page, 1e-06), (login, 0.852997), (track...
1     10037

In [None]:


# Output paths for the two filtered frames.
filename_successful = 'df_successful.csv'
filename_unsuccessful = 'df_unsuccessful.csv'

# Write each frame to its CSV file without the index column.
for frame, target in (
    (df_successful, filename_successful),
    (df_unsuccessful, filename_unsuccessful),
):
    frame.to_csv(target, index=False)

In [None]:
# Display the successful-user frame as this cell's output.
df_successful

Unnamed: 0,user_id,Filtered Pages
0,100160846,"[(home_page, 0.001001), (login, 2.006991), (ot..."
1,100405016,"[(login, 0.894998), (session_start, 0.0), (tra..."
2,100412287,"[(home_page, 0.003002), (login, 0.739989), (ot..."
3,100428671,"[(login, 2.009991), (otp_screen_reached, 0.001..."
4,100448405,"[(login, 1.064979), (otp_screen_reached, 0.007..."
...,...,...
238,745064767,"[(home_page, 0.0006676666667), (login, 2.00532..."
239,746059301,"[(login, 2.00348), (session_start, 0.0), (trac..."
240,772746004,"[(home_page, 1.5e-06), (login, 1.2389685), (ot..."
241,791728308,"[(login, 2.0049955), (track_page, 893.447493)]"


In [None]:

from collections import Counter
def extract_sequences_with_durations(filtered_pages):
    """Split (event, duration) pairs into runs of consecutive desired pages.

    A pair whose event is in desired_page_names extends the current run. Any
    other pair closes the current run (if it is non-empty) and is itself
    discarded. Returns the list of runs, each a list of (event, duration)
    tuples; empty runs are never returned.
    """
    runs = []
    pending = []

    for name, value in filtered_pages:
        if name in desired_page_names:
            pending.append((name, value))
            continue
        # Undesired event: flush whatever has been collected so far.
        if pending:
            runs.append(pending)
            pending = []

    # A run still open when the input ends is kept too.
    if pending:
        runs.append(pending)

    return runs

# Flatten every successful user's runs into one list of sequences.
sequences_successful = [
    run
    for user_pages in df_successful['Filtered Pages']
    for run in extract_sequences_with_durations(user_pages)
]

def find_common_patterns(sequences, top_n=20):
    """Return the `top_n` most frequent sequences as (sequence, count) pairs.

    Each sequence is converted to a tuple so it can be counted. Two sequences
    are the same pattern only when all of their elements compare equal, so for
    (event, duration) pairs both the event and the duration must match.
    """
    tally = Counter()
    for seq in sequences:
        tally[tuple(seq)] += 1
    return tally.most_common(top_n)

# Rank the successful users' sequences by how often each one occurs;
# find_common_patterns is called with its default top_n (20).
common_patterns_successful = find_common_patterns(sequences_successful)

In [None]:
# Display the ranked (sequence, count) pairs as this cell's output.
common_patterns_successful

[((('home_page', 0.001001),
   ('login', 2.006991),
   ('otp_screen_reached', 0.001001),
   ('session_start', 0.0),
   ('track_page', 82.7355025)),
  1),
 ((('login', 0.894998),
   ('session_start', 0.0),
   ('track_page', 3.629001),
   ('otp_screen_reached', 1e-06),
   ('session_start', 0.0),
   ('login', 2.008962),
   ('otp_screen_reached', 1e-06),
   ('track_page', 3.715004)),
  1),
 ((('home_page', 0.003002),
   ('login', 0.739989),
   ('otp_screen_reached', 1e-06),
   ('product_category_clicked', 5.027001),
   ('track_page', 125.85401)),
  1),
 ((('login', 2.009991),
   ('otp_screen_reached', 0.001001),
   ('session_start', 0.0),
   ('track_page', 1.438501)),
  1),
 ((('login', 1.064979),
   ('otp_screen_reached', 0.007004),
   ('session_start', 0.0)),
  1),
 ((('home_page', 0.004001),
   ('login', 2.007485),
   ('session_start', 0.0),
   ('track_page', 39.9232014),
   ('order_page', 0.003001),
   ('otp_screen_reached', 0.001001),
   ('session_start', 0.0)),
  1),
 ((('home_page',

In [None]:
def extract_sequences_with_durations(filtered_pages):
    """Group consecutive desired (event, duration) pairs into sequences.

    Pairs whose event is in desired_page_names are appended to the sequence
    currently being built; any other pair ends that sequence and is dropped.
    Returns a list of non-empty sequences (lists of (event, duration) tuples).
    """
    # The last element is always the sequence currently being built.
    sequences = [[]]

    for event, duration in filtered_pages:
        if event in desired_page_names:
            sequences[-1].append((event, duration))
        elif sequences[-1]:
            # Undesired event after a non-empty sequence: start a new one.
            sequences.append([])

    # Discard the trailing placeholder if nothing was added to it.
    if not sequences[-1]:
        sequences.pop()

    return sequences

# Collect the sequences of all successful users into a single flat list.
sequences_successful = [
    sequence
    for user_pages in df_successful['Filtered Pages']
    for sequence in extract_sequences_with_durations(user_pages)
]

def find_common_patterns(sequences, top_n=20):
    """Return the `top_n` most frequent sequences as (pattern, count) tuples.

    Args:
        sequences: iterable of sequences; each is converted to a tuple so it
            can be used as a counting key (its elements must be hashable).
        top_n: maximum number of patterns to return.

    Returns:
        A list of (pattern, count) tuples ordered from most to least frequent.
        Patterns are equal only when all their elements compare equal, so for
        (event, duration) pairs the durations must match exactly as well.
    """
    # most_common already returns a fresh list of (pattern, count) tuples, so
    # no extra copy is needed before returning it.
    return Counter(map(tuple, sequences)).most_common(top_n)

# Rank the successful users' sequences by frequency; find_common_patterns is
# called with its default top_n (20).
common_patterns_successful = find_common_patterns(sequences_successful)

# Print one line per pattern: the tuple of (event, duration) pairs and its count.
print("Most common sequences in successful users:")
for sequence, count in common_patterns_successful:
    print(f"{sequence}: {count} occurrences")

Most common sequences in successful users:
(('home_page', 0.001001), ('login', 2.006991), ('otp_screen_reached', 0.001001), ('session_start', 0.0), ('track_page', 82.7355025)): 1 occurrences
(('login', 0.894998), ('session_start', 0.0), ('track_page', 3.629001), ('otp_screen_reached', 1e-06), ('session_start', 0.0), ('login', 2.008962), ('otp_screen_reached', 1e-06), ('track_page', 3.715004)): 1 occurrences
(('home_page', 0.003002), ('login', 0.739989), ('otp_screen_reached', 1e-06), ('product_category_clicked', 5.027001), ('track_page', 125.85401)): 1 occurrences
(('login', 2.009991), ('otp_screen_reached', 0.001001), ('session_start', 0.0), ('track_page', 1.438501)): 1 occurrences
(('login', 1.064979), ('otp_screen_reached', 0.007004), ('session_start', 0.0)): 1 occurrences
(('home_page', 0.004001), ('login', 2.007485), ('session_start', 0.0), ('track_page', 39.9232014), ('order_page', 0.003001), ('otp_screen_reached', 0.001001), ('session_start', 0.0)): 1 occurrences
(('home_page', 

In [None]:
from collections import Counter

# Per-user lists of (event, duration) pairs for the successful users.
filtered_pages = df_successful['Filtered Pages']

# Flatten all users' pairs into one list.
pages = [entry for user_pages in filtered_pages for entry in user_pages]

# Tally how often each distinct (event, duration) pair occurs.
page_counts = Counter(pages)

# Most frequent pairs first.
sorted_pages = page_counts.most_common()

# Report every pair with its count.
print("Most occurred filtered pages:")
for page, count in sorted_pages:
    print(f"{page}: {count} occurrences")


Most occurred filtered pages:
('session_start', 0.0): 437 occurrences
('otp_screen_reached', 0.001001): 117 occurrences
('otp_screen_reached', 1e-06): 81 occurrences
('home_page', 1e-06): 55 occurrences
('otp_screen_reached', 0.002001): 45 occurrences
('home_page', 0.001001): 39 occurrences
('order_page', 1e-06): 30 occurrences
('order_page', 0.001001): 28 occurrences
('otp_screen_reached', 0.003001): 23 occurrences
('reward_page', 0.001001): 18 occurrences
('track_page', 1e-06): 16 occurrences
('home_page', 0.001002): 16 occurrences
('home_page', 0.002001): 15 occurrences
('reward_page', 1e-06): 14 occurrences
('home_page', 2e-06): 11 occurrences
('login', 2.004992): 11 occurrences
('home_page', 0.002002): 10 occurrences
('track_page', 0.001001): 9 occurrences
('login', 2.000996): 8 occurrences
('order_page', 0.001002): 7 occurrences
('login', 2.002992): 7 occurrences
('login', 2.003992): 7 occurrences
('login', 2.001996): 7 occurrences
('order_page', 0.003001): 6 occurrences
('otp_sc

In [None]:
def round_durations(filtered_pages):
    """Return the (event, duration) pairs with each duration rounded to 2 decimals.

    The events and the order of the pairs are left unchanged.
    """
    return [(name, round(value, 2)) for name, value in filtered_pages]

# Overwrite 'Filtered Pages' with the same pairs, durations rounded to 2 decimals.
df_successful['Filtered Pages'] = df_successful['Filtered Pages'].map(round_durations)

# Show the updated frame as this cell's output.
df_successful

Unnamed: 0,user_id,Filtered Pages
0,100160846,"[(home_page, 0.0), (login, 2.01), (otp_screen_..."
1,100405016,"[(login, 0.89), (session_start, 0.0), (track_p..."
2,100412287,"[(home_page, 0.0), (login, 0.74), (otp_screen_..."
3,100428671,"[(login, 2.01), (otp_screen_reached, 0.0), (se..."
4,100448405,"[(login, 1.06), (otp_screen_reached, 0.01), (s..."
...,...,...
238,745064767,"[(home_page, 0.0), (login, 2.01), (otp_screen_..."
239,746059301,"[(login, 2.0), (session_start, 0.0), (track_pa..."
240,772746004,"[(home_page, 0.0), (login, 1.24), (otp_screen_..."
241,791728308,"[(login, 2.0), (track_page, 893.45)]"


In [None]:
import ast
from collections import Counter


def _parse_pages(value):
    """Return `value` as a list of (event, duration) pairs.

    Values that are already in-memory sequences are returned unchanged. Only
    strings are parsed, and they are parsed with ast.literal_eval, which
    accepts Python literals but never executes code (unlike eval).
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# One sequence of (event, duration) pairs per successful user.
sequences = df_successful['Filtered Pages'].apply(_parse_pages)

# Total duration of each user's sequence (sum of the pair durations).
sequence_durations = sequences.apply(lambda pages: sum(page[1] for page in pages))

# Pair every sequence with its total duration.
sequence_duration_tuples = list(zip(sequences, sequence_durations))

# Count identical sequences; tuples are used because lists are not hashable.
sequence_counts = Counter(tuple(seq) for seq in sequences)

# All distinct sequences, most frequent first.
common_sequences = sequence_counts.most_common()

# (sequence, occurrence count, total duration of the sequence)
common_sequences_durations = [
    (seq, count, sum(page[1] for page in seq))
    for seq, count in common_sequences
]

# Convert to DataFrame for better visualization
common_sequences_df = pd.DataFrame(common_sequences_durations, columns=['Sequence', 'Count', 'Total Duration'])

print(common_sequences_df.head())

                                            Sequence  Count  Total Duration
0                                   ((login, 2.01),)      4            2.01
1  ((login, 2.0), (otp_screen_reached, 0.0), (ses...      2            2.00
2  ((login, 2.01), (otp_screen_reached, 0.0), (se...      2            2.01
3  ((login, 2.0), (otp_screen_reached, 0.0), (ses...      2            2.00
4  ((home_page, 0.0), (login, 2.01), (otp_screen_...      1           84.75


In [None]:
# Display the sequence summary frame as this cell's output.
common_sequences_df

Unnamed: 0,Sequence,Count,Total Duration
0,"((login, 2.01),)",4,2.01
1,"((login, 2.0), (otp_screen_reached, 0.0), (ses...",2,2.00
2,"((login, 2.01), (otp_screen_reached, 0.0), (se...",2,2.01
3,"((login, 2.0), (otp_screen_reached, 0.0), (ses...",2,2.00
4,"((home_page, 0.0), (login, 2.01), (otp_screen_...",1,84.75
...,...,...,...
232,"((home_page, 0.0), (login, 2.01), (otp_screen_...",1,93.75
233,"((login, 2.0), (session_start, 0.0), (track_pa...",1,11.66
234,"((home_page, 0.0), (login, 1.24), (otp_screen_...",1,354.15
235,"((login, 2.0), (track_page, 893.45))",1,895.45


In [None]:
import ast
from collections import Counter


def _parse_pages(value):
    """Return `value` as a list of (event, duration) pairs.

    Values that are already in-memory sequences are returned unchanged. Only
    strings are parsed, and they are parsed with ast.literal_eval, which
    accepts Python literals but never executes code (unlike eval).
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# One sequence of (event, duration) pairs per successful user.
sequences = df_successful['Filtered Pages'].apply(_parse_pages)

# Total duration of each user's sequence (sum of the pair durations).
sequence_durations = sequences.apply(lambda pages: sum(page[1] for page in pages))

# Pair every sequence with its total duration.
sequence_duration_tuples = list(zip(sequences, sequence_durations))

# Count identical sequences; tuples are used because lists are not hashable.
sequence_counts = Counter(tuple(seq) for seq in sequences)

# All distinct sequences, most frequent first.
common_sequences = sequence_counts.most_common()

# (sequence, occurrence count, total duration of the sequence)
common_sequences_durations = [
    (seq, count, sum(page[1] for page in seq))
    for seq, count in common_sequences
]

# Convert to DataFrame for better visualization
common_sequences_df = pd.DataFrame(common_sequences_durations, columns=['Sequence', 'Count', 'Total Duration'])


In [None]:
# Preview the first rows of the sequence summary frame.
common_sequences_df.head()

Unnamed: 0,Sequence,Count,Total Duration
0,"((home_page, 0.001001), (login, 2.006991), (ot...",1,84.744495
1,"((login, 0.894998), (session_start, 0.0), (tra...",1,10.247967
2,"((home_page, 0.003002), (login, 0.739989), (ot...",1,131.624003
3,"((login, 2.009991), (otp_screen_reached, 0.001...",1,3.449493
4,"((login, 1.064979), (otp_screen_reached, 0.007...",1,1.071983
