In [1]:
import pandas as pd
import numpy as np
# from retentioneering import datasets
from typing import Literal, Union, List, Optional, Iterable, get_args, Dict, Tuple
import numpy as np 
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def generate_user_events_data(num_users=1000, num_events=10, num_cities=10, num_os=2, num_rows=5000, seed=42):
    """
    Функция для генерации данных о событиях пользователей.
    """
    # Устанавливаем seed для генератора случайных чисел
    np.random.seed(seed)

    # Создаем список пользователей
    users = ['user' + str(i) for i in range(1, num_users+1)]

    # Создаем список событий
    events = ['event' + str(i) for i in range(1, num_events+1)]

    # Создаем список дат
    dates = [datetime.datetime.now() - datetime.timedelta(days=x) for x in range(180)]

    # Создаем список городов
    cities = ['city' + str(i) for i in range(1, num_cities+1)]

    # Создаем список операционных систем
    os = ['iOS', 'Android']

    # Создаем DataFrame
    df = pd.DataFrame({
        'user_id': np.random.choice(users, num_rows),
        'event': np.random.choice(events, num_rows),
        'event_datetime': np.random.choice(dates, num_rows),
        'city': np.random.choice(cities, num_rows),
        'operation_system': np.random.choice(os, num_rows)
    })

    return df

# Importing custom modules

In [3]:
import sys

sys.path.append('d:\\diplom\\product\\insight_pulse')

for path in sys.path:
    print(path)

D:\pythonProg\Python\Python312\python312.zip
D:\pythonProg\Python\Python312\DLLs
D:\pythonProg\Python\Python312\Lib
D:\pythonProg\Python\Python312
d:\diplom\.venv

d:\diplom\.venv\Lib\site-packages
d:\diplom\.venv\Lib\site-packages\win32
d:\diplom\.venv\Lib\site-packages\win32\lib
d:\diplom\.venv\Lib\site-packages\Pythonwin
d:\diplom\product\insight_pulse


In [4]:
from eventframing.eventframe import EventFrame
from eventframing.cols_schema import EventFrameColsSchema
from eventframing.event_type import EventType

from data_preprocessing.data_preprocessor import DataPreprocessor
from data_preprocessing.preprocessors_lib.add_start_end_events import AddStartEndEventsPreprocessor
from data_preprocessing.preprocessors_lib.split_sessions import SplitSessionsPreprocessor
from data_preprocessing.preprocessors_lib.add_cohorts_preprocessor import AddCohortsPreprocessor

from tooling.funnel.funnel import Funnel

# Testing

## Funnel test

In [5]:
data = generate_user_events_data(num_rows=5000)
cols_schema = {'user_id': 'user_id', 'event_timestamp': 'event_datetime', 'event_name': 'event'}
ef = EventFrame(data, cols_schema)
ef.to_dataframe().head()


Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index
1357,user1,event2,2024-10-19 18:55:46.145574,city5,Android,1357,raw,2
4234,user1,event6,2024-10-21 18:55:46.145574,city5,Android,4234,raw,2
2760,user1,event2,2024-11-10 18:55:46.145574,city6,iOS,2760,raw,2
4934,user1,event9,2024-11-22 18:55:46.145574,city3,iOS,4934,raw,2
1049,user1,event10,2024-12-20 18:55:46.145574,city1,iOS,1049,raw,2


In [6]:
funnel = Funnel()

In [18]:
users = data['user_id'].unique()
n_users = users.shape[0]
ef_data = ef.to_dataframe()

segments = [
    ef_data[ef_data['user_id'].isin(users[:n_users//2 + 10])].index,
    ef_data[ef_data['user_id'].isin(users[n_users//2:])].index,
    ef_data[ef_data['user_id'].isin(users[:n_users//2 + 10])].index,
]


In [15]:
segments

[Index([  16,  672, 2663,    0, 4526, 2513, 1093,  579, 4791, 3188,
        ...
        3420, 1038, 2735,  289, 1486, 3123,  759,  631, 3099, 4798],
       dtype='int64', length=2917),
 Index([1357, 4234, 2760, 4934, 1049, 4511, 2839, 1585,  897, 3029,
        ...
        3605, 2003, 3425, 1408, 1333, 1837, 2164, 3787, 4465, 4627],
       dtype='int64', length=2141)]

In [19]:
funnel.fit(
    ef, 
    stages=[['event1'], ['event2', 'event5', 'even4'], ['event3']],
    # stages=['event3'], 
    funnel_type='closed', inside_session=False, 
    stages_names=['1', '2', '3'],
    segments=segments
)

event1
event2__event5__even4
event3
event1
event2__event5__even4
event3
event1
event2__event5__even4
event3


Unnamed: 0,stage,users_count,segment
0,1,216,segment_0
1,2,96,segment_0
2,3,22,segment_0
0,1,184,segment_1
1,2,58,segment_1
2,3,15,segment_1
0,1,216,segment_2
1,2,96,segment_2
2,3,22,segment_2


In [20]:
funnel.plot()

In [21]:
funnel.values

Unnamed: 0_level_0,Unnamed: 1_level_0,stage,users_count,percent_of_previous,percent_of_initial
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
segment_0,0,1,216,100.0,100.0
segment_0,1,2,96,44.444444,44.444444
segment_0,2,3,22,22.916667,10.185185
segment_1,0,1,184,100.0,100.0
segment_1,1,2,58,31.521739,31.521739
segment_1,2,3,15,25.862069,8.152174
segment_2,0,1,216,100.0,100.0
segment_2,1,2,96,44.444444,44.444444
segment_2,2,3,22,22.916667,10.185185


In [11]:
data = ef.data.copy()
cols_schema = ef.cols_schema
stages=['event1', ['event2', 'event5'], 'event3']
# stages=['event1', 'event2', 'event3']
stages_names = ['event1_name', 'event2_name', 'event3_name']


def _collapse_stages(self, data: pd.DataFrame, cols_schema: EventFrameColsSchema,
                    stages: List[Union[str, List[str]]], stages_names: Optional[List[str]]) -> pd.DataFrame:
    new_stages = []
    multiple_stages = []
    new_stages_names = []

    collapse_dict = dict()

    for stage in stages:
        if isinstance(stage, str):
            new_stages.append(stage)
            new_stages_names.append(stage)
        else:
            multiple_stages.append(stage)
            new_stage_name = '__'.join(stage)
            new_stages.append(new_stage_name)
            new_stages_names.append(new_stage_name)
            for sub_stage in stage:
                collapse_dict[sub_stage] = new_stage_name
    
    if stages_names is None:
        stages_names = new_stages_names

    data_copy = data.copy()
    event_col = cols_schema.event_name
    data_copy[event_col] = data_copy[event_col].replace(collapse_dict)

    return data_copy, new_stages, stages_names 


data_copy, new_stages, stages_names = _collapse_stages(data, data, cols_schema, stages, stages_names)
data_copy.event.unique()
# new_stages
# stages_names

array(['event8', 'event7', 'event6', 'event2__event5', 'event1', 'event9',
       'event3', 'event10', 'event4'], dtype=object)

In [12]:
new_stages

['event1', 'event2__event5', 'event3']

In [13]:
stages_names

['event1_name', 'event2_name', 'event3_name']