In [1]:
import pandas as pd
import numpy as np
# from retentioneering import datasets
from typing import Literal, Union, List, Optional, Iterable, get_args, Dict, Tuple
import numpy as np 
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def generate_user_events_data(num_users=1000, num_events=10, num_cities=10, num_os=2, num_rows=5000, seed=42,
                              end_date=None, num_days=180):
    """
    Generate a synthetic DataFrame of user events.

    Every column is drawn independently and uniformly (with replacement) from
    its own pool of values via ``np.random.choice``.

    Parameters
    ----------
    num_users : int
        Size of the user pool; ids are ``'user1'`` .. ``'user<num_users>'``.
    num_events : int
        Size of the event pool; names are ``'event1'`` .. ``'event<num_events>'``.
    num_cities : int
        Size of the city pool; names are ``'city1'`` .. ``'city<num_cities>'``.
    num_os : int
        Number of operating systems to draw from. The first two are ``'iOS'``
        and ``'Android'``; any further ones are named ``'os3'``, ``'os4'``, ...
    num_rows : int
        Number of rows in the returned frame.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy random state as a side effect.
    end_date : datetime.datetime, optional
        Most recent timestamp in the date pool. Defaults to the current time,
        captured once per call. Pass a fixed value to make the timestamps
        reproducible as well.
    num_days : int
        Number of consecutive daily timestamps in the pool, counting backwards
        from ``end_date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``event``, ``event_datetime``, ``city`` and
        ``operation_system`` with ``num_rows`` rows.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when a pool is empty (for example
        ``num_os=0``) while ``num_rows`` is positive.
    """
    # Seed the global generator so the draws are repeatable for a given seed.
    np.random.seed(seed)

    users = [f'user{i}' for i in range(1, num_users + 1)]
    events = [f'event{i}' for i in range(1, num_events + 1)]

    # Capture the anchor once: calling now() per element makes the generated
    # timestamps differ from each other by a few microseconds.
    anchor = datetime.datetime.now() if end_date is None else end_date
    dates = [anchor - datetime.timedelta(days=offset) for offset in range(num_days)]

    cities = [f'city{i}' for i in range(1, num_cities + 1)]

    # Honour num_os (it used to be ignored); the default of 2 keeps the
    # original ['iOS', 'Android'] pool.
    known_os = ['iOS', 'Android']
    os_names = known_os[:num_os] + [f'os{i}' for i in range(len(known_os) + 1, num_os + 1)]

    # The draw order below (dict insertion order) determines which random
    # numbers each column receives, so keep it stable.
    df = pd.DataFrame({
        'user_id': np.random.choice(users, num_rows),
        'event': np.random.choice(events, num_rows),
        'event_datetime': np.random.choice(dates, num_rows),
        'city': np.random.choice(cities, num_rows),
        'operation_system': np.random.choice(os_names, num_rows)
    })

    return df

# Importing custom modules

In [3]:
import sys

# Make the local insight_pulse sources importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from the notebook location or an environment variable.
project_root = 'd:\\diplom\\product\\insight_pulse'

# Guard the append so re-running the cell does not add duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Show the effective search path for debugging import problems.
for path in sys.path:
    print(path)

D:\pythonProg\Python\Python312\python312.zip
D:\pythonProg\Python\Python312\DLLs
D:\pythonProg\Python\Python312\Lib
D:\pythonProg\Python\Python312
d:\diplom\.venv

d:\diplom\.venv\Lib\site-packages
d:\diplom\.venv\Lib\site-packages\win32
d:\diplom\.venv\Lib\site-packages\win32\lib
d:\diplom\.venv\Lib\site-packages\Pythonwin
d:\diplom\product\insight_pulse


In [4]:
from eventframing.eventframe import EventFrame
from eventframing.cols_schema import EventFrameColsSchema
from eventframing.event_type import EventType

from data_preprocessing.data_preprocessor import DataPreprocessor
from data_preprocessing.preprocessors_lib.add_start_end_events import AddStartEndEventsPreprocessor
from data_preprocessing.preprocessors_lib.split_sessions import SplitSessionsPreprocessor
from data_preprocessing.preprocessors_lib.add_cohorts_preprocessor import AddCohortsPreprocessor

# Testing

## AddStartEndEventsPreprocessor test

In [5]:
data = generate_user_events_data(num_rows=5000)

In [6]:
# Earliest and latest raw event timestamp for every user.
user_info = (
    data
    .groupby('user_id')
    .agg(
        first_event=('event_datetime', 'min'),
        last_event=('event_datetime', 'max'),
    )
    .reset_index()
)

In [7]:
user_info['first_event'].dt.to_period('M').value_counts()

first_event
2024-09    387
2024-08    275
2024-10    185
2024-11     82
2024-12     42
2025-01     13
2025-02      5
Freq: M, Name: count, dtype: int64

In [8]:
user_info['last_event'].dt.to_period('M').value_counts()

last_event
2025-01    380
2025-02    339
2024-12    161
2024-11     58
2024-10     32
2024-09     14
2024-08      5
Freq: M, Name: count, dtype: int64

In [9]:
data.head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system
0,user103,event8,2024-09-15 13:22:53.422852,city6,iOS
1,user436,event7,2024-12-25 13:22:53.422852,city3,iOS
2,user861,event6,2024-09-24 13:22:53.422852,city7,Android
3,user271,event7,2024-11-22 13:22:53.422852,city7,iOS
4,user107,event5,2024-09-16 13:22:53.422852,city4,iOS


In [10]:
# Mapping from schema role names (keys) to the matching column names of `data` (values).
# NOTE(review): the accepted role names are defined by EventFrame, which is not shown here.
cols_schema = {'user_id': 'user_id', 'event_timestamp': 'event_datetime', 'event_name': 'event'}
# Wrap the generated events in the project's EventFrame and preview the first rows.
ef = EventFrame(data, cols_schema)
ef.to_dataframe().head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index
1357,user1,event2,2024-10-18 13:22:53.422852,city5,Android,1357,raw,2
4234,user1,event6,2024-10-20 13:22:53.422852,city5,Android,4234,raw,2
2760,user1,event2,2024-11-09 13:22:53.422852,city6,iOS,2760,raw,2
4934,user1,event9,2024-11-21 13:22:53.422852,city3,iOS,4934,raw,2
1049,user1,event10,2024-12-19 13:22:53.422852,city1,iOS,1049,raw,2


In [11]:
asev = AddStartEndEventsPreprocessor()


In [12]:
pr_ef = asev.apply(ef)

In [13]:
pr_ef.to_dataframe().head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index
5000,user1,path_start,2024-10-18 13:22:53.422852,city5,Android,user1_path_start,path_start,0
0,user1,event2,2024-10-18 13:22:53.422852,city5,Android,1357,raw,2
1,user1,event6,2024-10-20 13:22:53.422852,city5,Android,4234,raw,2
2,user1,event2,2024-11-09 13:22:53.422852,city6,iOS,2760,raw,2
3,user1,event9,2024-11-21 13:22:53.422852,city3,iOS,4934,raw,2


In [14]:
asev.apply(pr_ef).to_dataframe().head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index
5000,user1,path_start,2024-10-18 13:22:53.422852,city5,Android,user1_path_start,path_start,0
0,user1,event2,2024-10-18 13:22:53.422852,city5,Android,1357,raw,2
1,user1,event6,2024-10-20 13:22:53.422852,city5,Android,4234,raw,2
2,user1,event2,2024-11-09 13:22:53.422852,city6,iOS,2760,raw,2
3,user1,event9,2024-11-21 13:22:53.422852,city3,iOS,4934,raw,2


In [13]:
data

Unnamed: 0,user_id,event,event_datetime,city,operation_system
0,user103,event8,2024-09-15 11:16:19.695082,city6,iOS
1,user436,event7,2024-12-25 11:16:19.695082,city3,iOS
2,user861,event6,2024-09-24 11:16:19.695082,city7,Android
3,user271,event7,2024-11-22 11:16:19.695082,city7,iOS
4,user107,event5,2024-09-16 11:16:19.695082,city4,iOS
...,...,...,...,...,...
4995,user290,event2,2024-10-17 11:16:19.695082,city9,iOS
4996,user295,event9,2024-12-24 11:16:19.695082,city10,iOS
4997,user451,event1,2025-02-10 11:16:19.695082,city10,Android
4998,user904,event7,2025-02-02 11:16:19.695082,city9,iOS


In [13]:
pr_ef_data = pr_ef.to_dataframe()

In [14]:
# The set of users must be the same size before and after preprocessing.
assert pr_ef_data['user_id'].nunique() == ef.to_dataframe()['user_id'].nunique()
# Rows typed 'raw' must have the same shape as the original event frame.
assert pr_ef_data.loc[pr_ef_data['event_type'] == 'raw'].shape == ef.to_dataframe().shape

In [15]:
# The number of non-raw rows must be exactly twice the number of users
# (presumably one start and one end marker per user).
assert len(pr_ef_data.loc[pr_ef_data['event_type'] != 'raw']) == 2 * ef.to_dataframe()['user_id'].nunique()

In [16]:
# Earliest and latest timestamp of the non-raw (preprocessor-added) rows per user.
user_preprocess_info = (
    pr_ef_data
    .loc[pr_ef_data['event_type'] != 'raw']
    .groupby('user_id')
    .agg(
        first_event_preprocess=('event_datetime', 'min'),
        last_event_preprocess=('event_datetime', 'max'),
    )
    .reset_index()
)

In [17]:
user_preprocess_info

Unnamed: 0,user_id,first_event_preprocess,last_event_preprocess
0,user1,2024-10-18 13:14:06.727302,2025-01-23 13:14:06.726288
1,user10,2024-09-01 13:14:06.727302,2024-12-21 13:14:06.727302
2,user100,2024-09-08 13:14:06.727302,2025-01-03 13:14:06.727302
3,user1000,2024-09-29 13:14:06.727302,2024-09-29 13:14:06.727302
4,user101,2025-01-01 13:14:06.727302,2025-02-03 13:14:06.726288
...,...,...,...
984,user995,2024-10-18 13:14:06.727302,2025-02-14 13:14:06.726288
985,user996,2024-09-28 13:14:06.727302,2025-02-15 13:14:06.726288
986,user997,2024-10-16 13:14:06.727302,2025-01-22 13:14:06.727302
987,user998,2024-09-13 13:14:06.727302,2025-01-20 13:14:06.727302


In [18]:
# Put the raw per-user boundaries next to the boundaries of the added rows.
# validate='one_to_one' makes pandas raise if either side has duplicate user ids.
merged_user_info = pd.merge(
    user_info,
    user_preprocess_info,
    on='user_id',
    validate='one_to_one',
)

# An inner join silently drops users that are missing from either side, which
# would let the comparisons below pass vacuously; require that nobody was lost.
assert len(merged_user_info) == len(user_info) == len(user_preprocess_info)

# The added rows must carry the timestamps of each user's first and last raw event.
assert (merged_user_info['first_event'] == merged_user_info['first_event_preprocess']).all()
assert (merged_user_info['last_event'] == merged_user_info['last_event_preprocess']).all()

## SplitSessionsPreprocessor test

In [15]:
# Regenerate the synthetic events so this section starts from a fresh frame.
data = generate_user_events_data(num_rows=5000)
# Mapping from schema role names (keys) to the matching column names of `data` (values).
cols_schema = {'user_id': 'user_id', 'event_timestamp': 'event_datetime', 'event_name': 'event'}
# Wrap the events in the project's EventFrame and preview the first rows.
ef = EventFrame(data, cols_schema)
ef.to_dataframe().head()


Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index
1357,user1,event2,2024-10-18 13:23:57.274336,city5,Android,1357,raw,2
4234,user1,event6,2024-10-20 13:23:57.274336,city5,Android,4234,raw,2
2760,user1,event2,2024-11-09 13:23:57.274336,city6,iOS,2760,raw,2
4934,user1,event9,2024-11-21 13:23:57.274336,city3,iOS,4934,raw,2
1049,user1,event10,2024-12-19 13:23:57.274336,city1,iOS,1049,raw,2


In [16]:
ss_preprocessor = SplitSessionsPreprocessor(timeout=(10 , 'D'))

In [17]:
ss_preprocessor.timeout

Timedelta('10 days 00:00:00')

In [18]:
# Run the session splitter on the event frame and materialise the result as a DataFrame.
# NOTE(review): cols_schema is passed here, but the other apply() calls in this notebook
# pass only the event frame — confirm whether the second argument is required.
pr_ef = ss_preprocessor.apply(ef, cols_schema)
pr_df = pr_ef.to_dataframe()

In [19]:
# Apply the same preprocessor to its own output; the result is compared with the
# single-application frame in the following cells (presumably an idempotence check).
pr_pr_ef = ss_preprocessor.apply(pr_ef)
pr_pr_df = pr_pr_ef.to_dataframe()

In [20]:
pr_df[pr_df['user_id'] == 'user1']

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index,session_id
1357,user1,session_start,2024-10-18 13:23:57.274336,city5,Android,user1_1session_start,session_start,1,user1_1
1357,user1,event2,2024-10-18 13:23:57.274336,city5,Android,1357,raw,2,user1_1
4234,user1,event6,2024-10-20 13:23:57.274336,city5,Android,4234,raw,2,user1_1
4234,user1,session_end,2024-10-20 13:23:57.274336,city5,Android,user1_1session_end,session_end,3,user1_1
2760,user1,session_start,2024-11-09 13:23:57.274336,city6,iOS,user1_2session_start,session_start,1,user1_2
2760,user1,event2,2024-11-09 13:23:57.274336,city6,iOS,2760,raw,2,user1_2
2760,user1,session_end,2024-11-09 13:23:57.274336,city6,iOS,user1_2session_end,session_end,3,user1_2
4934,user1,session_start,2024-11-21 13:23:57.274336,city3,iOS,user1_3session_start,session_start,1,user1_3
4934,user1,event9,2024-11-21 13:23:57.274336,city3,iOS,4934,raw,2,user1_3
4934,user1,session_end,2024-11-21 13:23:57.274336,city3,iOS,user1_3session_end,session_end,3,user1_3


In [21]:
pr_pr_df[pr_pr_df['user_id'] == 'user1']

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index,session_id
1357,user1,session_start,2024-10-18 13:23:57.274336,city5,Android,user1_1session_start,session_start,1,user1_1
1357,user1,event2,2024-10-18 13:23:57.274336,city5,Android,1357,raw,2,user1_1
4234,user1,event6,2024-10-20 13:23:57.274336,city5,Android,4234,raw,2,user1_1
4234,user1,session_end,2024-10-20 13:23:57.274336,city5,Android,user1_1session_end,session_end,3,user1_1
2760,user1,session_start,2024-11-09 13:23:57.274336,city6,iOS,user1_2session_start,session_start,1,user1_2
2760,user1,event2,2024-11-09 13:23:57.274336,city6,iOS,2760,raw,2,user1_2
2760,user1,session_end,2024-11-09 13:23:57.274336,city6,iOS,user1_2session_end,session_end,3,user1_2
4934,user1,session_start,2024-11-21 13:23:57.274336,city3,iOS,user1_3session_start,session_start,1,user1_3
4934,user1,event9,2024-11-21 13:23:57.274336,city3,iOS,4934,raw,2,user1_3
4934,user1,session_end,2024-11-21 13:23:57.274336,city3,iOS,user1_3session_end,session_end,3,user1_3


In [17]:
# Small fixture (5 users, 50 rows) ordered by user and then by event time.
data = (
    generate_user_events_data(num_users=5, num_rows=50)
    .sort_values(by=['user_id', 'event_datetime'])
)

In [18]:
data.head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system
33,user1,event9,2024-08-29 11:43:15.248778,city5,Android
45,user1,event10,2024-10-05 11:43:15.248778,city3,iOS
24,user1,event2,2024-10-15 11:43:15.248778,city4,Android
18,user1,event4,2024-11-05 11:43:15.247778,city3,Android
38,user1,event3,2024-12-07 11:43:15.247778,city2,Android


## AddCohortsPreprocessor test

In [5]:
# Regenerate the synthetic events so this section starts from a fresh frame.
data = generate_user_events_data(num_rows=5000)
# Mapping from schema role names (keys) to the matching column names of `data` (values).
cols_schema = {'user_id': 'user_id', 'event_timestamp': 'event_datetime', 'event_name': 'event'}
# Wrap the events in the project's EventFrame and preview the first rows.
ef = EventFrame(data, cols_schema)
ef.to_dataframe().head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index
1357,user1,event2,2024-10-18 13:37:59.579628,city5,Android,1357,raw,2
4234,user1,event6,2024-10-20 13:37:59.579628,city5,Android,4234,raw,2
2760,user1,event2,2024-11-09 13:37:59.579628,city6,iOS,2760,raw,2
4934,user1,event9,2024-11-21 13:37:59.579628,city3,iOS,4934,raw,2
1049,user1,event10,2024-12-19 13:37:59.579628,city1,iOS,1049,raw,2


In [19]:
# Build the cohort preprocessor with the 'Y' setting.
# NOTE(review): 'Y' presumably selects yearly cohorts — confirm against AddCohortsPreprocessor.
cohorts_preprocessor = AddCohortsPreprocessor('Y')
cohorts_ef = cohorts_preprocessor.apply(ef)
# Order the result by user and event time so each user's rows read chronologically.
cohorts_df = cohorts_ef.to_dataframe().sort_values(['user_id', 'event_datetime'])

In [20]:
cohorts_df.head()

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index,cohort_time_unit,cohort_group,cohort_period
1357,user1,event2,2024-10-18 13:37:59.579628,city5,Android,1357,raw,2,2024-01-01,2024-01-01,0
4234,user1,event6,2024-10-20 13:37:59.579628,city5,Android,4234,raw,2,2024-01-01,2024-01-01,0
2760,user1,event2,2024-11-09 13:37:59.579628,city6,iOS,2760,raw,2,2024-01-01,2024-01-01,0
4934,user1,event9,2024-11-21 13:37:59.579628,city3,iOS,4934,raw,2,2024-01-01,2024-01-01,0
1049,user1,event10,2024-12-19 13:37:59.579628,city1,iOS,1049,raw,2,2024-01-01,2024-01-01,0


In [21]:
cohorts_df[cohorts_df['user_id'] == 'user10']

Unnamed: 0,user_id,event,event_datetime,city,operation_system,event_id,event_type,event_type_index,cohort_time_unit,cohort_group,cohort_period
3029,user10,event6,2024-09-01 13:37:59.579628,city6,iOS,3029,raw,2,2024-01-01,2024-01-01,0
995,user10,event3,2024-10-25 13:37:59.579628,city5,iOS,995,raw,2,2024-01-01,2024-01-01,0
3691,user10,event2,2024-11-09 13:37:59.579628,city6,iOS,3691,raw,2,2024-01-01,2024-01-01,0
1372,user10,event5,2024-11-21 13:37:59.579628,city4,Android,1372,raw,2,2024-01-01,2024-01-01,0
2708,user10,event10,2024-12-01 13:37:59.579628,city10,Android,2708,raw,2,2024-01-01,2024-01-01,0
3467,user10,event1,2024-12-21 13:37:59.579628,city6,Android,3467,raw,2,2024-01-01,2024-01-01,0
