In [1]:
import pandas as pd
# from retentioneering import datasets
from typing import Literal, Union, List, Optional, Iterable, get_args, Dict, Tuple, Callable
import numpy as np 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
from abc import ABC, abstractmethod

In [2]:
import sys

sys.path.append('d:\\diplom\\product\\insight_pulse')

for path in sys.path:
    print(path)

D:\pythonProg\Python\Python312\python312.zip
D:\pythonProg\Python\Python312\DLLs
D:\pythonProg\Python\Python312\Lib
D:\pythonProg\Python\Python312
d:\diplom\.venv

d:\diplom\.venv\Lib\site-packages
d:\diplom\.venv\Lib\site-packages\win32
d:\diplom\.venv\Lib\site-packages\win32\lib
d:\diplom\.venv\Lib\site-packages\Pythonwin
d:\diplom\product\insight_pulse


In [3]:
from eventframing.eventframe import EventFrame
from eventframing.cols_schema import EventFrameColsSchema
from eventframing.event_type import EventType

from data_preprocessing.data_preprocessor import DataPreprocessor
from data_preprocessing.preprocessors_lib.add_start_end_events import AddStartEndEventsPreprocessor
from data_preprocessing.preprocessors_lib.split_sessions import SplitSessionsPreprocessor

from utils.time_units import TimeUnits

In [4]:
def create_test_data():
    data = pd.DataFrame({
        'user_id': [1, 1, 1, 2, 2, 3, 3, 3, 4],
        'event_name': ['A', 'B', 'C', 'A', 'C', 'A', 'B', 'C', 'A'],
        'event_timestamp': pd.to_datetime([
            '2023-01-01 10:00', '2023-01-01 10:01', '2023-01-10 10:02',
            '2023-01-01 10:00', '2023-01-01 10:05',
            '2023-01-01 10:00', '2023-01-01 10:01', '2023-01-01 10:02',
            '2023-01-01 10:10'
        ])
    })
    cols_schema = EventFrameColsSchema({
        'user_id': 'user_id',
        'event_name': 'event_name',
        'event_timestamp': 'event_timestamp'
    })
    return data, cols_schema

ef = EventFrame(*create_test_data())

In [5]:
class _Metric(ABC):
    def __init__(self, formula: Callable, name: str, description: str = None):
        self.formula = formula
        self.name = name
        self.description = description

    
    @staticmethod    
    def get_unique_combinations(data: pd.DataFrame, hue_cols: Union[str, List[str]]) -> List[Dict]:
        """
        Возвращает список всех комбинаций уникальных значений полей hue_cols в наборе данных data.
        
        :param data: pd.DataFrame — входные данные.
        :param hue_cols: Union[str, List[str]] — имя колонки или список имен колонок.
        :return: List[Dict] — список комбинаций уникальных значений полей.
        """
        # Если hue_cols - это строка, преобразуем его в список
        if isinstance(hue_cols, str):
            hue_cols = [hue_cols]

        # Проверим, что все столбцы существуют в DataFrame
        for col in hue_cols:
            if col not in data.columns:
                raise ValueError(f"Column '{col}' does not exist in the DataFrame.")

        # Получаем уникальные значения для каждого из hue_cols
        unique_values = [data[col].unique() for col in hue_cols]

        # Генерируем все комбинации уникальных значений
        combinations = list(product(*unique_values))

        # Создаем результат в виде списка словарей
        result = [{hue_cols[i]: combo[i] for i in range(len(hue_cols))} for combo in combinations]

        return result
    
    @staticmethod    
    def filter_data_frame( data: pd.DataFrame, hue_cols_combo: Dict) -> pd.DataFrame:
        """
        Фильтрует DataFrame по комбинациям уникальных значений hue_cols.
        
        :param data: pd.DataFrame — входные данные.
        :param hue_cols_combos: List[Dict] — список комбинаций уникальных значений hue_cols.
        :return: pd.DataFrame — отфильтрованный DataFrame.
        """
        query = ''
        for col, col_value in hue_cols_combo.items():
            if isinstance(col_value, str):
                query += f"{col} == '{col_value}' & "
            else:
                query += f"{col} == {col_value} & "
        query = query[:-3]
        print(query)
        return data.query(query)
    
    @staticmethod
    def _get_data_and_cols_schema(data: Union[pd.DataFrame, 'EventFrame'],
                            cols_schema: Union[Dict[str, str], 'EventFrameColsSchema']) -> Tuple[pd.DataFrame, EventFrameColsSchema]:
        if isinstance(data, EventFrame):
            return data.data.copy(), data.cols_schema
        elif cols_schema is None:
            raise ValueError("cols_schema is None")
        else:
            return data.copy(), EventFrameColsSchema(cols_schema)
        

In [12]:
class MetricKPI(_Metric):
    def __init__(self, formula: Callable[[Union[pd.DataFrame, EventFrame], Optional[EventFrameColsSchema], dict], float], 
                 name: str, description: str):
        super().__init__(formula, name, description)

    def compute_single_value(self, data: Union[pd.DataFrame, 'EventFrame'], cols_schema: Optional[EventFrameColsSchema] = None, **kwargs) -> float:
        data, cols_schema = super()._get_data_and_cols_schema(data, cols_schema)
        print(data)
        return self.formula(data, **kwargs)
    
    def compute_splitted_values(self, data: Union[pd.DataFrame, 'EventFrame'],  hue_cols: Union[str, List[str]], 
                                cols_schema: Optional[EventFrameColsSchema] = None, **kwargs) -> pd.DataFrame:
        data, cols_schema = super()._get_data_and_cols_schema(data, cols_schema)

        if len(hue_cols) == 0 or hue_cols is None:
            return self.compute_single_value(data, cols_schema, **kwargs)
            
        combinations = self.get_unique_combinations(data, hue_cols)
        result = []
        for combo in combinations:
            print(combo)
            combo_desc = combo.copy()
            combo_desc.update({self.name: self.formula(self.filter_data_frame(data, combo), cols_schema, **kwargs)})
            result.append(combo_desc)
            
        return pd.DataFrame(result)
        

In [25]:
data = pd.DataFrame({
        'user_id': [1, 2, 1, 3, 2, 1],
        'action': ['login', 'login', 'login', 'signup', 'login', 'logout'],
        # 'retained': [True, False, True, False, True, False],
        'retained': [True, True, True, True, True, False],
        'converted': [True, False, True, False, True, False]
    })

print(data)

dau_value = MetricKPI(dau_formula, 'DAU', 'Действие пользователя')
# dau_value.compute_single_value(data, action_to_detect_activity='login')
cols_schema={'event_timestamp': 'event_date', 'user_id': 'user_id', 'event_name': 'event'}
dau_value.compute_splitted_values(data, cols_schema=cols_schema, hue_cols=['retained', 'converted'], action_to_detect_activity='login')

   user_id  action  retained  converted
0        1   login      True       True
1        2   login      True      False
2        1   login      True       True
3        3  signup      True      False
4        2   login      True       True
5        1  logout     False      False
{'retained': np.True_, 'converted': np.True_}
retained == True & converted == True
{'retained': np.True_, 'converted': np.False_}
retained == True & converted == False
{'retained': np.False_, 'converted': np.True_}
retained == False & converted == True
{'retained': np.False_, 'converted': np.False_}
retained == False & converted == False


Unnamed: 0,retained,converted,DAU
0,True,True,2
1,True,False,2
2,False,True,0
3,False,False,1


In [16]:


class MetricDinamic(_Metric):
    def __init__(self, formula: Callable[[Union[pd.DataFrame, EventFrame], Optional[EventFrameColsSchema], dict], float], 
                 name: str, description: str = ''):
        super().__init__(formula, name, description)

    def _get_data_pivot_template(self, data: pd.DataFrame, cols_schema: EventFrameColsSchema, 
                                 period: TimeUnitPeriod, hue_cols: List[str]) -> pd.DataFrame:
        dt_col = cols_schema.event_timestamp
        min_date, max_date = data[dt_col].min(), data[dt_col].max()
        pivot_template = period.generte_monotic_time_range(min_date, max_date)
        if len(hue_cols) > 0:
            for col_name in hue_cols:
                col_values = data[col_name].unique()
                pivot_template = pd.merge(
                    pivot_template,
                    pd.DataFrame({col_name: col_values}),
                    how='cross'
                )
        return pivot_template

    def compute(self, data: Union[pd.DataFrame, 'EventFrame'],
                period: Union[str, TimeUnitPeriod] = 'D',
                hue_cols: Union[str, List[str]] = None, 
                cols_schema: Union[Dict[str, str], 'EventFrameColsSchema'] = None, 
                fillna_value: float = 0, **kwargs) -> pd.DataFrame:
        
        data, cols_schema = super()._get_data_and_cols_schema(data, cols_schema)

        if isinstance(period, str):
            period = TimeUnitPeriod(period)
        period_name = period.alias
        dt_col = cols_schema.event_timestamp
        data = period.add_period_col(data, dt_col, new_col_name=period_name)
    

        if  hue_cols is None or len(hue_cols) == 0:
            hue_cols = []
            result = data.groupby(period_name)\
                .apply(lambda data: self.formula(data, cols_schema, **kwargs), include_groups=False)\
                    .reset_index().rename(columns={0: self.name})
            
        else:
            if isinstance(hue_cols, str):
                hue_cols = [hue_cols]
            combinations = self.get_unique_combinations(data, hue_cols)
            result = None
            # result = pd.DataFrame(columns=[period_name] + hue_cols + [self.name])
            # return result
            for combo in combinations:
                print(combo)
                combo_result = self.filter_data_frame(data, combo).groupby(period_name)\
                    .apply(lambda data: self.formula(data, cols_schema, **kwargs),
                           include_groups=False)\
                        .reset_index().rename(columns={0: self.name})
                for col_name, col_value in combo.items():
                    combo_result[col_name] = [col_value] * combo_result.shape[0]
                if result is None:
                    result = combo_result.loc[:, tuple([period_name] + hue_cols + [self.name])]
                else:
                    result = pd.concat([result, combo_result.loc[:, tuple([period_name] + hue_cols + [self.name])]], axis=0)
            # return result.sort_values(period_name)
        pivot_template = self._get_data_pivot_template(data, cols_schema, period, hue_cols)
        # return data_date_range
        result = pd.merge(
            pivot_template,
            result,
            on=hue_cols + [period_name],
            how='left'
        )
        result[self.name] = result[self.name].fillna(fillna_value)
        return result.sort_values([period_name] + hue_cols) 
        
    

            

In [17]:
import datetime

# Создаем список пользователей
users = ['user1', 'user2', 'user3', 'user4', 'user5']

# Создаем список событий
events = ['event1', 'event2', 'event3', 'event4', 'event5']

# Создаем список дат
dates = [datetime.datetime.now() - datetime.timedelta(days=x) - datetime.timedelta(hours=x) for x in range(35)]

statuses = [1, 2, 3]
os = ['iOS', 'Android']
cities = ['chel', 'msk']

# Создаем DataFrame
df = pd.DataFrame({
    'user_id': np.random.choice(users, 50),
    'event': np.random.choice(events, 50),
    'event_date': np.random.choice(dates, 50),
    'status': np.random.choice(statuses, 50),
    'os': np.random.choice(os, 50),
    'city': np.random.choice(cities, 50)
})

df['date'] = pd.to_datetime(df['event_date'].dt.date)

df.sort_values(['date']).head(10)

Unnamed: 0,user_id,event,event_date,status,os,city,date
23,user2,event1,2025-01-03 22:43:02.612845,1,Android,chel,2025-01-03
12,user5,event1,2025-01-06 00:43:02.612845,3,iOS,msk,2025-01-06
49,user1,event5,2025-01-07 01:43:02.612845,1,iOS,msk,2025-01-07
33,user5,event2,2025-01-08 02:43:02.612845,1,Android,chel,2025-01-08
8,user4,event1,2025-01-08 02:43:02.612845,3,iOS,chel,2025-01-08
20,user5,event1,2025-01-08 02:43:02.612845,2,iOS,chel,2025-01-08
11,user5,event3,2025-01-08 02:43:02.612845,2,iOS,msk,2025-01-08
34,user4,event5,2025-01-09 03:43:02.612845,2,Android,msk,2025-01-09
7,user3,event3,2025-01-09 03:43:02.612845,1,iOS,msk,2025-01-09
19,user3,event4,2025-01-09 03:43:02.612845,3,Android,chel,2025-01-09


In [174]:
df[df['date'] == '2024-12-27']

Unnamed: 0,user_id,event,event_date,status,os,city,date
38,user3,event1,2024-12-27 23:54:26.792589,2,Android,chel,2024-12-27


In [23]:
def dau_formula(data, cols_schema=None, **kwargs):
    user_id_col = cols_schema.user_id
    return data[user_id_col].nunique()

md = MetricDinamic(formula=dau_formula, name='DAU')
cols_schema={'event_timestamp': 'event_date', 'user_id': 'user_id', 'event_name': 'event'}
md.compute(df, cols_schema=cols_schema, hue_cols=['status'], period='D')
# md._get_data_pivot_template(df, EventFrameColsSchema(cols_schema), period=TimeUnitPeriod('D'), hue_cols=[])

{'status': np.int64(1)}
status == 1
{'status': np.int64(2)}
status == 2
{'status': np.int64(3)}
status == 3


Unnamed: 0,date,status,DAU
0,2025-01-03,1,1.0
1,2025-01-03,2,0.0
2,2025-01-03,3,0.0
3,2025-01-04,1,0.0
4,2025-01-04,2,0.0
...,...,...,...
100,2025-02-05,2,1.0
101,2025-02-05,3,0.0
102,2025-02-06,1,0.0
103,2025-02-06,2,1.0


In [20]:
pivot_cols = ['status', 'os']
# pivot_cols = ['status']
concat_str = '____'

pivot_df = df.pivot_table(
    values='user_id', 
    index='date', 
    columns=pivot_cols,
    aggfunc='nunique'
).fillna(0)

df.dtypes
pivot_df.columns = [
    concat_str.join(list(map(str, col))).strip() for col in pivot_df.columns.values
]

pivot_df = pivot_df.reset_index()
pivot_df = pivot_df.melt(id_vars=['date'])
pivot_df[pivot_cols] = pivot_df['variable'].str.split(concat_str, expand=True)

pivot_df.sort_values(['date'] + pivot_cols)

Unnamed: 0,date,variable,value,status,os
0,2025-01-03,1____Android,1.0,1,Android
28,2025-01-03,1____iOS,0.0,1,iOS
56,2025-01-03,2____Android,0.0,2,Android
84,2025-01-03,2____iOS,0.0,2,iOS
112,2025-01-03,3____Android,0.0,3,Android
...,...,...,...,...,...
55,2025-02-06,1____iOS,0.0,1,iOS
83,2025-02-06,2____Android,1.0,2,Android
111,2025-02-06,2____iOS,0.0,2,iOS
139,2025-02-06,3____Android,1.0,3,Android


In [32]:
pivot_df = df.pivot_table(
    values='user_id', 
    index='date', 
    columns=None,
    aggfunc='nunique'
)

pivot_df.reset_index().head()
# pivot_df.reset_index().melt(id_vars=['date'])

Unnamed: 0,date,user_id
0,2024-12-28,2
1,2024-12-29,1
2,2024-12-30,1
3,2024-12-31,2
4,2025-01-01,2


In [34]:

pivot_df = df.pivot_table(
    values='user_id', 
    index='date', 
    columns='status',
    aggfunc='nunique'
)

# pivot_df.reset_index().head().sort_values(['date'])
pivot_df.reset_index().melt(id_vars=['date']).sort_values(['date']).head(15)

Unnamed: 0,date,status,value
0,2024-12-25,1,1.0
48,2024-12-25,3,
24,2024-12-25,2,
1,2024-12-26,1,
49,2024-12-26,3,
25,2024-12-26,2,1.0
2,2024-12-28,1,
50,2024-12-28,3,
26,2024-12-28,2,1.0
51,2024-12-29,3,1.0


In [8]:

class Metric:
    """
    Базовый класс для создания метрик.
    """
    def __init__(self, formula: Callable):
        self.formula = formula
    
    def compute(self, data: pd.DataFrame, **kwargs):
        """
        Вычисляет значение метрики на основе данных.
        
        :param data: pd.DataFrame — данные для вычисления метрики
        :param kwargs: Дополнительные аргументы для формулы
        :return: значение метрики
        """
        return self.formula(data, **kwargs)


class MetricLibrary:
    """
    Класс для хранения предопределенных метрик.
    """
    def __init__(self):
        self.metrics: Dict[str, Metric] = {}
        
        # Определяем предустановленные метрики
        self.metrics['DAU'] = Metric(self.compute_dau)
        self.metrics['WAU'] = Metric(self.compute_wau)
        self.metrics['MAU'] = Metric(self.compute_mau)
        self.metrics['Retention'] = Metric(self.compute_retention)
        self.metrics['Conversion'] = Metric(self.compute_conversion)
        
    def compute_dau(data: pd.DataFrame, action_to_detect_activity:str = 'login'):
        # Логика для расчета DAU
        return data[data['action'] == action_to_detect_activity]['user_id'].nunique()

    def compute_wau(self, data: pd.DataFrame, **kwargs):
        # Логика для расчета WAU
        return data[data['action'] == 'login']['user_id'].nunique()
    
    def compute_mau(self, data: pd.DataFrame, **kwargs):
        # Логика для расчета MAU
        return data[data['action'] == 'login']['user_id'].nunique()

    def compute_retention(self, data: pd.DataFrame, **kwargs):
        # Логика для расчета Retention
        return data[data['retained'] == True]['user_id'].nunique()

    def compute_conversion(self, data: pd.DataFrame, **kwargs):
        # Логика для расчета Conversion Rate
        total_users = data['user_id'].nunique()
        converted_users = data[data['converted'] == True]['user_id'].nunique()
        return converted_users / total_users if total_users > 0 else 0
    
    def add_custom_metric(self, name: str, formula: Callable):
        """
        Добавляет пользовательскую метрику в библиотеку.

        :param name: str — имя пользовательской метрики
        :param formula: Callable — функция для вычисления метрики
        """
        self.metrics[name] = Metric(formula)



# Создаем пример DataFrame
data = pd.DataFrame({
    'user_id': [1, 2, 1, 3, 2, 1],
    'action': ['login', 'login', 'login', 'signup', 'login', 'logout'],
    'retained': [True, False, True, False, True, False],
    'converted': [True, False, True, False, True, False]
})

# # Создаем библиотеку метрик
# metric_library = MetricLibrary()

# # Вычисляем DAU
# dau_value = metric_library.metrics['DAU'].compute(data)
# print(f"DAU: {dau_value}")

# # Вычисляем WAU
# wau_value = metric_library.metrics['WAU'].compute(data)
# print(f"WAU: {wau_value}")

# # Вычисляем собственную метрику
# custom_formula = lambda df, **kwargs: df['user_id'].nunique() * 2  # Пример пользовательской формулы
# metric_library.add_custom_metric("CustomMetric", custom_formula)
# custom_value = metric_library.metrics['CustomMetric'].compute(data)
# print(f"CustomMetric: {custom_value}")

In [15]:
class TimeUnitPeriod:
    def __init__(self, time_unit: str):
        self.time_unit = time_unit
        self.alias = self.get_period_alias()
        self.russian_alias = self.get_period_russian_alias()
        # self.period_compute_func = self.get_period_compute_func()

    def get_period_alias(self) -> str:
        alias_mapping = {
            "Y": "Year",
            "M": "Month",
            "W": "Week",
            "D": "date",
            "h": "Hour",
            "m": "Minute",
            "s": "Second",
            "ms": "Millisecond"
        }
        return alias_mapping.get(self.time_unit, "Unknown")

    def get_period_russian_alias(self) -> str:
        russian_alias_mapping = {
            "Y": "Год",
            "M": "Месяц",
            "W": "Неделя",
            "D": "День",
            "h": "Час",
            "m": "Минуты",
            "s": "Секунда",
            "ms": "Миллисекунда"
        }
        return russian_alias_mapping.get(self.time_unit, "Неизвестно")   

    def add_period_col(self, data: pd.DataFrame, dt_col: str, new_col_name: Optional[str] = None) -> pd.DataFrame:
        data = data.copy()
        if new_col_name is None or new_col_name == '':
            new_col_name = 'time_unit_period'
        
        if self.time_unit == 'D':
            data[new_col_name] = data[dt_col].dt.date
        elif self.time_unit == 'W':
            data[new_col_name] = data[dt_col] - pd.to_timedelta(data[dt_col].dt.weekday, unit='D')
        elif self.time_unit == 'M':
            data[new_col_name] = data[dt_col].apply(lambda time: time.strftime('%Y-%m'))
        elif self.time_unit == 'Y':
            data[new_col_name] = data[dt_col].dt.year
        elif self.time_unit == 'h':
            data[new_col_name] = data[dt_col].dt.floor('H')  # Округляем до часа
        elif self.time_unit == 'm':
            data[new_col_name] = data[dt_col].dt.floor('min')  # Округляем до минуты
        elif self.time_unit == 's':
            data[new_col_name] = data[dt_col].dt.floor('s')  # Округляем до секунды
        elif self.time_unit == 'ms':
            data[new_col_name] = data[dt_col].dt.floor('L')  # Округляем до миллисекунды
        else:
            raise ValueError(f'Unsupported time unit: {self.time_unit}')
        
        if self.time_unit not in ('Y', 'M'):
            data[new_col_name] = pd.to_datetime(data[new_col_name])
        
        return data
    
    def generte_monotic_time_range(self, min_date: str, max_date: str) -> pd.Series:
        """
        Генерирует непрерывную последовательность pd.Timestamp в зависимости от self.time_unit.

        :param min_date: str — минимальная дата в формате 'YYYY-MM-DD'.
        :param max_date: str — максимальная дата в формате 'YYYY-MM-DD'.
        :return: pd.Series — последовательность дат.
        """
        
        if self.time_unit == 'Y':
            min_date, max_date = pd.Series([min_date, max_date]).dt.floor('Y')
            return pd.date_range(start=min_date, end=max_date, freq='YS').to_series().dt.year.to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 'M':
            min_date, max_date = pd.Series([min_date, max_date]).apply(lambda time: time.strftime('%Y-%m'))
            return pd.date_range(start=min_date, end=max_date, freq='MS').to_series().apply(lambda time: time.strftime('%Y-%m')).to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 'W':
            range_dates = pd.Series([min_date, max_date])
            range_dates = pd.to_datetime(range_dates.dt.date) - pd.to_timedelta(range_dates.dt.weekday, unit='D')
            print(range_dates)
            min_date, max_date = range_dates
            return pd.date_range(start=min_date, end=max_date, freq='W-MON').to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 'D':
            min_date, max_date = pd.Series([min_date, max_date]).dt.floor('D')
            return pd.date_range(start=min_date, end=max_date, freq='D').to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 'h':
            min_date, max_date = pd.Series([min_date, max_date]).dt.floor('h')
            return pd.date_range(start=min_date, end=max_date, freq='h').to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 'm':
            min_date, max_date = pd.Series([min_date, max_date]).dt.floor('min')
            return pd.date_range(start=min_date, end=max_date, freq='min').to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 's':
            min_date, max_date = pd.Series([min_date, max_date]).dt.floor('s')
            return pd.date_range(start=min_date, end=max_date, freq='s').to_frame().rename(columns={0: self.alias})
        elif self.time_unit == 'ms':
            min_date, max_date = pd.Series([min_date, max_date]).dt.floor('L')
            return pd.date_range(start=min_date, end=max_date, freq='L').to_frame().rename(columns={0: self.alias})
        else:
            raise ValueError(f'Unsupported time unit: {self.time_unit}')

In [104]:
# a, b = pd.Series([df['event_date'].min(), df['event_date'].max()]).dt.floor('W')
# a, b
print(df['event_date'].min(), df['event_date'].max())
TimeUnitPeriod('s').generte_monotic_time_range(df['event_date'].min(), df['event_date'].max())

2024-12-27 23:54:26.792589 2025-02-01 09:54:26.792589


Unnamed: 0,Second
2024-12-27 23:54:26,2024-12-27 23:54:26
2024-12-27 23:54:27,2024-12-27 23:54:27
2024-12-27 23:54:28,2024-12-27 23:54:28
2024-12-27 23:54:29,2024-12-27 23:54:29
2024-12-27 23:54:30,2024-12-27 23:54:30
...,...
2025-02-01 09:54:22,2025-02-01 09:54:22
2025-02-01 09:54:23,2025-02-01 09:54:23
2025-02-01 09:54:24,2025-02-01 09:54:24
2025-02-01 09:54:25,2025-02-01 09:54:25


In [81]:
pd.date_range('2020-01-01', '2022-01-01', freq='YE', inclusive='both')
# .to_frame().rename(columns={0: 'date'})

DatetimeIndex(['2020-12-31', '2021-12-31'], dtype='datetime64[ns]', freq='YE-DEC')

In [74]:
tup = TimeUnitPeriod('D')
print(tup.time_unit)
pd.merge(
    tup.generate_period('2020-01-01', '2020-02-20'),
    df.head(10),
    how='left',
    on='date'
)

# print(tup.period_alias)
# print(tup.period_russian_alias)
# tup_cf = tup.get_period_compute_func()
# df.sort_values(['date']).head(10)

D


Unnamed: 0,date,user_id,event,event_date,status,os,city
0,2020-01-01,,,NaT,,,
1,2020-01-02,,,NaT,,,
2,2020-01-03,,,NaT,,,
3,2020-01-04,,,NaT,,,
4,2020-01-05,,,NaT,,,
5,2020-01-06,,,NaT,,,
6,2020-01-07,,,NaT,,,
7,2020-01-08,,,NaT,,,
8,2020-01-09,,,NaT,,,
9,2020-01-10,,,NaT,,,


In [71]:
tup.compute_period(df, 'event_date')

func works


  data[new_col_name] = data[dt_col].dt.floor('S')  # Округляем до секунды


Unnamed: 0,user_id,event,event_date,status,os,city,date,time_unit_period
0,user1,event4,2025-01-01 20:22:10.765985,2,Android,chel,2025-01-01,2025-01-01 20:22:10
1,user4,event4,2025-01-19 13:22:10.765985,2,iOS,msk,2025-01-19,2025-01-19 13:22:10
2,user5,event1,2025-01-10 04:22:10.765985,1,iOS,chel,2025-01-10,2025-01-10 04:22:10
3,user3,event1,2025-01-02 21:22:10.765985,3,Android,chel,2025-01-02,2025-01-02 21:22:10
4,user3,event3,2025-01-13 07:22:10.765985,3,Android,msk,2025-01-13,2025-01-13 07:22:10
5,user3,event1,2024-12-30 18:22:10.765985,3,iOS,chel,2024-12-30,2024-12-30 18:22:10
6,user1,event1,2025-01-03 22:22:10.765985,2,iOS,msk,2025-01-03,2025-01-03 22:22:10
7,user5,event3,2025-01-04 23:22:10.765985,2,iOS,chel,2025-01-04,2025-01-04 23:22:10
8,user4,event4,2025-01-12 06:22:10.765985,2,iOS,chel,2025-01-12,2025-01-12 06:22:10
9,user3,event5,2025-01-10 04:22:10.765985,1,iOS,msk,2025-01-10,2025-01-10 04:22:10
