In [1]:
import os

import pandas as pd

# GA Hits
- session_id — ID визита;
- hit_date — дата события;
- hit_time — время события;
- hit_number — порядковый номер события в рамках сессии, последовательность действий;
- hit_type — тип события;
- hit_referer — источник события, с какой страницы перешел пользователь;
- hit_page_path — страница события, на которой произошло событие;
- event_category — тип действия;
- event_action — действие;
- event_label — тег действия;
- event_value — значение результата действия, обычно число.

In [2]:
# Location of the raw GA hits export. It can be overridden through the
# GA_HITS_CSV environment variable so the notebook is not tied to a single
# machine; the original local path is kept as the default.
GA_HITS_CSV = os.environ.get(
    "GA_HITS_CSV",
    "C:/Users/Ekaterina/sber_de/de_for_sber/data/raw/ga_hits.csv",
)
df = pd.read_csv(GA_HITS_CSV)

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [4]:
# Number of hit rows per session, most frequent sessions first.
df.loc[:, 'session_id'].value_counts()


session_id
5442565791571325612.1632449195.1632449195    768
6568868914238486437.1632270313.1632270313    678
5959671972744778783.1632490527.1632490600    548
7452598043578978502.1632358598.1632358598    514
3070792010704358528.1629752408.1629752408    498
                                            ... 
4570994095441102117.1630324006.1630324006      1
768506153535315810.1630314342.1630314342       1
5517253735248338264.1630315080.1630315080      1
4054272189286640867.1630334209.1630334209      1
5381267721977892188.1640271196.1640271196      1
Name: count, Length: 1734610, dtype: int64

In [5]:
# Helper that converts a millisecond value into a time string.
def convert_milliseconds_to_time(ms) -> str:
    """Format a duration given in milliseconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        ms: Duration in milliseconds (int or float). Missing values
            (anything ``pd.isna`` reports as NA) are accepted.

    Returns:
        The duration floored to whole seconds and rendered as ``HH:MM:SS``.
        Hours are not wrapped at 24, so large inputs give more than two
        hour digits. Missing input yields ``"00:00:00"``.
    """
    if pd.isna(ms):
        # Missing values are replaced with a default time.
        return "00:00:00"
    whole_seconds = ms // 1000
    hh, leftover = divmod(whole_seconds, 3600)
    mm, ss = divmod(leftover, 60)
    return ":".join(f"{int(part):02}" for part in (hh, mm, ss))

# Convert the hit_time column element by element; NaN entries are handled
# inside convert_milliseconds_to_time.
df['hit_time'] = df['hit_time'].map(convert_milliseconds_to_time)




In [6]:
# Parse the hit_date strings (YYYY-MM-DD) into pandas datetimes.
df['hit_date'] = pd.to_datetime(arg=df['hit_date'], format='%Y-%m-%d')

In [8]:
# Keep existing referers and substitute the 'unknown' placeholder where the value is missing.
df['hit_referer'] = df['hit_referer'].where(df['hit_referer'].notna(), other='unknown')

In [9]:
# Keep existing labels and substitute the 'unknown' placeholder where the value is missing.
df['event_label'] = df['event_label'].where(df['event_label'].notna(), other='unknown')

In [10]:
# Keep existing event values and substitute 0 where the value is missing.
df['event_value'] = df['event_value'].where(df['event_value'].notna(), other=0)

In [12]:
print(df.isnull().sum())  # Shows the number of actual NaN values in each column


session_id        0
hit_date          0
hit_time          0
hit_number        0
hit_type          0
hit_referer       0
hit_page_path     0
event_category    0
event_action      0
event_label       0
event_value       0
dtype: int64


In [13]:
# Use session_id as the row index; done in place, so df itself is modified.
df.set_index(keys='session_id', inplace=True)


In [16]:
# Preview the first 20 rows.
# NOTE(review): the recorded output below shows session_id as a regular column,
# so it was presumably produced after the reset_index cell had run — confirm.
df.iloc[:20]

Unnamed: 0,session_id,hit_date,hit_time,hit_number,hit_type,hit_referer,hit_page_path,event_category,event_action,event_label,event_value
0,5639623078712724064.1640254056.1640254056,2021-12-23,00:09:57,30,event,unknown,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show,unknown,0.0
1,7750352294969115059.1640271109.1640271109,2021-12-23,00:09:57,41,event,unknown,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show,unknown,0.0
2,885342191847998240.1640235807.1640235807,2021-12-23,00:13:16,49,event,unknown,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show,unknown,0.0
3,142526202120934167.1640211014.1640211014,2021-12-23,00:15:34,46,event,unknown,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show,unknown,0.0
4,3450086108837475701.1640265078.1640265078,2021-12-23,00:12:48,79,event,unknown,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show,unknown,0.0
5,6466333295973247896.1640246168.1640246168,2021-12-23,00:11:44,41,event,unknown,sberauto.com/cars?utm_source_initial=youtube&u...,quiz,quiz_show,unknown,0.0
6,281215510786615563.1640245516.1640245516,2021-12-23,00:15:11,102,event,unknown,sberauto.com/cars/toyota?isnew=false&rental_ca...,quiz,quiz_show,unknown,0.0
7,4024492994895054107.1640269084.1640269084,2021-12-23,00:17:51,85,event,unknown,sberauto.com/cars/all/mercedes-benz/glc/f8f330...,quiz,quiz_show,unknown,0.0
8,555009234841130092.1640256620.1640256620,2021-12-23,00:16:53,101,event,VloVXNWduHeTjUoDkjkO,sberauto.com/cars/all/kia/sorento/c38179cb?utm...,quiz,quiz_show,unknown,0.0
9,2692901778487480807.1640206845.1640206845,2021-12-23,00:00:00,1,event,unknown,sberauto.com/cars/all/nissan/x-trail/0744675f?...,card_web,view_card,unknown,0.0


In [15]:
# Reset the index and add it back as a regular column
df.reset_index(drop=False, inplace=True)
