<div style="color:#007acc; text-align:center; font-size:32px; font-weight:bold; font-family:Arial, sans-serif;">
  Explore Data Analysis and Transform Register Log Data
</div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns',None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
df=pd.read_csv("../ingest_from_ITS/data/m952_register_2025-05-01_00-00-00_to_2025-06-26_00-00-00.csv") 
df

Unnamed: 0,@timestamp,user.id
0,2025-05-01T00:00:28.354Z,2505010700283540
1,2025-05-01T00:00:34.855Z,2505010700348550
2,2025-05-01T00:02:24.708Z,2505010702247086
3,2025-05-01T00:03:02.080Z,2505010703020807
4,2025-05-01T00:03:08.093Z,2505010703080934
...,...,...
127964,2025-06-25T23:56:39.050Z,2506260656390508
127965,2025-06-25T23:57:20.392Z,2506260657203920
127966,2025-06-25T23:58:13.088Z,2506260658130886
127967,2025-06-25T23:59:10.941Z,2506260659109416


In [3]:
df['user.id']=df['user.id'].astype(str)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127969 entries, 0 to 127968
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   @timestamp  127969 non-null  object
 1   user.id     127969 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [5]:
df.describe()

Unnamed: 0,@timestamp,user.id
count,127969,127969
unique,117929,119109
top,2025-05-19T21:24:08.235Z,2505151818390010
freq,8,4


In [6]:
df.isnull().sum()

@timestamp    0
user.id       0
dtype: int64

In [7]:
null_ratio = df.isnull().mean()
cols_to_drop = null_ratio[null_ratio > 0.95].index
print("column deleted:", list(cols_to_drop))
df = df.drop(columns=cols_to_drop)
df.shape

column deleted: []


(127969, 2)

In [8]:
df['@timestamp'] = pd.to_datetime(df['@timestamp'], errors='coerce', utc=True)
df.shape

(127969, 2)

In [9]:
len(df['user.id'].unique())

119109

In [10]:
df=df.rename(columns={'user.id':'vopenid'})

#### **Create features**

In [11]:
data=df
# now = pd.Timestamp.now(tz='UTC')
now = pd.Timestamp("2025-06-26 00:00:00", tz="UTC")

In [12]:
def last_update_time(group):
    last_time = group['@timestamp'].max()
    return {
        'last_update_time': last_time
    }

In [13]:
def register_days_since(group):
    try:
        register_time = group['@timestamp'].min()
        days_since_register = (now - register_time).days
        return {
            'days_since_register': days_since_register
        }
    except Exception as e:
        print(f"Error: {e}")
        return {
            'days_since_register': None
        }


In [14]:
features_function=[
   last_update_time,
   register_days_since,
]

In [15]:
def extract_group_features(group):
    result={}
    for func in features_function:
        try:
            result.update(func(group))
        except Exception as e:
            print(f"Error at feature: {func}. Error: {e}")
    return pd.Series(result)

df_res = data.groupby('vopenid').apply(extract_group_features).reset_index()
df_res

  df_res = data.groupby('vopenid').apply(extract_group_features).reset_index()


Unnamed: 0,vopenid,last_update_time,days_since_register
0,2505010700283540,2025-05-01 00:00:28.354000+00:00,55
1,2505010700348550,2025-05-01 00:00:34.855000+00:00,55
2,2505010702247086,2025-05-01 00:02:24.708000+00:00,55
3,2505010703020807,2025-05-01 00:03:02.080000+00:00,55
4,2505010703080934,2025-05-01 00:03:08.093000+00:00,55
...,...,...,...
119104,2506260656390508,2025-06-25 23:56:39.050000+00:00,0
119105,2506260657203920,2025-06-25 23:57:20.392000+00:00,0
119106,2506260658130886,2025-06-25 23:58:13.088000+00:00,0
119107,2506260659109416,2025-06-25 23:59:10.941000+00:00,0


In [16]:
df_res=df_res[df_res['days_since_register']>=7]

In [17]:
df_res.to_csv("data/register_transform.csv",index=False,header=True)

In [18]:
df_res.shape

(99270, 3)