In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Notebook display settings: show full cell text, show every column, and
# render floats with two decimal places.
for _option, _value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.float_format', lambda x: '%.2f' % x),
):
    pd.set_option(_option, _value)

In [38]:
# Load only the columns needed for the login features.
# `user.id` is read as text on purpose: with type inference the parser decides the
# dtype chunk by chunk, so the same id can come back as a float in one chunk and as
# a string in another, and later `astype(str)` would turn those into different keys.
# NOTE(review): the remaining columns are still inferred and are coerced to numeric /
# datetime further down; confirm downstream consumers expect the textual id form.
df = pd.read_csv(
    "../ingest_from_ITS/data/m952_login_2025-05-15_00-00-00_to_2025-06-15_23-59-59.csv",
    usecols=[
        "@timestamp", "user.id", "event.action", "event.its.properties.gold",
        "event.its.properties.diamond", "event.its.properties.power_point",
        "event.its.properties.level", "event.its.properties.vip_level",
        "event.its.properties.dragon_gold",
    ],
    dtype={"user.id": str},
)
df.shape

  df=pd.read_csv("../ingest_from_ITS/data/m952_login_2025-05-15_00-00-00_to_2025-06-15_23-59-59.csv",usecols=[


(14710000, 9)

In [39]:
# Keep only the events that carry a user id and renumber the rows from 0.
df = df.dropna(subset=['user.id']).reset_index(drop=True)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271288 entries, 0 to 5271287
Data columns (total 9 columns):
 #   Column                            Dtype 
---  ------                            ----- 
 0   @timestamp                        object
 1   event.action                      object
 2   event.its.properties.dragon_gold  object
 3   event.its.properties.gold         object
 4   event.its.properties.power_point  object
 5   event.its.properties.level        object
 6   event.its.properties.diamond      object
 7   event.its.properties.vip_level    object
 8   user.id                           object
dtypes: object(9)
memory usage: 362.0+ MB


In [41]:
df.head(3)

Unnamed: 0,@timestamp,event.action,event.its.properties.dragon_gold,event.its.properties.gold,event.its.properties.power_point,event.its.properties.level,event.its.properties.diamond,event.its.properties.vip_level,user.id
0,2025-05-14T17:00:00.000Z,its_login,171.0,3752910.0,1425822.0,47.0,390.0,3.0,2504191703569743.0
1,2025-05-14T17:00:00.000Z,its_login,0.0,287350.0,36020.0,20.0,0.0,0.0,2505062131237087.0
2,2025-05-14T17:00:00.000Z,its_login,592.0,7324129.0,1489372.0,51.0,323.0,3.0,2504102040141995.0


In [42]:
# Parse the timestamp column into timezone-aware (UTC) datetimes; values that cannot
# be parsed become NaT because of errors='coerce'.
df['@timestamp'] = pd.to_datetime(df['@timestamp'], errors='coerce', utc=True)

In [43]:
len(df['user.id'].unique())

186730

In [44]:
# Columns that must be numeric before any feature is computed.
numeric_fields = [
    "event.its.properties.vip_level",
    "event.its.properties.dragon_gold",
    "event.its.properties.diamond",
    "event.its.properties.gold",
    "event.its.properties.power_point",
    "event.its.properties.level"
]

# Coerce each available column to numbers; anything unparseable or missing
# ends up as the sentinel value -1.0.
for col in numeric_fields:
    if col not in df.columns:
        continue
    parsed = pd.to_numeric(df[col], errors='coerce')
    df[col] = parsed.fillna(-1.0)

In [45]:
# Use `vopenid` as the user key. The id is normalised (string + surrounding whitespace
# removed) *before* any grouping, so ids that differ only by padding are aggregated
# together instead of producing separate feature rows that collide once stripped.
df = df.rename(columns={'user.id': 'vopenid'})
df['vopenid'] = df['vopenid'].astype(str).str.strip()

In [46]:
# Fixed reference instant (UTC); last_since_from_login counts the days elapsed
# between a user's latest login and this moment.
now = pd.Timestamp("2025-06-16 12:00:00", tz="UTC")

In [47]:
# Order all login events chronologically.
df=df.sort_values(by=['@timestamp'])

In [49]:
df['@timestamp'].min(),df['@timestamp'].max()

(Timestamp('2025-05-14 17:00:00+0000', tz='UTC'),
 Timestamp('2025-05-28 18:44:49+0000', tz='UTC'))

In [50]:
# `data` is a second name for the same DataFrame object as `df` (no copy is made).
data=df

In [51]:
# most recent login time of the user
def last_update_time(group):
    """Return the latest ``@timestamp`` found in ``group``.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``last_update_time``.
    """
    newest = group['@timestamp'].max()
    return dict(last_update_time=newest)

In [52]:
# number of distinct calendar days on which the user logged in
def num_day_login(group):
    """Count the distinct dates present in the ``@timestamp`` column of ``group``.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``num_day_login``.
    """
    login_dates = group['@timestamp'].dt.date
    return {"num_day_login": login_dates.nunique()}

In [53]:
# average number of login sessions per active day
def num_session_on_day_login(group):
    """Average the number of login rows per calendar day for one user.

    The per-day counts are computed from a derived date key; ``group`` itself is
    left untouched (no helper column is written into it), so other feature
    functions receive the frame exactly as it was passed in.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``avg_login_per_day`` (0 for an empty group).
    """
    if group.empty:
        return {
            'avg_login_per_day': 0
        }
    stamps = group['@timestamp']
    # Group on the date values directly instead of adding a 'date' column to `group`.
    daily_counts = stamps.groupby(stamps.dt.date).size()
    return {
        'avg_login_per_day': daily_counts.mean()
    }

In [54]:
# average time (in days) between consecutive logins of the user
def avg_between_login(group):
    """Compute the mean gap between successive logins, expressed in days.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``avg_between_login``; 0 when the group has
        fewer than two rows.
    """
    ordered = group['@timestamp'].sort_values()
    if len(ordered) >= 2:
        gaps = ordered.diff().dropna()
        # 86400 seconds per day
        mean_gap_days = gaps.mean().total_seconds() / 86400
    else:
        mean_gap_days = 0
    return {'avg_between_login': mean_gap_days}

In [55]:
# the longest run of consecutive days on which the user logged in
def streak_login(group):
    """Return the length of the longest streak of consecutive login days.

    Missing timestamps (NaT) are ignored. A group with no valid timestamp has no
    login day at all, so its streak is reported as 0 rather than 1.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``streak_login`` (a Python int).
    """
    stamps = group['@timestamp'].dropna()
    days = pd.to_datetime(stamps.dt.date).drop_duplicates().sort_values()
    if days.empty:
        return {
            'streak_login': 0
        }
    # A new run starts wherever the gap to the previous day is not exactly one day
    # (the first row has no predecessor, so it always opens a run).
    starts_new_run = days.diff().dt.days.ne(1)
    # cumsum gives every run its own id; the biggest id count is the longest streak.
    run_lengths = starts_new_run.cumsum().value_counts()
    return {
        'streak_login': int(run_lengths.max())
    }

In [56]:
# number of whole days since the user's last login
def last_since_from_login(group, reference_time=None):
    """Count the whole days between the user's latest login and a reference time.

    Args:
        group: DataFrame holding one user's login events.
        reference_time: instant to measure up to. When omitted, the notebook-level
            ``now`` timestamp is used, which keeps the original behaviour. A
            timezone-naive value is interpreted as UTC.

    Returns:
        dict with the single key ``last_since_from_login``.
    """
    if reference_time is None:
        reference_time = now
    reference_time = pd.Timestamp(reference_time)
    if reference_time.tzinfo is None:
        reference_time = reference_time.tz_localize('UTC')

    latest = group['@timestamp'].max()
    # Naive timestamps are treated as UTC so the subtraction below is well-defined.
    if latest.tzinfo is None:
        latest = latest.tz_localize('UTC')
    return {
        'last_since_from_login': (reference_time - latest).days
    }

In [57]:
# the hour of day at which the user logs in most often
def popular_time_login(group):
    """Return the most frequent login hour of one user.

    When several hours tie, the first value returned by ``mode()`` is used;
    ``None`` is returned when no mode exists.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``popular_time_login``.
    """
    hour_modes = group['@timestamp'].dt.hour.mode()
    if hour_modes.empty:
        return {'popular_time_login': None}
    return {'popular_time_login': hour_modes.iloc[0]}

In [21]:
# standard deviation of gold across the user's logins
def std_gold_login(group):
    """Return the standard deviation of the gold column for one user.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``gold_std_login``; 0.0 when the group has
        fewer than two rows.
    """
    gold = group.sort_values('@timestamp')['event.its.properties.gold']
    spread = gold.std() if len(gold) >= 2 else 0.0
    return {'gold_std_login': spread}

In [58]:
# standard deviation of diamond across the user's logins
def std_diamond_login(group):
    """Return the standard deviation of the diamond column for one user.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``diamond_std_login``; 0.0 when the group has
        fewer than two rows.
    """
    diamonds = group.sort_values('@timestamp')['event.its.properties.diamond']
    spread = diamonds.std() if len(diamonds) >= 2 else 0.0
    return {'diamond_std_login': spread}

In [59]:
# standard deviation of silkcoin (dragon_gold column) across the user's logins
def std_silkcoin_login(group):
    """Return the standard deviation of the dragon_gold column for one user.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``silkcoin_std_login``; 0.0 when the group has
        fewer than two rows.
    """
    silkcoin = group.sort_values('@timestamp')['event.its.properties.dragon_gold']
    spread = silkcoin.std() if len(silkcoin) >= 2 else 0.0
    return {'silkcoin_std_login': spread}

In [60]:
# power point recorded at the user's most recent login
def last_login_powerpoint(group):
    """Return the power_point value of the row with the latest ``@timestamp``.

    Args:
        group: DataFrame holding one user's login events (must not be empty).

    Returns:
        dict with the single key ``powerpoint_last_login``.
    """
    newest_first = group.sort_values('@timestamp', ascending=False)
    newest_row = newest_first.iloc[0]
    return {'powerpoint_last_login': newest_row['event.its.properties.power_point']}

In [61]:
# trend (least-squares slope) of gold over the user's successive logins
def slope_gold_login(group):
    """Fit a straight line to gold versus login order and return its slope.

    The x axis is the 0-based position of each login after sorting by
    ``@timestamp``, so the slope is the average change per login.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``gold_slope_login``; 0.0 when the group has
        fewer than two rows.
    """
    y = group.sort_values('@timestamp')['event.its.properties.gold'].values
    count = len(y)
    if count < 2:
        return {'gold_slope_login': 0.0}

    x = np.arange(count)
    dx = x - x.mean()
    dy = y - y.mean()

    covariance_sum = np.sum(dx * dy)
    variance_sum = np.sum(dx ** 2)

    if variance_sum != 0:
        return {'gold_slope_login': covariance_sum / variance_sum}
    return {'gold_slope_login': 0.0}

In [62]:
# trend (least-squares slope) of diamond over the user's successive logins
def slope_diamond_login(group):
    """Fit a straight line to diamond versus login order and return its slope.

    The x axis is the 0-based position of each login after sorting by
    ``@timestamp``, so the slope is the average change per login.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``diamond_slope_login``; 0.0 when the group has
        fewer than two rows.
    """
    y = group.sort_values('@timestamp')['event.its.properties.diamond'].values
    count = len(y)
    if count < 2:
        return {'diamond_slope_login': 0.0}

    x = np.arange(count)
    dx = x - x.mean()
    dy = y - y.mean()

    covariance_sum = np.sum(dx * dy)
    variance_sum = np.sum(dx ** 2)

    if variance_sum != 0:
        return {'diamond_slope_login': covariance_sum / variance_sum}
    return {'diamond_slope_login': 0.0}

In [63]:
# trend (least-squares slope) of silkcoin (dragon_gold column) over successive logins
def slope_silkcoin_login(group):
    """Fit a straight line to dragon_gold versus login order and return its slope.

    The x axis is the 0-based position of each login after sorting by
    ``@timestamp``, so the slope is the average change per login.

    Args:
        group: DataFrame holding one user's login events.

    Returns:
        dict with the single key ``silkcoin_slope_login``; 0.0 when the group has
        fewer than two rows.
    """
    y = group.sort_values('@timestamp')['event.its.properties.dragon_gold'].values
    count = len(y)
    if count < 2:
        return {'silkcoin_slope_login': 0.0}

    x = np.arange(count)
    dx = x - x.mean()
    dy = y - y.mean()

    covariance_sum = np.sum(dx * dy)
    variance_sum = np.sum(dx ** 2)

    if variance_sum != 0:
        return {'silkcoin_slope_login': covariance_sum / variance_sum}
    return {'silkcoin_slope_login': 0.0}

In [64]:
# level recorded at the user's most recent login
def last_login_level(group):
    """Return the level value of the row with the latest ``@timestamp``.

    Args:
        group: DataFrame holding one user's login events (must not be empty).

    Returns:
        dict with the single key ``level_last_login``.
    """
    newest_first = group.sort_values('@timestamp', ascending=False)
    newest_row = newest_first.iloc[0]
    return {'level_last_login': newest_row['event.its.properties.level']}

In [65]:
# VIP level recorded at the user's most recent login
def last_login_vip(group):
    """Return the vip_level value of the row with the latest ``@timestamp``.

    Args:
        group: DataFrame holding one user's login events (must not be empty).

    Returns:
        dict with the single key ``vip_last_login``.
    """
    newest_first = group.sort_values('@timestamp', ascending=False)
    newest_row = newest_first.iloc[0]
    return {'vip_last_login': newest_row['event.its.properties.vip_level']}

In [66]:
# Registry of per-user feature functions. Each one takes a user's group of login
# rows and returns a dict of {feature_name: value}; extract_group_features calls
# them in this order and merges the dicts.
features_function=[
   last_update_time,
   num_day_login,
   num_session_on_day_login,
   avg_between_login,
   streak_login,
   last_since_from_login,
   popular_time_login,
   std_gold_login,
   std_diamond_login,
   std_silkcoin_login,
   slope_gold_login,
   slope_diamond_login,
   slope_silkcoin_login,
   last_login_powerpoint,
   last_login_level,
   last_login_vip,
]

In [67]:
def extract_group_features(group, feature_functions=None):
    """Run every feature function on one user's rows and merge the results.

    A feature function that raises is reported on stdout (by function name) and
    skipped, so one failing feature does not abort the whole extraction; its
    keys are simply absent from the returned Series.

    Args:
        group: DataFrame holding one user's login events.
        feature_functions: iterable of callables, each mapping ``group`` to a
            dict of feature values. When omitted, the notebook-level
            ``features_function`` list is used, which keeps the original behaviour.

    Returns:
        pandas Series indexed by feature name.
    """
    if feature_functions is None:
        feature_functions = features_function
    result = {}
    for func in feature_functions:
        try:
            result.update(func(group))
        except Exception as e:
            # Report the readable function name instead of the raw object repr.
            func_name = getattr(func, '__name__', repr(func))
            print(f"Error at feature: {func_name}. Error: {e}")
    return pd.Series(result)

# Build one feature row per user: extract_group_features is applied to each vopenid
# group and reset_index turns the group key back into a regular column.
# NOTE(review): the saved output repeats this statement the way a Python warning
# does — presumably pandas' deprecation of apply() operating on the grouping
# column; confirm, and consider selecting the feature columns before apply().
df_res = data.groupby('vopenid').apply(extract_group_features).reset_index()
df_res

  df_res = data.groupby('vopenid').apply(extract_group_features).reset_index()


Unnamed: 0,vopenid,last_update_time,num_day_login,avg_login_per_day,avg_between_login,streak_login,last_since_from_login,popular_time_login,gold_std_login,diamond_std_login,silkcoin_std_login,gold_slope_login,diamond_slope_login,silkcoin_slope_login,powerpoint_last_login,level_last_login,vip_last_login
0,zZzHoaLanzZz,2025-05-19 03:03:46+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,56182.00
1,⁀ᶦᵈᵒᶫViệtCon︵⁹²,2025-05-27 13:12:04+00:00,1,1.00,0.00,1,19,13,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,2504081111499176.00,10847526.00
2,!!!ThầnLong!!!,2025-05-25 01:48:58+00:00,1,1.00,0.00,1,22,1,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,2654.00
3,!RockbabyRock!,2025-05-25 08:37:09+00:00,1,1.00,0.00,1,22,8,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,4667902.00
4,!YếnNhi!,2025-05-19 03:10:43+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,142719047.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186369,ＶＥＲＥＳ,2025-05-17 07:15:30+00:00,1,1.00,0.00,1,30,7,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,2503041831463798.00,29178943.00
186370,ＶＩＢＢＡＮＫ,2025-05-25 01:52:26+00:00,1,1.00,0.00,1,22,1,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,456923290.00
186371,Ｖａｍｐｉｒｅ,2025-05-19 03:09:46+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,18525639.00
186372,ｐｅｎｇｕｉｎ,2025-05-19 03:02:09+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,4882709.00


In [68]:
# Normalise the user key to a string without surrounding whitespace.
# NOTE(review): this runs after the groupby, so ids that differ only by surrounding
# whitespace were aggregated separately and can now share the same vopenid — verify
# the key is still unique, or normalise before grouping.
df_res['vopenid'] = df_res['vopenid'].astype(str).str.strip()

In [69]:
# Persist the per-user login features as CSV (header row kept, index not written).
df_res.to_csv("data/login_transform.csv",index=False,header=True)

In [70]:
df_res['vopenid'].nunique()

186374

In [71]:
df_res

Unnamed: 0,vopenid,last_update_time,num_day_login,avg_login_per_day,avg_between_login,streak_login,last_since_from_login,popular_time_login,gold_std_login,diamond_std_login,silkcoin_std_login,gold_slope_login,diamond_slope_login,silkcoin_slope_login,powerpoint_last_login,level_last_login,vip_last_login
0,zZzHoaLanzZz,2025-05-19 03:03:46+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,56182.00
1,⁀ᶦᵈᵒᶫViệtCon︵⁹²,2025-05-27 13:12:04+00:00,1,1.00,0.00,1,19,13,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,2504081111499176.00,10847526.00
2,!!!ThầnLong!!!,2025-05-25 01:48:58+00:00,1,1.00,0.00,1,22,1,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,2654.00
3,!RockbabyRock!,2025-05-25 08:37:09+00:00,1,1.00,0.00,1,22,8,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,4667902.00
4,!YếnNhi!,2025-05-19 03:10:43+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,142719047.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186369,ＶＥＲＥＳ,2025-05-17 07:15:30+00:00,1,1.00,0.00,1,30,7,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,2503041831463798.00,29178943.00
186370,ＶＩＢＢＡＮＫ,2025-05-25 01:52:26+00:00,1,1.00,0.00,1,22,1,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,456923290.00
186371,Ｖａｍｐｉｒｅ,2025-05-19 03:09:46+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,18525639.00
186372,ｐｅｎｇｕｉｎ,2025-05-19 03:02:09+00:00,1,1.00,0.00,1,28,3,0.00,0.00,0.00,0.00,0.00,0.00,-1.00,-1.00,4882709.00
