##### Carregando bibliotecas

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import logging
from sklearn.model_selection import train_test_split
import os
import re

##### Declarando paths

In [2]:
# The project root is read from the HOST_PATH environment variable; the parquet folder is derived from it.
root_path = os.getenv('HOST_PATH')
if root_path is None:
    # With HOST_PATH unset, the f-string below renders the literal text "None/artifacts/parquets".
    # Flag the misconfiguration explicitly instead of leaving it to show up later as an odd path.
    logging.warning(
        "HOST_PATH environment variable is not set; parquet paths will resolve under 'None/'."
    )
parquets_path = f"{root_path}/artifacts/parquets"
# File name of the merged users dataset inside parquets_path.
user_target_file = "users_merged_data.parquet"

##### Carregando Parquet de usuários

In [3]:
# Read the merged users dataset from "<parquets_path>/<user_target_file>".
df_users = pd.read_parquet(
    "/".join((parquets_path, user_target_file))
)

##### Tratando valores do dataframe

Tratando valores nulos (NA) de `history` como lista vazia e transformando a coluna em lista de UUIDs válidos

In [4]:
# Canonical textual UUID: lowercase hex groups of 8-4-4-4-12 characters joined by hyphens.
_UUID_PATTERN = re.compile(
    r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}'
)


def is_valid_uuid(value):
    """Tell whether ``value`` is a lowercase, hyphenated UUID string.

    Args:
        value: Candidate identifier. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True only when the whole string is five lowercase-hex groups of
        8, 4, 4, 4 and 12 characters separated by hyphens.
    """
    if not isinstance(value, str):
        return False
    # fullmatch anchors both ends, so a trailing newline or misplaced hyphens are rejected.
    return _UUID_PATTERN.fullmatch(value) is not None

def _parse_history(raw):
    """Return the valid UUID strings found in one raw ``history`` cell.

    Args:
        raw: A comma-separated string of ids, an already-parsed sequence of ids,
            or a missing value (None / NaN).

    Returns:
        list: Stripped ids that pass ``is_valid_uuid``; empty for missing or
        unsupported values.
    """
    if isinstance(raw, str):
        candidates = raw.split(",")
    elif isinstance(raw, (list, tuple, np.ndarray)):
        # Already parsed (for example when this cell is re-run): re-validate the
        # items instead of discarding the whole history.
        candidates = raw
    else:
        # Missing values and any other type carry no history.
        candidates = []
    return [
        item.strip()
        for item in candidates
        if isinstance(item, str) and is_valid_uuid(item.strip())
    ]


# No separate fillna step is needed: missing cells take the empty-list branch above,
# which also avoids mutating the frame in place before the conversion.
df_users["history"] = df_users["history"].apply(_parse_history)


Convertendo timestampHistory para lista de inteiros

In [5]:
def _parse_timestamps(raw):
    """Turn one raw ``timestampHistory`` cell into a list of integers.

    Args:
        raw: A comma-separated string of integers, an already-parsed sequence,
            or a missing value (None / NaN).

    Returns:
        list: The integer timestamps; empty for missing or unsupported values.

    Raises:
        ValueError: If a non-blank token cannot be parsed as an integer.
    """
    if isinstance(raw, str):
        # Blank tokens (empty string, trailing comma) are skipped; int("") would raise.
        return [int(token) for token in raw.split(",") if token.strip()]
    if isinstance(raw, (list, tuple, np.ndarray)):
        # Already parsed (for example when this cell is re-run): keep the values.
        return [int(stamp) for stamp in raw]
    return []


df_users["timestampHistory"] = df_users["timestampHistory"].apply(_parse_timestamps)

Convertendo os dados comportamentais do usuário para a média dos valores

In [6]:
def convert_to_mean(value):
    """Collapse a comma-separated string of numbers into their arithmetic mean.

    Args:
        value: A string such as ``"1, 2.5, 3"``; any non-string value is
            returned unchanged.

    Returns:
        The mean of every token that parses as a finite float, ``0.0`` when the
        string holds no such token, or ``value`` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value

    values = []
    for token in value.split(","):
        # float() accepts surrounding whitespace, signs and scientific notation,
        # so negatives and values like "1e-05" are no longer silently dropped.
        try:
            number = float(token)
        except ValueError:
            # Blank or non-numeric token: ignore it.
            continue
        # "nan" / "inf" parse as floats but would poison the mean.
        if np.isfinite(number):
            values.append(number)
    return np.mean(values) if values else 0.0

# Behavioural columns to be reduced to a single averaged value per user via convert_to_mean.
cols_to_convert = [
    "numberOfClicksHistory",
    "timeOnPageHistory",
    "scrollPercentageHistory",
    "pageVisitsCountHistory",
]
for col in cols_to_convert:
    averaged_column = df_users[col].apply(convert_to_mean)
    df_users[col] = averaged_column

In [7]:
# Rescale the numeric user features with a MinMaxScaler (default settings), overwriting the columns.
# NOTE(review): the scaler is fitted on the full frame, before the train/test split done
# further down, so test rows influence the fitted min/max - consider fitting on the training split only.
scaler = MinMaxScaler()
numeric_feature_cols = [
    "historySize",
    "numberOfClicksHistory",
    "timeOnPageHistory",
    "scrollPercentageHistory",
    "pageVisitsCountHistory",
]
df_users[numeric_feature_cols] = scaler.fit_transform(df_users[numeric_feature_cols])

In [8]:
# Fit a LabelEncoder on userType and overwrite the column with the encoded labels.
encoder = LabelEncoder()
user_type_codes = encoder.fit_transform(df_users["userType"])
df_users["userType"] = user_type_codes

In [9]:
# Preview the first three preprocessed rows.
df_users.iloc[:3]

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,1,0.000143,"[80aa7bb2-adce-4a55-9711-912c407927a1, d9e5f15...","[1657908085200, 1659634203762]",0.0,0.004015,0.000337,0.0,"1657908085200, 1659634203762"
1,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,1,0.000143,"[19ba89fc-1e06-4c5d-9c57-4a3088dc0511, e273dba...","[1657111508570, 1657481309920]",0.003427,0.003744,0.000189,0.0,"1657111508570, 1657481309920"
2,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,1,0.000143,"[59a61a8a-cc52-453f-b1cd-2bd019e9d574, a056280...","[1657823890328, 1660141444328]",0.002742,0.003611,0.00024,0.0,"1657823890328, 1660141444328"


##### Gerando amostra

In [10]:
# Disabled filter, kept for reference: it would keep only users for whom at least half of the
# history appears in df_articles['page'] (df_articles is not defined in the cells above).
# df_users = df_users[df_users['history'].apply(lambda history: sum(article in df_articles['page'].values for article in history) >= len(history) / 2)]

# Draw a reproducible sample of up to 10000 users. The size is capped at the number of rows
# because DataFrame.sample raises ValueError when n exceeds the frame length without replacement.
df_users_sampled = df_users.sample(n=min(10000, len(df_users)), random_state=42)

##### Dividindo entre treino e teste

In [11]:
# Split the sampled users with 20% held out as the test set; the fixed seed keeps the split reproducible.
split_frames = train_test_split(
    df_users_sampled,
    test_size=0.2,
    random_state=42,
)
df_users_train, df_users_test = split_frames

##### Gerando parquets de treino e teste

In [12]:
# Write both splits into the parquet folder, without storing the DataFrame index.
train_output_path = f"{parquets_path}/usuarios_treino_preprocessados.parquet"
test_output_path = f"{parquets_path}/usuarios_test_preprocessados.parquet"

df_users_train.to_parquet(train_output_path, index=False)
df_users_test.to_parquet(test_output_path, index=False)