##### Carregando libs

In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from datetime import datetime, timezone

##### Declarando paths

In [2]:
# File name of the merged-articles dataset read by this notebook.
articles_target_file = "articles_merged_data.parquet"

# Root directory comes from the HOST_PATH environment variable.
# NOTE(review): when HOST_PATH is unset the lookup yields None and the path
# below starts with the literal text "None" — confirm it is always exported.
root_path = os.environ.get('HOST_PATH')
parquets_path = "/".join([f"{root_path}", "artifacts", "parquets"])

##### Carregando parquets

In [3]:
# Build the dataset location first, then load it into a DataFrame.
articles_parquet_file = f"{parquets_path}/{articles_target_file}"
df_articles = pd.read_parquet(articles_parquet_file)

In [4]:
# Replace every missing value, in all columns, with an empty string.
df_articles = df_articles.fillna("")

##### Processando para recência

In [5]:
def recency_score(timestamp, alpha=0.001, now=None):
    """Return an exponential recency score, ``exp(-alpha * age_in_days)``.

    Parameters
    ----------
    timestamp : datetime-like
        Instant to score. It is subtracted from a timezone-aware reference,
        so it must be timezone-aware as well. Missing values (None, NaN, NaT)
        produce NaN.
    alpha : float, default 0.001
        Decay rate applied per whole day of age.
    now : datetime-like, optional
        Timezone-aware reference instant. Defaults to the current UTC time.
        Pass a fixed value to obtain reproducible scores.

    Returns
    -------
    float
        ``exp(-alpha * days)``, or NaN when ``timestamp`` is missing.
    """
    # Guard first: a missing timestamp has no age to compute.
    if pd.isnull(timestamp):
        return np.nan
    if now is None:
        now = datetime.now(timezone.utc)
    # Only the whole-day component of the age is used.
    days_diff = (now - timestamp).days
    return np.exp(-alpha * days_diff)

In [6]:
# Parse `issued` as UTC timestamps, score each one, then rescale the scores
# so the largest equals 1.
issued_utc = pd.to_datetime(df_articles["issued"], utc=True)
df_articles["issued"] = issued_utc
raw_recency = issued_utc.apply(recency_score)
df_articles["recency_score"] = raw_recency / raw_recency.max()


##### Aplicando decaimento temporal

In [7]:
def time_decay(timestamp, half_life=7, now=None):
    """Apply exponential decay with the given half-life, measured in days.

    Parameters
    ----------
    timestamp : datetime-like
        Instant to score. It is subtracted from a timezone-aware reference,
        so it must be timezone-aware as well. Missing values (None, NaN, NaT)
        produce NaN.
    half_life : float, default 7
        Number of days after which the returned weight halves.
    now : datetime-like, optional
        Timezone-aware reference instant. Defaults to the current UTC time.
        Pass a fixed value to obtain reproducible weights.

    Returns
    -------
    float
        ``0.5 ** (days / half_life)``, or NaN when ``timestamp`` is missing.
    """
    # Same missing-value contract as recency_score: NaN in, NaN out.
    if pd.isnull(timestamp):
        return np.nan
    if now is None:
        now = datetime.now(timezone.utc)
    # Only the whole-day component of the age is used.
    days_diff = (now - timestamp).days
    return 0.5 ** (days_diff / half_life)

# Weight each `issued` value with a 7-day half-life; Series.apply forwards
# the keyword argument to time_decay.
df_articles["time_decay"] = df_articles["issued"].apply(time_decay, half_life=7)

In [8]:
# Blend the two signals: 60% normalised recency, 40% half-life decay.
recency_weight, decay_weight = 0.6, 0.4
df_articles["final_score"] = (
    df_articles["recency_score"].mul(recency_weight)
    + df_articles["time_decay"].mul(decay_weight)
)


In [9]:
# Convert both timestamp columns to whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second does not rely on the
# platform's default integer width or on the column's datetime resolution,
# both of which astype(int) // 10**9 depends on.
# utc=True reads naive values as UTC, so the subtraction is always valid.
# Rows with a missing timestamp (NaT) come out as NaN, which makes that
# column float instead of integer.
# NOTE: this cell is not idempotent — re-running it on already-converted
# integers would reinterpret them; re-run from the loading cell instead.
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
one_second = pd.Timedelta(seconds=1)
for column in ("issued", "modified"):
    df_articles[column] = (pd.to_datetime(df_articles[column], utc=True) - unix_epoch) // one_second

##### Gerando amostra

In [10]:
# Draw a reproducible random subset (fixed seed) of at most 50000 articles.
# DataFrame.sample raises ValueError when n exceeds the number of rows, so
# the request is capped at the frame's length; frames with 50000 rows or
# more are sampled exactly as before.
sample_size = min(50000, len(df_articles))
df_articles_sampled = df_articles.sample(n=sample_size, random_state=42)

##### Dividindo entre treino e teste

In [11]:
# Hold out 20% of the sampled articles for testing; the fixed seed keeps the
# partition stable between runs.
split_frames = train_test_split(
    df_articles_sampled,
    test_size=0.2,
    random_state=42,
)
df_articles_train, df_articles_test = split_frames

In [12]:
# Preview the first three rows of the training partition.
df_articles_train.iloc[:3]

Unnamed: 0,page,url,issued,modified,title,body,caption,recency_score,time_decay,final_score
118617,d4f8487d-6f75-413d-9657-34aa3e1cd593,http://g1.globo.com/fantastico/noticia/2021/07...,1626053996,1626053996,"Documentário inédito revela caos no Haiti, paí...","Documentário inédito revela caos no Haiti, paí...",Cinco dias depois do assassinato do presidente...,0.670991,7.035664e-58,0.402594
250848,eedb0817-cbe3-4959-aa84-017989fbd42a,http://g1.globo.com/mundo/noticia/2022/06/03/g...,1654248702,1654252525,"Guerra na Ucrânia não terá vencedor, afirma ON...",Soldados ucranianos conversam com morador que ...,Ministro da Defesa ucraniano diz que Rússia já...,0.929601,7.357022e-44,0.55776
55112,661064de-acc8-48bd-93ee-b83500e718c5,http://g1.globo.com/trabalho-e-carreira/concur...,1657530049,1657530049,Brasil tem mais de 110 concursos públicos com ...,Mais de 150 concursos públicos com inscrições ...,Há oportunidades para todos os níveis de escol...,0.965605,3.168582e-42,0.579363


##### Gerando parquets de treino e teste

In [13]:

# Write both partitions into the parquet directory; index=False leaves the
# DataFrame index out of the files.
for split_frame, split_file in (
    (df_articles_train, "artigos_treino_preprocessados.parquet"),
    (df_articles_test, "artigos_test_preprocessados.parquet"),
):
    split_frame.to_parquet(f"{parquets_path}/{split_file}", index=False)