In [32]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import ast
import re



##### Caminho dos arquivos sources

In [33]:
user_csv_path = "/usr/local/airflow/artifacts/parquets"
user_target_csv_file = "users_merged_data.parquet"

articles_csv_path = "/usr/local/airflow/artifacts/parquets"
articles_target_csv_file = "articles_merged_data.parquet"

artifacts_models_path = "/usr/local/airflow/artifacts/models"


##### Carregando os dados dos usuários, artigos e validacao

In [34]:
df_users = pd.read_parquet(f"{user_csv_path}/{user_target_csv_file}")
df_articles = pd.read_parquet(f"{articles_csv_path}/{articles_target_csv_file}")
df_validacao = pd.read_csv("../data/validacao.csv")



In [35]:
df_validacao.shape

(112184, 4)

##### Preenchendo valores ausentes de 'history' com lista vazia

In [36]:
df_users.fillna({"history": "[]"}, inplace=True)
df_articles.fillna("", inplace=True)


In [37]:
df_articles["issued"] = pd.to_datetime(df_articles["issued"]).astype(int) // 10**9
df_articles["modified"] = pd.to_datetime(df_articles["modified"]).astype(int) // 10**9


In [38]:
def convert_to_mean(value):
    if isinstance(value, str):
        values = [float(x.strip()) for x in value.split(",") if x.strip().replace('.', '', 1).isdigit()]
        return np.mean(values) if values else 0
    return value

cols_to_convert = ["numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory", "pageVisitsCountHistory"]
for col in cols_to_convert:
    df_users[col] = df_users[col].apply(convert_to_mean)
    
# df_users.head(3)

In [39]:
scaler = MinMaxScaler()
df_users[[
    "historySize",
    "numberOfClicksHistory",
    "timeOnPageHistory",
    "scrollPercentageHistory",
    "pageVisitsCountHistory"
]] = \
    scaler.fit_transform(
        df_users[
            [
                "historySize",
                "numberOfClicksHistory",
                "timeOnPageHistory",
                "scrollPercentageHistory",
                "pageVisitsCountHistory"
            ]
        ]
    )
# df_users.head(3)

In [40]:
df_users["history"] = df_users["history"].apply(lambda x: x.split(",") if isinstance(x, str) else [])

In [41]:
encoder = LabelEncoder()
df_users["userType"] = encoder.fit_transform(df_users["userType"])


In [42]:
df_users_sampled = df_users.sample(n=5000, random_state=42)
df_articles_sampled = df_articles.sample(n=1000, random_state=42)

In [43]:
interaction_data_sampled = []
for _, row in df_users_sampled.iterrows():
    for article in row['history']:
        interaction_data_sampled.append((row['userId'], article))

df_interactions = pd.DataFrame(interaction_data_sampled, columns=["userId", "page"])


In [44]:
# df_interactions.to_parquet("../artifacts/parquets/interactions.parquet", index=False)
print("✅ Pré-processamento concluído e salvo em Parquet!")

✅ Pré-processamento concluído e salvo em Parquet!


In [45]:
interaction_matrix = df_interactions.pivot_table(index='userId', columns='page', aggfunc='size', fill_value=0)
# interaction_matrix = pd.DataFrame()

# batch_size = 500
# for start in range(0, len(df_interactions), batch_size):
#     batch = df_interactions.iloc[start:start + batch_size]
#     interaction_matrix_batch = batch.groupby(['userId', 'page']).size().unstack(fill_value=0)

#     if interaction_matrix.empty:
#         interaction_matrix = interaction_matrix_batch
#     else:
#         interaction_matrix = interaction_matrix.add(interaction_matrix_batch, fill_value=0)

# interaction_matrix = interaction_matrix.fillna(0).astype(int) 


In [46]:
def compute_similarity_batch(matrix, batch_size=400):
    indices = matrix.index
    similarity_matrix = np.zeros((len(indices), len(indices)))

    for start in range(0, len(indices), batch_size):
        end = min(start + batch_size, len(indices))
        batch_similarity = cosine_similarity(matrix.iloc[start:end], matrix)
        similarity_matrix[start:end] = batch_similarity

    return pd.DataFrame(similarity_matrix, index=indices, columns=indices)

In [47]:
print("🔄 Calculando matriz de similaridade em batches...")
cos_sim_df = compute_similarity_batch(interaction_matrix, batch_size=400)
print("✅ Matriz de similaridade calculada!")

🔄 Calculando matriz de similaridade em batches...
✅ Matriz de similaridade calculada!


In [48]:
print(cos_sim_df.index) 

Index(['00182678dd5d1b34b2fdcd9895047534d727bdbfe8a2c9a685b5dbe41b06ec08',
       '001d8e123f6cd5e5268e5145481b79a7cce0bd040cae9cca83e46dca64928a52',
       '00291fc8bb37f717d15510726f6456ee194683ef1db0597052278cc692bee27f',
       '00303f741dd07bd666fb7b28a6f91ec26c7801e3ef19515a55435c9b2d765637',
       '003815bb5fd4e3630767b97ad3878fd817012d26ff637982d03ce676e2473d45',
       '005a263fa9eb634bb2f1fa59adba5eb10436b95e63f2afbcadfa9a665d62bc4d',
       '005a440341464da88516ced0262e7b5980f67e6996a437500589c0f8d9843339',
       '0062a2ad6f5315aba661afffd689165f42d4eb39699f15a557c3952eef87893d',
       '006d2226166f56d91ca9f0547518e314c1baf437d8129b8c2f05679979763f65',
       '00795353f5d395a7e4a60fa75fe51f118c080779150cb4e61e5add850ef16fe0',
       ...
       'ffaf2bba8524410db873a62300be2a8fa4be7134de5f784a518a3fd0a5a65b7f',
       'ffb6140c9f22dcecfb0d4b198cee8e4800ee9c6cbd943073b6b1de13307b5e49',
       'ffbba239f5833b50d94b28d9921b1320ba50c52c87523e094d8ce2b86a7a0607',
       'ffbfdf

##### K Means

In [49]:
# kmeans = KMeans(n_clusters=3, random_state=42)
# df_users_sampled['cluster'] = kmeans.fit_predict(df_users_sampled[['numberOfClicksHistory', 'timeOnPageHistory']])

# # Exibindo clusters dos usuários
# print("Clusters dos usuários:")
# print(df_users_sampled[['userId', 'cluster']].head())

In [50]:

df_validacao["userId"] = df_validacao["userId"].str.strip()
df_validacao['history'] = df_validacao['history'].apply(
    lambda hist: hist.replace('\n', ' ')
                     .replace("'", ' ')
                     .replace("[", ' ')
                     .replace("]", ' ')
                     .strip() if isinstance(hist, str) else hist
)

cos_sim_df.index = cos_sim_df.index.str.strip()
df_validacao = df_validacao[df_validacao["userId"].isin(cos_sim_df.index)]

df_validacao


Unnamed: 0,userId,userType,history,timestampHistory
31,774b02cc2933f9965beeb972b8ccbc73b422ae5629b7a6...,Logged,df9e72d2-8c0e-4d69-aa89-e38fcb1fece6 b6832e...,[1660655488271 1660655495522]
78,42fc794baae506a7f52437551b575ec64c888d2833782c...,Logged,d06c8c8c-ed25-4ac7-b4eb-16f711dbb9c0 b19d80...,[1660582117088 1660637425062]
155,8e051215c10c6b221212e33161abff8183aaabe821dad6...,Logged,a4b081bb-dcc2-49cf-a975-583d59cca0d1 63e0b4...,[1660576231246 1660664660616]
282,2e47e3a007614db7219e4fc9f33ef2f403b8c2cb44643a...,Logged,c5a47987-c273-409a-ba13-134b906660a8 f52361...,[1660655752266 1660661123504]
410,dc7cf351da0db5daf7f4751a2d00a394bede88142466a8...,Logged,78996117-18ae-4fc5-8779-acfc8d271ca2,[1660617194601]
...,...,...,...,...
111818,fb4251f41f861401dc53ae70dd79b26e06f5ea5ce7326d...,Non-Logged,eac0d074-ca85-4883-8af0-d9b787ec770f,[1660571851482]
111839,1864d65c5667af2966561265872c9af88610c41471a52b...,Non-Logged,62818cca-7b45-4942-a3db-8c34a52fd4c8,[1660561133513]
111872,8881f1b6c262bd9863b17dc228ef0fab5886f77161c3be...,Non-Logged,b5320e4f-5759-48db-aa73-f9684f55a528,[1660678716188]
111966,ae20cb06270c0f3db6207d50d32f817e0accd605198ef1...,Non-Logged,6ceed9c6-265b-48b6-b5ad-4fb26e8f9cbc,[1660672904871]


In [51]:
cos_sim_df.head(3)

userId,00182678dd5d1b34b2fdcd9895047534d727bdbfe8a2c9a685b5dbe41b06ec08,001d8e123f6cd5e5268e5145481b79a7cce0bd040cae9cca83e46dca64928a52,00291fc8bb37f717d15510726f6456ee194683ef1db0597052278cc692bee27f,00303f741dd07bd666fb7b28a6f91ec26c7801e3ef19515a55435c9b2d765637,003815bb5fd4e3630767b97ad3878fd817012d26ff637982d03ce676e2473d45,005a263fa9eb634bb2f1fa59adba5eb10436b95e63f2afbcadfa9a665d62bc4d,005a440341464da88516ced0262e7b5980f67e6996a437500589c0f8d9843339,0062a2ad6f5315aba661afffd689165f42d4eb39699f15a557c3952eef87893d,006d2226166f56d91ca9f0547518e314c1baf437d8129b8c2f05679979763f65,00795353f5d395a7e4a60fa75fe51f118c080779150cb4e61e5add850ef16fe0,...,ffaf2bba8524410db873a62300be2a8fa4be7134de5f784a518a3fd0a5a65b7f,ffb6140c9f22dcecfb0d4b198cee8e4800ee9c6cbd943073b6b1de13307b5e49,ffbba239f5833b50d94b28d9921b1320ba50c52c87523e094d8ce2b86a7a0607,ffbfdf2b0ba51cf305b501570ca7c0aa5066565bbd0d9306037bff7606cce0c3,ffd42d2faf86e4dc0d0534749d65f8a524a27b03a5c0760004a7b8b473cae70e,ffd53e49fef1d5c1e74d59bca2192b81e680c747291a20507d819186dde8249c,ffdc918dbcb8a744f87f020fb7a8a572da5816294eb83e567cf68d1639ef4d5f,ffe13a4671353d41f3985d1e8dcf4f4226f52fbb92cd9075f8a609a10f4f5fc8,ffee9934ea6cc18c9555da8d3dedece24409dbf25045e257aa62dcb62d758f70,fffe297d4d14f23bbf4dcc81b530f40478e4d1118f9e6483dc99461ceb3a51e8
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00182678dd5d1b34b2fdcd9895047534d727bdbfe8a2c9a685b5dbe41b06ec08,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001d8e123f6cd5e5268e5145481b79a7cce0bd040cae9cca83e46dca64928a52,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00291fc8bb37f717d15510726f6456ee194683ef1db0597052278cc692bee27f,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Função para calcular a precisão das recomendações

In [52]:
def evaluate_recommendations(recommended_articles, history):
    # print(f"recommended_articles: {recommended_articles}")
    # print(f"history: {history}")

    # Verifica se history é um array NumPy e converte para lista
    if isinstance(history, np.ndarray):
        history = history.tolist()  # Converte array NumPy para lista normal
    
    # Verifica se history é uma string concatenada em vez de uma lista real
    if isinstance(history, str):
        history = history.strip("[]").replace("'", "").split()
        
    # print(type(history))
    relevant_articles = [article.strip().lower() for article in history]
    # print(relevant_articles)
    
    
    recommended_articles = [article.strip().lower() for article in recommended_articles]
    correct_recommendations = len(set(recommended_articles).intersection(set(relevant_articles)))
    total_recommendations = len(recommended_articles)
    return correct_recommendations, total_recommendations

In [53]:
df_validacao.shape

(963, 4)

##### Recomendações para usuários de validação

In [54]:
correct_recommendations = 0
total_recommendations = 0

for _, row in df_validacao.iterrows():
    user_id = row['userId']
    # print(user_id)
    # Verificar se o user_id está presente no cos_sim_df (índice)
    if user_id not in cos_sim_df.index:
        print(f"❌ O userId {user_id} não está presente na matriz de similaridade.")
        continue  # Pula o usuário se não estiver no cos_sim_df

    # Usuários similares
    user_similarity = cos_sim_df.loc[user_id]
    # print(user_similarity)
    similar_users = user_similarity.sort_values(ascending=False).index[1:3]
    # print(similar_users)

    # Artigos recomendados
    # recommended_articles = []
    # for similar_user in similar_users:
    #     similar_user_articles = df_interactions[df_interactions['userId'] == similar_user]['page']
    #     recommended_articles.extend(similar_user_articles)

    recommended_articles = {
        article.strip() for similar_user in similar_users
        for article in df_interactions[df_interactions['userId'] == similar_user]['page']
    }
    # Removendo duplicatas
    recommended_articles = list(recommended_articles)

    # Avaliar precisão das recomendações
    correct, total = evaluate_recommendations(recommended_articles, row['history'])
    correct_recommendations += correct
    total_recommendations += total

##### Calculando a precisão das recomendações

In [55]:
print(f"correct_recommendations: {correct_recommendations}")
print(f"total_recommendations: {total_recommendations}")
precision = correct_recommendations / total_recommendations if total_recommendations > 0 else 0
print(f"✅ Precisão das recomendações: {precision:.6f}")

correct_recommendations: 209
total_recommendations: 374559
✅ Precisão das recomendações: 0.000558


In [56]:
# Recomendações para um usuário logado
user_id = df_users_sampled['userId'].iloc[0]  # Exemplo de usuário logado (pode ser qualquer userId)
user_similarity = cos_sim_df[user_id]
similar_users = user_similarity.sort_values(ascending=False).index[1:3]
print(f"Usuários mais similares ao usuário {user_id}:")
print(similar_users)

Usuários mais similares ao usuário c196609069bdb5a080bdc889d71028674e580318f0bd6c1bcc869ee0e632a735:
Index(['ac5ced0ff05fef6681f246a4a2703e38fbb76009ddc7c58632d5c35d83050da3',
       'ac35f2157929adc7128d6309da6ad34e9f0d9483b00143b598afacddcd78d129',
       'ac29f3dcb11736a2ca4df2a1071b8e8bb20b7c1e83b97fe18c669fe767f86d0e',
       'ac28b85fb23da7b4c32ffb8adef475078382ee265f783e9001ac00f0c513ee3a',
       'ac18e45fb2f8b5ef362abc2c365005e602b7786651b0652400ab761ed8a1d88f',
       'ac0631ac8052e2dc17390159b003ab5acdbe061e851432911b636a1eec2d1b1a',
       'abf7d8705bc6b4e1244d82a2afece0724926e523f097616c19029a1e40297b7b',
       'abf47a3e9a9410f4770ec494ee6b98ea1b4841f0d52bdeebf262ba68a743bffd',
       'abd5af7165511c280e238297d8b5838bcbf5adbb983665daa0e2c0b8850382bb'],
      dtype='object', name='userId')


##### Sugestões de artigos baseadas em similaridade (para o usuário logado)


In [57]:
recommended_articles = []
for similar_user in similar_users:
    similar_user_articles = df_interactions[df_interactions['userId'] == similar_user]['page']
    recommended_articles.extend(similar_user_articles)

In [58]:
# Exibindo os artigos recomendados
recommended_articles = list(set(recommended_articles))  # Removendo duplicatas
recommended_articles = [article.strip() for article in recommended_articles]  # Removendo espaços extras
print("Artigos recomendados:")
print(recommended_articles)

Artigos recomendados:
['6b6635bb-d15a-4698-97b2-f3f25be0d019', '533b3efd-b7d4-418e-92a9-0a1d88440963', 'ebc00bed-3e2e-45d3-8a9b-0ce430f30d0f', '727213de-82b8-41b6-9296-2c2ee095c001', '07d86b0a-993d-4f78-8437-c1eadd2b3c46', 'eaf9c665-560e-4bca-9574-983f2e11264a', 'eb538002-709e-4054-9a68-8808aa5d5678', 'e065162f-baf3-4d42-a6d9-5acdb981b37f', '325cb5b1-8e3b-464c-8d13-2bf4748d2c89', '85beb942-01d5-44ce-a5aa-296139c73667', '362d282e-5d9d-4691-86a6-21bcac0703d7', '13f2cc37-f575-44d5-b33f-045d0b0a912b', '7fe849c0-4a55-429d-b480-11ee216909dd', '76844ba9-6ab4-4397-aeb9-7bab8df2f95d', 'a3a2b2fd-9ae8-4fcf-bbd9-7db104411fcc', '139c9e4d-b7eb-43d0-b2fc-1cb230f0adc8', '78f442cc-1fb3-4bf0-a9c7-ab6cf5a73526', '6db754a3-73b8-4ffa-895f-a03725486bff', '2ad4930b-bd99-4884-9c07-12cc46f53417', '6452a218-de3c-4511-b482-d58f26b7ecda', '1870371b-1bec-465d-9213-0f9b3162dfad', '882e7c95-935a-4eab-9ece-f85f5f7d0f4e', '39959832-be7e-4420-94ab-9244f409bfe9', 'cbd5caa8-568d-408e-909f-056c32d6b75a', 'c2d694c1-f651-44

In [59]:
# Recomendações para usuários não logados: artigos populares
popular_articles = df_interactions.groupby('page').size().reset_index(name='popularity')
popular_articles = popular_articles.sort_values('popularity', ascending=False)
print("Artigos mais populares:")
print(popular_articles.head(3))

Artigos mais populares:
                                        page  popularity
13221   d2593c3d-2347-40d9-948c-b6065e8459a9         208
15610   f6b5d170-48b9-4f8e-88d4-c84b6668f3bd         190
6708    6a83890a-d9e9-4f6b-a6c6-90d031785bbf         162


In [60]:
joblib.dump(kmeans, f"{artifacts_models_path}/kmeans_model.pkl")

# Save cosine similarity matrix
with open(f"{artifacts_models_path}/cos_sim_matrix.pkl", "wb") as f:
    pickle.dump(cos_sim_df, f)

# Save user and article DataFrames for later use
df_users_sampled.to_pickle(f"{artifacts_models_path}/usuarios_preprocessados.pkl")
df_articles_sampled.to_pickle(f"{artifacts_models_path}/artigos_preprocessados.pkl")