In [61]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

##### Caminho dos arquivos CSV

In [62]:
user_csv_path = "../data/files/treino"
user_target_csv_file = "treino_parte1.csv"

articles_csv_path = "../data/itens/itens"
articles_target_csv_file = "itens-parte1.csv"


##### Carregando os dados dos usuários e artigos

In [None]:
df_users = pd.read_csv(f"{user_csv_path}/{user_target_csv_file}")
df_articles = pd.read_csv(f"{articles_csv_path}/{articles_target_csv_file}")

##### Preenchendo valores ausentes de 'history' com lista vazia

In [63]:
df_users.fillna({"history": "[]"}, inplace=True)  # Se history estiver vazio, substitui por lista vazia
df_articles.fillna("", inplace=True)

In [64]:
df_articles["issued"] = pd.to_datetime(df_articles["issued"]).astype(int) // 10**9
df_articles["modified"] = pd.to_datetime(df_articles["modified"]).astype(int) // 10**9


In [65]:
def convert_to_mean(value):
    if isinstance(value, str):
        values = [float(x.strip()) for x in value.split(",") if x.strip().replace('.', '', 1).isdigit()]
        return np.mean(values) if values else 0
    return value

cols_to_convert = ["numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory", "pageVisitsCountHistory"]
for col in cols_to_convert:
    df_users[col] = df_users[col].apply(convert_to_mean)

In [66]:
scaler = MinMaxScaler()
df_users[[
    "historySize",
    "numberOfClicksHistory",
    "timeOnPageHistory",
    "scrollPercentageHistory",
    "pageVisitsCountHistory"
]] = \
    scaler.fit_transform(
        df_users[
            [
                "historySize",
                "numberOfClicksHistory",
                "timeOnPageHistory",
                "scrollPercentageHistory",
                "pageVisitsCountHistory"
            ]
        ]
    )


In [67]:
df_users["history"] = df_users["history"].apply(lambda x: x.split(",") if isinstance(x, str) else [])


In [68]:
encoder = LabelEncoder()
df_users["userType"] = encoder.fit_transform(df_users["userType"])

In [69]:
df_users["userId"] = df_users["userId"].astype("category").cat.codes
df_articles["page"] = df_articles["page"].astype("category").cat.codes

In [70]:
df_users_sampled = df_users.sample(n=1000)  # Amostra de 1000 usuários
df_articles_sampled = df_articles.sample(n=100)  # Amostra de 1000 artigos

In [71]:
interaction_data_sampled = []
for _, row in df_users_sampled.iterrows():
    for article in row['history']:
        interaction_data_sampled.append((row['userId'], article))

df_interactions = pd.DataFrame(interaction_data_sampled, columns=["userId", "page"])


In [72]:
df_users_sampled.to_parquet("../artifacts/parquets/usuarios_preprocessados.parquet", index=False)
df_articles_sampled.to_parquet("../artifacts/parquets/artigos_preprocessados.parquet", index=False)
df_interactions.to_parquet("../artifacts/parquets/interactions.parquet", index=False)

print("✅ Pré-processamento concluído e salvo em Parquet!")

✅ Pré-processamento concluído e salvo em Parquet!


##### Collaborative Filtering

In [73]:
interaction_matrix = df_interactions.pivot_table(index='userId', columns='page', aggfunc='size', fill_value=0)



In [74]:
cos_sim = cosine_similarity(interaction_matrix)
cos_sim_df = pd.DataFrame(cos_sim, index=interaction_matrix.index, columns=interaction_matrix.index)

In [75]:
print("Matriz de similaridade entre usuários:")
print(cos_sim_df.head())

Matriz de similaridade entre usuários:
userId     13     74     103       113       192    573    680    742    \
userId                                                                    
13      1.000000    0.0    0.0  0.031757  0.026326    0.0    0.0    0.0   
74      0.000000    1.0    0.0  0.000000  0.000000    0.0    0.0    0.0   
103     0.000000    0.0    1.0  0.000000  0.000000    0.0    0.0    0.0   
113     0.031757    0.0    0.0  1.000000  0.081096    0.0    0.0    0.0   
192     0.026326    0.0    0.0  0.081096  1.000000    0.0    0.0    0.0   

userId  772       778    ...  98645     98698     98708  98731  99162  99171  \
userId                   ...                                                   
13        0.0  0.024626  ...    0.0  0.000000  0.041451    0.0    0.0    0.0   
74        0.0  0.000000  ...    0.0  0.000000  0.000000    0.0    0.0    0.0   
103       0.0  0.000000  ...    0.0  0.000000  0.000000    0.0    0.0    0.0   
113       0.0  0.000000  ...    0.0

##### K Means

In [76]:
kmeans = KMeans(n_clusters=3, random_state=42)
df_users_sampled['cluster'] = kmeans.fit_predict(df_users_sampled[['numberOfClicksHistory', 'timeOnPageHistory']])

# Exibindo clusters dos usuários
print("Clusters dos usuários:")
print(df_users_sampled[['userId', 'cluster']].head())

Clusters dos usuários:
       userId  cluster
57957   54594        1
67143   56041        0
52847   99852        0
92536   46674        0
7268    96376        0


##### Carregando os dados de validação

In [1]:
df_validacao = pd.read_csv("../data/validacao.csv")

NameError: name 'pd' is not defined

##### Função para calcular a precisão das recomendações

In [None]:
def evaluate_recommendations(user_id, similar_users, recommended_articles, history):
    relevant_articles = [article.strip() for article in history]
    correct_recommendations = len(set(recommended_articles).intersection(set(relevant_articles)))
    total_recommendations = len(recommended_articles)
    return correct_recommendations, total_recommendations

##### Recomendações para usuários de validação

In [None]:
correct_recommendations = 0
total_recommendations = 0

for _, row in df_validacao.iterrows():
    user_id = row['userId']
    
    # Usuários similares
    user_similarity = cos_sim_df[user_id]
    similar_users = user_similarity.sort_values(ascending=False).index[1:4]
    
    # Artigos recomendados
    recommended_articles = []
    for similar_user in similar_users:
        similar_user_articles = df_interactions[df_interactions['userId'] == similar_user]['page']
        recommended_articles.extend(similar_user_articles)
    
    # Removendo duplicatas
    recommended_articles = list(set(recommended_articles))
    
    # Avaliar precisão das recomendações
    correct, total = evaluate_recommendations(user_id, similar_users, recommended_articles, row['history'])
    correct_recommendations += correct
    total_recommendations += total

##### Calculando a precisão das recomendações

In [None]:
precision = correct_recommendations / total_recommendations if total_recommendations > 0 else 0
print(f"✅ Precisão das recomendações: {precision:.2f}")

In [77]:
# Recomendações para um usuário logado
user_id = df_users_sampled['userId'].iloc[0]  # Exemplo de usuário logado (pode ser qualquer userId)
user_similarity = cos_sim_df[user_id]
similar_users = user_similarity.sort_values(ascending=False).index[1:4]
print(f"Usuários mais similares ao usuário {user_id}:")
print(similar_users)

Usuários mais similares ao usuário 54594:
Index([63599, 13, 63937], dtype='int64', name='userId')


##### Sugestões de artigos baseadas em similaridade (para o usuário logado)


In [78]:
recommended_articles = []
for similar_user in similar_users:
    similar_user_articles = df_interactions[df_interactions['userId'] == similar_user]['page']
    recommended_articles.extend(similar_user_articles)

In [79]:
# Exibindo os artigos recomendados
recommended_articles = list(set(recommended_articles))  # Removendo duplicatas
print("Artigos recomendados:")
print(recommended_articles)

Artigos recomendados:
[' 27625dab-0353-4ea8-b88c-b945c9e782dc', ' 564f0976-781b-44e7-96c8-7377097d0101', ' f4ac3eb5-e145-4685-b9e9-b61f3551aa58', '83ebda93-e1cf-422d-ae6b-18298c55ed8e', ' 8d477e04-3bab-4ad9-8fe3-799059238a9c', ' a6e7224d-da3e-468f-bc51-26331659e06a', ' 0f9229b8-9135-4352-914e-24d0d93f42b1', ' 2330ef03-c745-4707-845a-4b1bf57d9120', ' 61e07f64-cddf-46f2-b50c-ea0a39c22050', ' eba80def-f516-4e77-b812-0173f7a2eb77', ' 2008f497-c05f-49e0-88dd-86aa1e395f15', ' 96d328b9-5da7-4389-a9ac-765e98971ab7', ' 458bf0ec-efb4-4bfd-9446-c80295e6aa87', ' a9edcef3-47c9-4be0-803d-6350d045a78b', ' 4df3873b-0e51-46ea-8dd7-e820fb7faf14', 'b9a15097-20af-4816-a7db-c6f248a76212', ' dc009ac8-363b-40ad-ba25-c11e4318ea8e', ' 4c586bb4-f71d-4b39-9df8-e38ac3f632a0', ' f13bf60f-17b5-4443-9513-c52e7a933bac', ' d973bcc5-eb9d-4c11-8017-c88828858da4', ' cd57090e-4d8f-40bd-acc7-f53844dd2414', ' 469ba500-91f0-4797-abc6-1d407af8939f', ' 882e7c95-935a-4eab-9ece-f85f5f7d0f4e', ' 69e871f3-1c88-4482-86ef-ca19191f2a

In [80]:
# Recomendações para usuários não logados: artigos populares
popular_articles = df_interactions.groupby('page').size().reset_index(name='popularity')
popular_articles = popular_articles.sort_values('popularity', ascending=False)
print("Artigos mais populares:")
print(popular_articles.head(3))

Artigos mais populares:
                                       page  popularity
5365   f6b5d170-48b9-4f8e-88d4-c84b6668f3bd          40
676    1f32787b-de2b-49be-8c20-ddaeae34cc22          36
5224   f0a78e58-ec7e-494c-9462-fbd6446a9a89          34


In [81]:
artifacts_models_path = "../artifacts/models"

joblib.dump(kmeans, f"{artifacts_models_path}/kmeans_model.pkl")

# Save cosine similarity matrix
with open(f"{artifacts_models_path}/cos_sim_matrix.pkl", "wb") as f:
    pickle.dump(cos_sim_df, f)

# Save user and article DataFrames for later use
df_users_sampled.to_pickle(f"{artifacts_models_path}/usuarios_preprocessados.pkl")
df_articles_sampled.to_pickle(f"{artifacts_models_path}/artigos_preprocessados.pkl")