In [138]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [139]:
df_users = pd.read_csv("../data/files/treino/treino_parte1.csv")
df_articles = pd.read_csv("../data/itens/itens/itens-parte1.csv")

In [140]:
df_users.fillna({"history": "[]"}, inplace=True)  # Se history estiver vazio, substitui por lista vazia
df_articles.fillna("", inplace=True)

In [141]:
df_articles["issued"] = pd.to_datetime(df_articles["issued"]).astype(int) // 10**9
df_articles["modified"] = pd.to_datetime(df_articles["modified"]).astype(int) // 10**9


In [142]:
def convert_to_mean(value):
    if isinstance(value, str):
        values = [float(x.strip()) for x in value.split(",") if x.strip().replace('.', '', 1).isdigit()]
        return np.mean(values) if values else 0
    return value

cols_to_convert = ["numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory", "pageVisitsCountHistory"]
for col in cols_to_convert:
    df_users[col] = df_users[col].apply(convert_to_mean)

In [143]:
scaler = MinMaxScaler()
df_users[["historySize", "numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory", "pageVisitsCountHistory"]] = \
    scaler.fit_transform(df_users[["historySize", "numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory", "pageVisitsCountHistory"]])


In [144]:
df_users["history"] = df_users["history"].apply(lambda x: x.split(",") if isinstance(x, str) else [])


In [145]:
encoder = LabelEncoder()
df_users["userType"] = encoder.fit_transform(df_users["userType"])

In [146]:
df_users["userId"] = df_users["userId"].astype("category").cat.codes
df_articles["page"] = df_articles["page"].astype("category").cat.codes

In [147]:
df_users_sampled = df_users.sample(n=1000)  # Amostra de 1000 usuários
df_articles_sampled = df_articles.sample(n=1000)  # Amostra de 100 artigos

In [148]:
interaction_data_sampled = []
for _, row in df_users_sampled.iterrows():
    for article in row['history']:
        interaction_data_sampled.append((row['userId'], article))

df_interactions = pd.DataFrame(interaction_data_sampled, columns=["userId", "page"])


In [149]:
df_users_sampled.to_parquet("../data/output/usuarios_preprocessados.parquet", index=False)
df_articles_sampled.to_parquet("../data/output/artigos_preprocessados.parquet", index=False)
df_interactions.to_parquet("../data/output/interactions.parquet", index=False)

print("✅ Pré-processamento concluído e salvo em Parquet!")

✅ Pré-processamento concluído e salvo em Parquet!


##### Collaborative Filtering

In [150]:
interaction_matrix = df_interactions.pivot_table(index='userId', columns='page', aggfunc='size', fill_value=0)



In [151]:
cos_sim = cosine_similarity(interaction_matrix)
cos_sim_df = pd.DataFrame(cos_sim, index=interaction_matrix.index, columns=interaction_matrix.index)

In [152]:
print("Matriz de similaridade entre usuários:")
print(cos_sim_df.head())

Matriz de similaridade entre usuários:
userId  75     110    137    278    336    392    609    721    791    \
userId                                                                  
75        1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
110       0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
137       0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0   
278       0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0   
336       0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0   

userId     840    ...  99258  99270  99279  99491  99520  99642  99685  99745  \
userId            ...                                                           
75      0.000000  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
110     0.000000  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
137     0.000000  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
278     0.000000  ...    0.0    0.0    0.0  

##### K Means

In [153]:
kmeans = KMeans(n_clusters=3, random_state=42)
df_users['cluster'] = kmeans.fit_predict(df_users[['numberOfClicksHistory', 'timeOnPageHistory']])

# Exibindo clusters dos usuários
print("Clusters dos usuários:")
print(df_users[['userId', 'cluster']].head())

Clusters dos usuários:
   userId  cluster
0   97469        0
1   17012        0
2    4213        0
3   75845        0
4   90379        1


In [154]:
# Recomendações para um usuário logado
user_id = df_users_sampled['userId'].iloc[0]  # Exemplo de usuário logado (pode ser qualquer userId)
user_similarity = cos_sim_df[user_id]
similar_users = user_similarity.sort_values(ascending=False).index[1:4]
print(f"Usuários mais similares ao usuário {user_id}:")
print(similar_users)

Usuários mais similares ao usuário 85185:
Index([76907, 79765, 13294], dtype='int64', name='userId')


##### Sugestões de artigos baseadas em similaridade (para o usuário logado)


In [155]:
recommended_articles = []
for similar_user in similar_users:
    similar_user_articles = df_interactions[df_interactions['userId'] == similar_user]['page']
    recommended_articles.extend(similar_user_articles)

In [156]:
# Exibindo os artigos recomendados
recommended_articles = list(set(recommended_articles))  # Removendo duplicatas
print("Artigos recomendados:")
print(recommended_articles)

Artigos recomendados:
[' e1a89c3a-1649-486f-a964-ecfb33821e77', ' ef3da0ac-fcc0-4595-ae62-8620ed348425', ' dcbcfc8d-6010-41df-a39d-008b0c7bc85d', ' 4c63d7cd-4902-4ffb-9b94-578b1b2151f0', ' f00b5f62-6785-4132-a066-c27de1b31159', ' 74989b80-d633-4d28-9292-0be71c309ec7', ' 3b33c0e8-8b98-49f8-953b-20b8111a1c3d', ' 2230ede8-9909-467f-8cb7-0ae75cc1b35c', ' 34ff0824-cbc3-4277-a5a9-406fbc6768dc', ' 8845c618-159f-4f85-8b58-4f92a5d9853c', ' ab32543f-6f40-42e5-a357-537af0fa2947', ' 5d8388f3-736d-451b-a794-d826453e12e1', ' 4ccb6ed7-d196-4284-94d6-fb3dc8a53d61', ' 861f3ebb-292c-4a4e-90f5-5e0958ea0ff6', ' ca361779-0059-451e-9369-c2dfaa3cc971', ' 224ac679-9143-4e31-8465-4ee3c6e559af', 'a7ab0f81-5509-4898-8090-f838953a4bdd', ' bc17d65d-aea1-4c9c-82d5-7528730e1d0c', ' 27453ffd-5354-440a-a6b1-0b56eb6b6821', ' 8e8d5bc1-e4b3-4e99-912a-3b68dfc8964f', ' 0ad0c4b9-81cf-4d9d-9d35-1c867af65f8c', ' 38578b5c-4509-49df-ad79-c62cc914e4a5', ' 3f201d73-9c2d-4cd1-9e48-ee0438cf179c', ' 56c980ff-4346-4723-9cfc-7438d1b00

In [157]:
# Recomendações para usuários não logados: artigos populares
popular_articles = df_interactions.groupby('page').size().reset_index(name='popularity')
popular_articles = popular_articles.sort_values('popularity', ascending=False)
print("Artigos mais populares:")
print(popular_articles.head(3))

Artigos mais populares:
                                       page  popularity
4965   d2593c3d-2347-40d9-948c-b6065e8459a9          38
5719   f0a78e58-ec7e-494c-9462-fbd6446a9a89          37
755    1f32787b-de2b-49be-8c20-ddaeae34cc22          34


In [158]:
# Save KMeans model
joblib.dump(kmeans, "../data/output/kmeans_model.pkl")

# Save cosine similarity matrix
with open("../data/output/cos_sim_matrix.pkl", "wb") as f:
    pickle.dump(cos_sim_df, f)

# Save user and article DataFrames for later use
df_users_sampled.to_pickle("../data/output/usuarios_preprocessados.pkl")
df_articles_sampled.to_pickle("../data/output/artigos_preprocessados.pkl")

NameError: name 'pickle' is not defined