In [159]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [160]:
user_csv_path = "../data/files/treino"
user_target_csv_file = "treino_parte1.csv"

articles_csv_path = "../data/itens/itens"
articles_target_csv_file = "itens-parte1.csv"

df_users = pd.read_csv(f"{user_csv_path}/{user_target_csv_file}")
df_articles = pd.read_csv(f"{articles_csv_path}/{articles_target_csv_file}")

In [161]:
df_users.fillna({"history": "[]"}, inplace=True)  # Se history estiver vazio, substitui por lista vazia
df_articles.fillna("", inplace=True)

In [162]:
df_articles["issued"] = pd.to_datetime(df_articles["issued"]).astype(int) // 10**9
df_articles["modified"] = pd.to_datetime(df_articles["modified"]).astype(int) // 10**9


In [163]:
def convert_to_mean(value):
    if isinstance(value, str):
        values = [float(x.strip()) for x in value.split(",") if x.strip().replace('.', '', 1).isdigit()]
        return np.mean(values) if values else 0
    return value

cols_to_convert = ["numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory", "pageVisitsCountHistory"]
for col in cols_to_convert:
    df_users[col] = df_users[col].apply(convert_to_mean)

In [164]:
scaler = MinMaxScaler()
df_users[[
    "historySize",
    "numberOfClicksHistory",
    "timeOnPageHistory",
    "scrollPercentageHistory",
    "pageVisitsCountHistory"
]] = \
    scaler.fit_transform(
        df_users[
            [
                "historySize",
                "numberOfClicksHistory",
                "timeOnPageHistory",
                "scrollPercentageHistory",
                "pageVisitsCountHistory"
            ]
        ]
    )


In [165]:
df_users["history"] = df_users["history"].apply(lambda x: x.split(",") if isinstance(x, str) else [])


In [166]:
encoder = LabelEncoder()
df_users["userType"] = encoder.fit_transform(df_users["userType"])

In [167]:
df_users["userId"] = df_users["userId"].astype("category").cat.codes
df_articles["page"] = df_articles["page"].astype("category").cat.codes

In [168]:
df_users_sampled = df_users.sample(n=1000)  # Amostra de 1000 usuários
df_articles_sampled = df_articles.sample(n=1000)  # Amostra de 1000 artigos

In [169]:
interaction_data_sampled = []
for _, row in df_users_sampled.iterrows():
    for article in row['history']:
        interaction_data_sampled.append((row['userId'], article))

df_interactions = pd.DataFrame(interaction_data_sampled, columns=["userId", "page"])


In [170]:
df_users_sampled.to_parquet("../artifacts/parquets/usuarios_preprocessados.parquet", index=False)
df_articles_sampled.to_parquet("../artifacts/parquets/artigos_preprocessados.parquet", index=False)
df_interactions.to_parquet("../artifacts/parquets/interactions.parquet", index=False)

print("✅ Pré-processamento concluído e salvo em Parquet!")

✅ Pré-processamento concluído e salvo em Parquet!


##### Collaborative Filtering

In [171]:
interaction_matrix = df_interactions.pivot_table(index='userId', columns='page', aggfunc='size', fill_value=0)



In [172]:
cos_sim = cosine_similarity(interaction_matrix)
cos_sim_df = pd.DataFrame(cos_sim, index=interaction_matrix.index, columns=interaction_matrix.index)

In [173]:
print("Matriz de similaridade entre usuários:")
print(cos_sim_df.head())

Matriz de similaridade entre usuários:
userId  103    424    563    686    789    813    1067   1078   1111   1375   \
userId                                                                         
103       1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
424       0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
563       0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
686       0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0   
789       0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0   

userId  ...  98507  98574  98760  98857  98967  99224  99650  99732  99803  \
userId  ...                                                                  
103     ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
424     ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
563     ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
686     ..

##### K Means

In [182]:
kmeans = KMeans(n_clusters=3, random_state=42)
df_users['cluster'] = kmeans.fit_predict(df_users[['numberOfClicksHistory', 'timeOnPageHistory']])

# Exibindo clusters dos usuários
print("Clusters dos usuários:")
print(df_users[['userId', 'cluster']].head())

Clusters dos usuários:
   userId  cluster
0   97469        0
1   17012        0
2    4213        0
3   75845        0
4   90379        1


In [180]:
# Recomendações para um usuário logado
user_id = df_users_sampled['userId'].iloc[0]  # Exemplo de usuário logado (pode ser qualquer userId)
user_similarity = cos_sim_df[user_id]
similar_users = user_similarity.sort_values(ascending=False).index[1:4]
print(f"Usuários mais similares ao usuário {user_id}:")
print(similar_users)

Usuários mais similares ao usuário 69817:
Index([45314, 67821, 68834], dtype='int64', name='userId')


##### Sugestões de artigos baseadas em similaridade (para o usuário logado)


In [176]:
recommended_articles = []
for similar_user in similar_users:
    similar_user_articles = df_interactions[df_interactions['userId'] == similar_user]['page']
    recommended_articles.extend(similar_user_articles)

In [177]:
# Exibindo os artigos recomendados
recommended_articles = list(set(recommended_articles))  # Removendo duplicatas
print("Artigos recomendados:")
print(recommended_articles)

Artigos recomendados:
[' 8864af44-c4a2-4862-82de-be58579ac4f3', ' d8de2192-12b4-4fb5-97d8-8a6360a10e78', ' 1c27cf97-b20c-4e40-b1f1-288b721517b3', ' 45fcb63b-d80b-42af-8aa1-988149bcd1da', ' 54cb252a-78b5-463d-b31f-f54f63d85a29', ' 7594da99-d606-4338-a373-710a7dec776a', '5ecc4b24-ff5d-4f8c-8449-53de3b34d213', ' e3cdb277-ad80-4025-b5de-cd98cc67d23d', ' e93f13f3-466e-4a34-8cb3-09460b909d29', ' 8e0884e2-50ba-44c0-87b9-d64d913288af', ' 8be03387-6c44-43cc-93ad-65824af9c22a', ' af617145-c750-4973-81eb-a1ff00e6ed1c', ' 46fd2c8e-9759-4c39-91e4-94255a55c994', ' 87144e30-a396-47ed-8168-8c7992503192', ' 61e07f64-cddf-46f2-b50c-ea0a39c22050', ' 3936f6c7-3f6c-4205-8cda-3178160b6cd4', ' bf44b8ea-c737-4eec-a51b-01006202f303', ' 44afad69-6dff-4068-9e10-3092b659e6d4', ' 12c44ec3-3624-4c0d-990d-27c38e7a5848', ' 788b88f5-d957-4152-8a8e-4c8656405b24', ' 6c92672d-9c82-4fb1-b07f-c701a90e2fd2', ' ca35decb-6955-44b5-b2fa-eb4529c3cb5c', ' 5e426801-9a31-40b5-a492-26db7545dccb', ' cbc832d6-af68-41bf-91d4-8740d71c5

In [178]:
# Recomendações para usuários não logados: artigos populares
popular_articles = df_interactions.groupby('page').size().reset_index(name='popularity')
popular_articles = popular_articles.sort_values('popularity', ascending=False)
print("Artigos mais populares:")
print(popular_articles.head(3))

Artigos mais populares:
                                       page  popularity
4392   d2593c3d-2347-40d9-948c-b6065e8459a9          49
641    1f32787b-de2b-49be-8c20-ddaeae34cc22          41
429    15281e10-e6bc-48bc-9b1b-94402f83699b          34


In [179]:
artifacts_models_path = "../artifacts/models"


joblib.dump(kmeans, f"{artifacts_models_path}/kmeans_model.pkl")

# Save cosine similarity matrix
with open(f"{artifacts_models_path}/cos_sim_matrix.pkl", "wb") as f:
    pickle.dump(cos_sim_df, f)

# Save user and article DataFrames for later use
df_users_sampled.to_pickle(f"{artifacts_models_path}/usuarios_preprocessados.pkl")
df_articles_sampled.to_pickle(f"{artifacts_models_path}/artigos_preprocessados.pkl")

In [None]:
df_users.to_parquet("users1.parquet")
