##### Importando libs

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk


##### Declarando paths

In [2]:
root_path = os.getenv('HOST_PATH')
parquets_path = f"{root_path}/artifacts/parquets"
models_path = f"{root_path}/artifacts/models"
user_target_file = "usuarios_treino_preprocessados.parquet"
articles_target_file = "artigos_treino_preprocessados.parquet"

##### Carregando Parquets

In [3]:
df_users = pd.read_parquet(f"{parquets_path}/{user_target_file}")
df_articles = pd.read_parquet(f"{parquets_path}/{articles_target_file}")

In [4]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

True

In [5]:
stop_words_pt = stopwords.words('portuguese')

vectorizer = TfidfVectorizer(
  stop_words=stop_words_pt,
   max_df=0.85,
   min_df=2,
   ngram_range=(1,2)
)
tfidf_matrix = vectorizer.fit_transform(df_articles['title'] + " " + df_articles['caption'] + " " + df_articles['body'])

In [6]:
from sklearn.neighbors import NearestNeighbors

n_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(tfidf_matrix)
distances, indices = nbrs.kneighbors(tfidf_matrix)
cosine_sim = 1 - distances

Filtragem Colaborativa

In [7]:
ratings = []
for _, user in df_users.iterrows():
    for news_id in user["history"]:
        rating = (user["timeOnPageHistory"] * 0.4 + 
                  user["numberOfClicksHistory"] * 0.3 + 
                  user["scrollPercentageHistory"] * 0.3)  # Modifique os pesos conforme necessário
        ratings.append({"userId": user["userId"], "itemId": news_id, "rating": rating})
ratings_df = pd.DataFrame(ratings)

In [8]:
user_item_matrix = ratings_df.pivot(index="userId", columns="itemId", values="rating").fillna(0)

In [9]:
sparse_matrix = csr_matrix(user_item_matrix.values)

SVD

In [10]:
n_components = min(100, min(user_item_matrix.shape) - 1)  # Limite para evitar estouro de memória
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(sparse_matrix)


##### Gerando Pickle e Parquet (users x noticias)

In [11]:
ratings_df.to_parquet(f"{parquets_path}/interactions.parquet", index=False)

with open(f"{models_path}/user_item_matrix.pkl", 'wb') as f:
    pickle.dump(user_item_matrix, f)

with open(f"{models_path}/content_model.pkl", 'wb') as f:
    pickle.dump(cosine_sim, f)
    
with open(f"{models_path}/collaborative_model.pkl", "wb") as f:
    pickle.dump(svd, f)

##### Retornando recomendações

In [12]:
def recommend_collaborative(user_id, top_n=10):
    """Gera recomendações usando o modelo colaborativo baseado em SVD."""
    if user_id not in user_item_matrix.index:
        return []

    user_index = user_item_matrix.index.get_loc(user_id)
    user_vector = user_factors[user_index]
    # Predição para todos os itens
    item_scores = np.dot(user_vector, svd.components_)
    # Ordenar por maior nota prevista
    sorted_items = np.argsort(item_scores)[::-1]
    # Obter os IDs dos itens recomendados
    recommended_items = user_item_matrix.columns[sorted_items][:top_n].tolist()
    return recommended_items

In [13]:
def recommend_content(user_id, top_n=5):
    if user_id not in ratings_df["userId"].values:
        return []  # Usuário não encontrado

    # Pegamos os artigos que o usuário já interagiu
    user_history = ratings_df[ratings_df["userId"] == user_id]["itemId"].tolist()

    # Pegamos os artigos mais similares com base na matriz de similaridade
    recommendations = []
    for article in user_history:
        if article in df_articles["page"].values:  # Certifica que o artigo existe
            idx = df_articles[df_articles["page"] == article].index[0]
            similar_indices = cosine_sim[idx].argsort()[::-1][1:top_n+1]  # Pegar os mais similares
            recommendations.extend(df_articles.iloc[similar_indices]["page"].tolist())

    return list(set(recommendations))  # Remove duplicatas

# print("Recomendações por conteúdo:", recommend_content('a50f16d51820754a3db8281180950c3619c2b5a154926cb383de0fd023404756', top_n=5))


In [14]:
def recommend_combined(user_id, top_n=10, weight_collaborative=0.5, weight_content=0.5):
    """Combina recomendações dos dois modelos."""
    collab_recs = recommend_collaborative(user_id, top_n)
    content_recs = recommend_content(user_id, top_n)

    # Criar um ranking combinado
    scores = {}
    for idx, item in enumerate(collab_recs):
        scores[item] = scores.get(item, 0) + (weight_collaborative * (top_n - idx))

    for idx, item in enumerate(content_recs):
        scores[item] = scores.get(item, 0) + (weight_content * (top_n - idx))

    # Ordenar itens pelo score combinado
    final_recommendations = sorted(scores, key=scores.get, reverse=True)[:top_n]
    return final_recommendations

In [15]:
user_id = "a50f16d51820754a3db8281180950c3619c2b5a154926cb383de0fd023404756"
user_factors = svd.transform(sparse_matrix)
print("Recomendações colaborativas:", recommend_collaborative(user_id, top_n=6))
print("Recomendações por conteúdo:", recommend_content(user_id, top_n=6))
print("Recomendações combinadas:", recommend_combined(user_id, top_n=6))

Recomendações colaborativas: ['25d54fe7-248b-4e1d-89a1-20b36aba6159', '4e163060-2488-4627-a8ba-bbfc54a4d57d', '91dd61bf-7c49-4276-bba6-1db2804921a9', '283a0372-4e74-4239-936a-0a6225bda7d7', '1f32787b-de2b-49be-8c20-ddaeae34cc22', 'ea3b1e84-69a9-461a-92b4-8206903def33']
Recomendações por conteúdo: ['eedb0817-cbe3-4959-aa84-017989fbd42a', 'e210245c-a59b-4acb-9cde-6d02870d7c74', '314afc37-b66a-4a76-9d73-d4b34e8d4117', '661064de-acc8-48bd-93ee-b83500e718c5']
Recomendações combinadas: ['25d54fe7-248b-4e1d-89a1-20b36aba6159', 'eedb0817-cbe3-4959-aa84-017989fbd42a', '4e163060-2488-4627-a8ba-bbfc54a4d57d', 'e210245c-a59b-4acb-9cde-6d02870d7c74', '91dd61bf-7c49-4276-bba6-1db2804921a9', '314afc37-b66a-4a76-9d73-d4b34e8d4117']


In [16]:
user_vector1 = user_item_matrix.loc[user_id].values.reshape(1, -1)
user_factors1 = svd.transform(user_vector1)
item_scores1 = svd.inverse_transform(user_factors1)
top_items1 = np.argsort(item_scores1[0])[::-1][:6]
recommended_items = user_item_matrix.columns[top_items1].tolist()

print(recommended_items)

['25d54fe7-248b-4e1d-89a1-20b36aba6159', '4e163060-2488-4627-a8ba-bbfc54a4d57d', '91dd61bf-7c49-4276-bba6-1db2804921a9', '283a0372-4e74-4239-936a-0a6225bda7d7', '1f32787b-de2b-49be-8c20-ddaeae34cc22', 'ea3b1e84-69a9-461a-92b4-8206903def33']


In [17]:
popular_articles = ratings_df.sort_values(by="rating", ascending=False)["itemId"]
popular_articles.head()

22391    855d20b7-53f2-4678-a10f-55402d085018
68622    54915fbc-682c-4b69-a0b9-7546b603c55d
68623    e63ef205-d49a-4cd6-ac85-059535e3d68c
68624    4fb755e0-6051-411e-a54d-3c8032fc2a22
68625    057ca1d3-db06-44cf-bc9d-2db348d31742
Name: itemId, dtype: object