##### Importando libs

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk


##### Declarando paths

In [5]:
root_path = os.getenv('HOST_PATH')
parquets_path = f"{root_path}/artifacts/parquets"
models_path = f"{root_path}/artifacts/models"
user_target_file = "usuarios_treino_preprocessados.parquet"
articles_target_file = "artigos_treino_preprocessados.parquet"

##### Carregando Parquets

In [6]:
df_users = pd.read_parquet(f"{parquets_path}/{user_target_file}")
df_articles = pd.read_parquet(f"{parquets_path}/{articles_target_file}")

##### Content-Based Filtering

Processamento de Texto com TF-IDF

In [7]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/heijimor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

True

In [8]:
stop_words_pt = stopwords.words('portuguese')

vectorizer = TfidfVectorizer(
  stop_words=stop_words_pt,
   max_df=0.85,
   min_df=2,
   ngram_range=(1,2)
)
tfidf_matrix = vectorizer.fit_transform(df_articles['title'] + " " + df_articles['caption'] + " " + df_articles['body'])

Construção do Modelo Baseado em Similaridade de Conteúdo

In [9]:
from sklearn.neighbors import NearestNeighbors

n_neighbors = 5
nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(tfidf_matrix)
distances, indices = nbrs.kneighbors(tfidf_matrix)
cosine_sim = 1 - distances

##### Collaborative Filtering

Criando Matriz de Interação Usuário-Item

In [10]:
ratings = []
for _, user in df_users.iterrows():
    for news_id in user["history"]:
        rating = (user["timeOnPageHistory"] * 0.4 + 
                  user["numberOfClicksHistory"] * 0.3 + 
                  user["scrollPercentageHistory"] * 0.3)  # Modifique os pesos conforme necessário
        ratings.append({"userId": user["userId"], "itemId": news_id, "rating": rating})
ratings_df = pd.DataFrame(ratings)

In [11]:
ratings_df.tail()

Unnamed: 0,userId,itemId,rating
112397,8c790612212b9d739494b13b092482aea653764bd64fba...,ea12f8aa-58b6-4ad8-97c7-2a4266f95c57,0.001779
112398,8c790612212b9d739494b13b092482aea653764bd64fba...,e5185368-70f8-4998-a738-ca22f300da7b,0.001779
112399,4640aa0608306e79a7a97603eaa761dd86e77d9e846261...,003e44b2-5658-42fd-90bb-d7d2f53004ea,0.000616
112400,20ffa9309bfe4d6b5128c8a870bec65dcfc1ba590a1b48...,6889f55d-8f47-47a9-bb3f-492d1e58b225,0.000441
112401,33ed0eb44989a7b6e3bca0cca275cbcd2945a18ecc7b5b...,9015cd8f-cd3d-4191-900f-b397d13470ad,0.004254


Aplicando Recência às Interações

In [12]:
recency_dict = df_articles["recency_score"].to_dict()

In [13]:
print("Valores do recency_dict:")
print(pd.Series(recency_dict).describe())

Valores do recency_dict:
count    40000.000000
mean         0.646739
std          0.271357
min          0.074051
25%          0.402524
50%          0.699772
75%          0.901225
max          1.000000
dtype: float64


In [14]:
user_item_matrix = ratings_df.pivot(index="userId", columns="itemId", values="rating").fillna(0)

In [15]:
# Antes da multiplicação
print("Antes da recência:")
print(user_item_matrix.describe())

Antes da recência:
itemId  0000352b-5a88-4a69-8f30-120da7169573  \
count                           7.970000e+03   
mean                            2.618697e-07   
std                             2.337838e-05   
min                             0.000000e+00   
25%                             0.000000e+00   
50%                             0.000000e+00   
75%                             0.000000e+00   
max                             2.087101e-03   

itemId  00030c8c-7fe8-4c44-aa6d-2c6a899c3ce5  \
count                           7.970000e+03   
mean                            1.429523e-07   
std                             1.276204e-05   
min                             0.000000e+00   
25%                             0.000000e+00   
50%                             0.000000e+00   
75%                             0.000000e+00   
max                             1.139330e-03   

itemId  00084600-ab5e-45b3-8fc0-8ab12ec74903  \
count                           7.970000e+03   
mean               

In [16]:
for item in user_item_matrix.columns:
    if item in recency_dict:
        normalized_score = (recency_dict[item] - min_score) / (max_score - min_score)
        user_item_matrix[item] *= normalized_score

In [17]:
print("Depois da recência:")
print(user_item_matrix.describe())

Depois da recência:
itemId  0000352b-5a88-4a69-8f30-120da7169573  \
count                           7.970000e+03   
mean                            2.618697e-07   
std                             2.337838e-05   
min                             0.000000e+00   
25%                             0.000000e+00   
50%                             0.000000e+00   
75%                             0.000000e+00   
max                             2.087101e-03   

itemId  00030c8c-7fe8-4c44-aa6d-2c6a899c3ce5  \
count                           7.970000e+03   
mean                            1.429523e-07   
std                             1.276204e-05   
min                             0.000000e+00   
25%                             0.000000e+00   
50%                             0.000000e+00   
75%                             0.000000e+00   
max                             1.139330e-03   

itemId  00084600-ab5e-45b3-8fc0-8ab12ec74903  \
count                           7.970000e+03   
mean              

Redução de Dimensionalidade com SVD

In [18]:
sparse_matrix = csr_matrix(user_item_matrix.values)

In [19]:
n_components = min(100, min(user_item_matrix.shape) - 1)  # Limite para evitar estouro de memória
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(sparse_matrix)


##### Gerando Pickle e Parquet (users x noticias)

In [20]:
ratings_df.to_parquet(f"{parquets_path}/interactions.parquet", index=False)

with open(f"{models_path}/user_item_matrix.pkl", 'wb') as f:
    pickle.dump(user_item_matrix, f)

with open(f"{models_path}/content_model.pkl", 'wb') as f:
    pickle.dump(cosine_sim, f)
    
with open(f"{models_path}/collaborative_model.pkl", "wb") as f:
    pickle.dump(svd, f)

##### Tetes rápidos das recomendações

In [21]:
def recommend_collaborative(user_id, top_n=10):
    """Gera recomendações usando o modelo colaborativo baseado em SVD."""
    if user_id not in user_item_matrix.index:
        return []

    user_index = user_item_matrix.index.get_loc(user_id)
    user_vector = user_factors[user_index]
    # Predição para todos os itens
    item_scores = np.dot(user_vector, svd.components_)
    # Ordenar por maior nota prevista
    sorted_items = np.argsort(item_scores)[::-1]
    # Obter os IDs dos itens recomendados
    recommended_items = user_item_matrix.columns[sorted_items][:top_n].tolist()
    return recommended_items

In [22]:
def recommend_content(user_id, top_n=5):
    if user_id not in ratings_df["userId"].values:
        return []  # Usuário não encontrado

    # Pegamos os artigos que o usuário já interagiu
    user_history = ratings_df[ratings_df["userId"] == user_id]["itemId"].tolist()

    # Pegamos os artigos mais similares com base na matriz de similaridade
    recommendations = []
    for article in user_history:
        if article in df_articles["page"].values:
            idx = df_articles[df_articles["page"] == article].index[0]
            similar_indices = cosine_sim[idx].argsort()[::-1][1:top_n+1]
            recommendations.extend(df_articles.iloc[similar_indices]["page"].tolist())

    return list(set(recommendations))

In [23]:
def recommend_combined(user_id, top_n=10, weight_collaborative=0.5, weight_content=0.5):
    """Combina recomendações dos dois modelos."""
    collab_recs = recommend_collaborative(user_id, top_n)
    content_recs = recommend_content(user_id, top_n)

    # Criar um ranking combinado
    scores = {}
    for idx, item in enumerate(collab_recs):
        scores[item] = scores.get(item, 0) + (weight_collaborative * (top_n - idx))

    for idx, item in enumerate(content_recs):
        scores[item] = scores.get(item, 0) + (weight_content * (top_n - idx))

    # Ordenar itens pelo score combinado
    final_recommendations = sorted(scores, key=scores.get, reverse=True)[:top_n]
    return final_recommendations

In [24]:
user_id = "a50f16d51820754a3db8281180950c3619c2b5a154926cb383de0fd023404756"
user_factors = svd.transform(sparse_matrix)
print("Recomendações colaborativas:", recommend_collaborative(user_id, top_n=6))
print("Recomendações por conteúdo:", recommend_content(user_id, top_n=6))
print("Recomendações combinadas:", recommend_combined(user_id, top_n=6))

Recomendações colaborativas: ['25d54fe7-248b-4e1d-89a1-20b36aba6159', '4e163060-2488-4627-a8ba-bbfc54a4d57d', '91dd61bf-7c49-4276-bba6-1db2804921a9', '283a0372-4e74-4239-936a-0a6225bda7d7', '1f32787b-de2b-49be-8c20-ddaeae34cc22', 'ea3b1e84-69a9-461a-92b4-8206903def33']
Recomendações por conteúdo: ['314afc37-b66a-4a76-9d73-d4b34e8d4117', 'eedb0817-cbe3-4959-aa84-017989fbd42a', '661064de-acc8-48bd-93ee-b83500e718c5', 'e210245c-a59b-4acb-9cde-6d02870d7c74']
Recomendações combinadas: ['25d54fe7-248b-4e1d-89a1-20b36aba6159', '314afc37-b66a-4a76-9d73-d4b34e8d4117', '4e163060-2488-4627-a8ba-bbfc54a4d57d', 'eedb0817-cbe3-4959-aa84-017989fbd42a', '91dd61bf-7c49-4276-bba6-1db2804921a9', '661064de-acc8-48bd-93ee-b83500e718c5']


In [25]:
user_vector1 = user_item_matrix.loc[user_id].values.reshape(1, -1)
user_factors1 = svd.transform(user_vector1)
item_scores1 = svd.inverse_transform(user_factors1)
top_items1 = np.argsort(item_scores1[0])[::-1][:6]
recommended_items = user_item_matrix.columns[top_items1].tolist()

print(recommended_items)

['25d54fe7-248b-4e1d-89a1-20b36aba6159', '4e163060-2488-4627-a8ba-bbfc54a4d57d', '91dd61bf-7c49-4276-bba6-1db2804921a9', '283a0372-4e74-4239-936a-0a6225bda7d7', '1f32787b-de2b-49be-8c20-ddaeae34cc22', 'ea3b1e84-69a9-461a-92b4-8206903def33']


In [27]:
recommendations = np.argsort(cosine_sim[0])[::-1][:6]
recommended_article_ids = df_articles['page'].iloc[recommendations].tolist()
recommended_article_ids

['d4f8487d-6f75-413d-9657-34aa3e1cd593',
 'eedb0817-cbe3-4959-aa84-017989fbd42a',
 '661064de-acc8-48bd-93ee-b83500e718c5',
 '314afc37-b66a-4a76-9d73-d4b34e8d4117',
 'e210245c-a59b-4acb-9cde-6d02870d7c74']