In [1]:
import pandas as pd
import glob
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix

from sklearn.metrics.pairwise import cosine_similarity



In [2]:
csv_file = "../data/files/treino/treino_parte1.csv"
df = pd.read_csv(csv_file)

parquet_file = csv_file.replace('.csv', '.parquet')
df.to_parquet(parquet_file, index=False)

columns_to_split = [
    "history", "timestampHistory", "numberOfClicksHistory", "timeOnPageHistory", 
    "scrollPercentageHistory", "pageVisitsCountHistory", "timestampHistory_new"
]
df[columns_to_split] = df[columns_to_split].apply(lambda col: col.str.split(','))
df = df.explode(columns_to_split).reset_index(drop=True)
# df['timestampHistory'] = pd.to_datetime(df['timestampHistory'])
df.head(5)

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,c8aab885-433d-4e46-8066-479f40ba7fb2,1657146417045,76,20380,50.3,2,1657146417045
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,68d2039c-c9aa-456c-ac33-9b2e8677fba7,1657146605778,38,21184,18.18,1,1657146605778
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,13e423ce-1d69-4c78-bc18-e8c8f7271964,1657146698738,41,35438,16.46,1,1657146698738
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,3325b5a1-979a-4cb3-82b6-63905c9edbe8,1656684240278,7,6049,25.35,1,1656684240278
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,fe856057-f97d-419f-ab1c-97c5c3e0719c,1656761266729,80,210489,45.66,1,1656761266729


In [3]:
df_articles = pd.read_csv("../data/itens/itens/itens-parte2.csv")
df_articles['page'] = df_articles['page'].astype(str)
df_articles.head()

Unnamed: 0,page,url,issued,modified,title,body,caption
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad..."
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...
3,5cc3bd27-80c7-457d-a807-2e8e7fddf031,http://g1.globo.com/ap/amapa/noticia/2020/11/0...,2020-11-06 12:54:00+00:00,2020-11-12 21:22:52+00:00,FOTOS: Apagão no Amapá,"Moradores da capital do Amapá, em Macapá, faze...",Incêndio em subestação de energia deixa 13 dos...
4,d6956177-db96-42f5-9f68-dd0d6e930661,http://g1.globo.com/ap/amapa/noticia/2019/05/2...,2019-05-27 13:43:03+00:00,2019-05-27 18:19:06+00:00,Profissionais da educação no AP paralisam ativ...,Profissionais da educação paralisam atividades...,Ato comprometeu aulas em escolas nesta segunda...


In [7]:
interaction_matrix = lil_matrix((len(df), len(df_articles)))
article_id_to_idx = {page: idx for idx, page in enumerate(df_articles['page'])}
for i, row in df.iterrows():
    user_idx = i  # O índice do usuário na matriz
    visited_pages = row['history'].split(',')  # Dividir as páginas/artigos visitados

    for page in visited_pages:
        if page in article_id_to_idx:
            article_idx = article_id_to_idx[page]
            interaction_matrix[user_idx, article_idx] = 1

interaction_sparse = csr_matrix(interaction_matrix)

# Compute user similarity using sparse matrix (efficient)
user_similarity_sparse = cosine_similarity(interaction_sparse, dense_output=False)
similarity_df = pd.DataFrame.sparse.from_spmatrix(
    user_similarity_sparse,
    index=df['userId'],  # Usa os IDs reais dos usuários
    columns=df['history']
)
similarity_df.head()
# user_article_interaction = pd.DataFrame(0, index=df['userId'], columns=df_articles['page'])
# user_article_interaction


userId,f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069,f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069,f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,...,53c8ecb6e8e52bcc33fbc3fbf74c67a3d562ebdf59625f1bdb6a18db0e6b19a8,b3a4321ec8c23fb2ceb4e0e5d659e892d91c0250a69a2f5c3392c6a2b0168cba,b3a4321ec8c23fb2ceb4e0e5d659e892d91c0250a69a2f5c3392c6a2b0168cba,b3a4321ec8c23fb2ceb4e0e5d659e892d91c0250a69a2f5c3392c6a2b0168cba,c9b0cc1f5ab28c7938296d2eaecee1632e13379cf7167e4b37c5e9602e46d2f6,c9b0cc1f5ab28c7938296d2eaecee1632e13379cf7167e4b37c5e9602e46d2f6,c9b0cc1f5ab28c7938296d2eaecee1632e13379cf7167e4b37c5e9602e46d2f6,a514cbbff28c52eee1ce6d28935399876275be807fd1d13856731ab814b19a41,a514cbbff28c52eee1ce6d28935399876275be807fd1d13856731ab814b19a41,a514cbbff28c52eee1ce6d28935399876275be807fd1d13856731ab814b19a41
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
f98d1132f60d46883ce49583257104d15ce723b3bbda2147c1e31ac76f0bf069,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2c1080975e257ed630e26679edbe4d5c850c65f3e09f655798b0bba9b42f2110,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_merged = df.merge(df_articles, left_on='history', right_on='page', how='left')
df_merged.head()

Unnamed: 0,userId,userType,historySize,history,timestampHistory,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,timestampHistory_new,page,url,issued,modified,title,body,caption
0,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,c8aab885-433d-4e46-8066-479f40ba7fb2,1657146417045,76,20380,50.3,2,1657146417045,c8aab885-433d-4e46-8066-479f40ba7fb2,http://g1.globo.com/sc/santa-catarina/noticia/...,2022-03-19 21:03:21+00:00,2022-03-19 21:03:21+00:00,"Você viu? 'Musa das Estradas' faz vídeo de pé,...",Caminhoneira Aline Füchter em pé em casa\nRepr...,Caminhoneira Aline Füchter ficou em pé em fren...
1,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,68d2039c-c9aa-456c-ac33-9b2e8677fba7,1657146605778,38,21184,18.18,1,1657146605778,,,,,,,
2,f98d1132f60d46883ce49583257104d15ce723b3bbda21...,Non-Logged,3,13e423ce-1d69-4c78-bc18-e8c8f7271964,1657146698738,41,35438,16.46,1,1657146698738,,,,,,,
3,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,3325b5a1-979a-4cb3-82b6-63905c9edbe8,1656684240278,7,6049,25.35,1,1656684240278,,,,,,,
4,2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...,Non-Logged,60,fe856057-f97d-419f-ab1c-97c5c3e0719c,1656761266729,80,210489,45.66,1,1656761266729,,,,,,,


In [6]:
df.shape

(1426291, 10)