In [1]:
from google.colab import drive
import sys
import os

drive.mount('/content/drive')

# Personalized path to your project directory
path = '/content/drive/MyDrive/cod/LEA3_RecSys' # Replace with your actual path

# Add the path to sys.path
sys.path.append(path)
os.chdir(path)



Mounted at /content/drive


In [222]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib




In [223]:

#############################################
#### conectar_base_de_Datos#################
############################################

conn=sql.connect('data/db_books2')
cur=conn.cursor()

cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())


[('books',), ('book_ratings',), ('users',), ('usuarios_sel',), ('books_sel',), ('ratings_final',), ('users_final',), ('books_final',), ('full_ratings',)]


In [4]:
#######################################################################
#### 3 Sistema de recomendación basado en contenido KNN #################
#### Con base en todo lo visto por el usuario #######################
#######################################################################



##### cargar data frame escalado y con dummies ###
books_dum2= joblib.load('salidas/books_dum2.joblib')

### carga data frame normal que tiene nombres de libros
books=pd.read_sql('select * from books_final', conn )
#books['year_pub']=books.year_pub.astype('int')



#### seleccionar usuario para recomendaciones ####
usuarios=pd.read_sql('select distinct (user_id) as user_id from ratings_final',conn)

user_id=31226 ### para ejemplo manual


def recomendar(user_id=list(usuarios['user_id'].value_counts().index)):

    ###seleccionar solo los ratings del usuario seleccionado
    ratings=pd.read_sql('select *from ratings_final where user_id=:user',conn, params={'user':user_id})

    ###convertir ratings del usuario a array
    l_books_r=ratings['isbn'].to_numpy()

    ###agregar la columna de isbn y titulo del libro a dummie para filtrar y mostrar nombre
    books_dum2[['isbn','book_title']]=books[['isbn','book_title']]

    ### filtrar libros calificados por el usuario
    books_r=books_dum2[books_dum2['isbn'].isin(l_books_r)]

    ## eliminar columna nombre e isbn
    books_r=books_r.drop(columns=['isbn','book_title'])
    books_r["indice"]=1 ### para usar group by y que quede en formato pandas tabla de centroide
    ##centroide o perfil del usuario
    centroide=books_r.groupby("indice").mean()


    ### filtrar libros no leídos
    books_nr=books_dum2[~books_dum2['isbn'].isin(l_books_r)]
    ## eliminbar nombre e isbn
    books_nr=books_nr.drop(columns=['isbn','book_title'])

    ### entrenar modelo
    model=neighbors.NearestNeighbors(n_neighbors=11, metric='cosine')
    model.fit(books_nr)
    dist, idlist = model.kneighbors(centroide)

    ids=idlist[0] ### queda en un array anidado, para sacarlo
    recomend_b=books.loc[ids][['book_title','isbn']]
    leidos=books[books['isbn'].isin(l_books_r)][['book_title','isbn']]

    return recomend_b


recomendar(52853)


print(interact(recomendar))

interactive(children=(Dropdown(description='user_id', options=(79961, 31226, 52853, 167471, 119517, 8066, 2820…

<function recomendar at 0x79a6bdab7420>


In [5]:
#instalar lightfm para sistema de recomendación de filtro colaborativo


!pip install lightfm


Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/316.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831164 sha256=e51451ae29b8479c3b0c77396c6e97b5540bfe66500be6e989b8681c9ee8fefc
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [246]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from sklearn.model_selection import GridSearchCV

In [225]:
ratings = pd.read_sql('SELECT * FROM full_ratings ', conn)

#Crear data set en formato lightfm, primero lista de usuarios unicos y items únicos
dataset_train = Dataset()
dataset_test = Dataset()

all_unique_users = ratings['user_id'].unique()
all_unique_items = ratings['isbn'].unique()

dataset_train.fit(users=all_unique_users, items=all_unique_items)
dataset_test.fit(users=all_unique_users, items=all_unique_items)



In [226]:

## verificar que el data set creado tenga los mismos registros que base original
cur.execute('SELECT count(distinct user_id), count (distinct isbn) FROM ratings_final WHERE book_rating > 0')
print(cur.fetchall())

num_users, num_items = dataset_train.interactions_shape()
print(num_users, num_items)




[(2151, 1388)]
2427 1377


In [227]:
#### separar en train y test
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

### para trabajar con el paquete light FM, se deben crear datos que crean consecutivos para cada usuario e item

train_interactions_list = [(row['user_id'], row['isbn'], row['book_rating']) for index, row in train_df.iterrows()]
train_interactions, train_weights = dataset_train.build_interactions(train_interactions_list)

test_interactions_list = [(row['user_id'], row['isbn'], row['book_rating']) for index, row in test_df.iterrows()]
test_interactions, test_weights = dataset_test.build_interactions(test_interactions_list)


In [228]:
print(f"Shape of Interactions Matrix: {train_interactions.shape}")
print(f"Shape of Weights Matrix: {train_weights.shape}")
print(f"Shape of Interactions Matrix: {test_interactions.shape}")
print(f"Shape of Weights Matrix: {test_weights.shape}")

Shape of Interactions Matrix: (2427, 1377)
Shape of Weights Matrix: (2427, 1377)
Shape of Interactions Matrix: (2427, 1377)
Shape of Weights Matrix: (2427, 1377)


In [230]:
#Este código es de verificación para mirar los datos cómo se estructuran
train_int_array=train_interactions.toarray()
train_weights_array=train_weights.toarray()

test_int_array=test_interactions.toarray()
test_weights_array=test_weights.toarray()

print(train_int_array)
print(test_int_array)

print(train_weights_array)
print(test_weights_array)



[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [257]:
loss_model='warp'  ## 'logistic', 'bpr', 'warp', 'warp-kos'

model=LightFM(loss=loss_model, random_state=42)
model.fit(train_interactions, epochs=10, verbose=True, sample_weight=train_weights)

### auc por usuario
train_auc = auc_score(model, train_interactions)
test_auc = auc_score(model, test_interactions)


print('AUC: train %.2f, test %.2f.' % (train_auc.mean(), test_auc.mean()))




Epoch: 100%|██████████| 10/10 [00:00<00:00, 24.10it/s]


AUC: train 0.70, test 0.62.


In [244]:
def recommendation(model, data, original_user_id, conn,k):

  data=dataset_train
  original_user_id=52853

  ## libros no leidos y códigos
  df_nr_books=pd.read_sql(f'select * from full_ratings where user_id<>{original_user_id}',conn)
  isbn_nr_books=df_nr_books['isbn'].values
  item_id_nr=[value for key, value in data.mapping()[2].items() if value not in isbn_nr_books]

  uid_index=data.mapping()[0][original_user_id] ## id usuario según modelo

  scores=model.predict(uid_index, item_id_nr)
  sorted_indices = np.argsort(-scores).tolist()

  top_items = [key for key, value in data.mapping()[2].items() if value in sorted_indices[:k]]
  recommended=df_nr_books[df_nr_books['isbn'].isin(top_items)][['isbn', 'book_title']]
  recommended.drop_duplicates(inplace=True)

  return recommended


array([-0.95442706, -0.39551088, -0.35809022, ..., -0.9214462 ,
       -1.9105419 , -1.95727   ], dtype=float32)

In [243]:
recommendation(model, dataset_train,52853, conn, 4)

Unnamed: 0,isbn,book_title
20,0316666343,The Lovely Bones: A Novel
66,044021145X,The Firm
120,0060928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel
502,0446310786,To Kill a Mockingbird
