<a href="https://colab.research.google.com/github/jsebastianquiroga/PUJ_NLP_IA/blob/main/proyecto/inferencia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import gdown
import joblib

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained 'bert-base-uncased' tokenizer and encoder, then move the
# encoder onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# With two or more visible GPUs, wrap the encoder in DataParallel.
if torch.cuda.device_count() >= 2:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

def generate_embeddings(text):
    """Embed `text` with the module-level BERT model using mean pooling.

    The text is tokenized (padding and truncation enabled), the tensors are
    moved to `device`, and the model is run with gradient tracking disabled.
    The model's `last_hidden_state` is averaged over dim 1 and returned as a
    flattened 1-D NumPy array on the CPU.

    NOTE(review): callers in this notebook pass a single string; for a list of
    strings the final reshape(-1) would concatenate every row into one vector.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Move every tokenizer output tensor to the device the model lives on.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy().reshape(-1)

# Fetch the serialized k-NN model from Google Drive and deserialize it.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code; only keep this if the Drive artifact is trusted.
# NOTE(review): `knn_model` is reassigned later in this cell by a freshly
# fitted NearestNeighbors, so the object loaded here is not the one queried.
gdown.download(
    url='https://drive.google.com/uc?id=1tsSItM8O_VeDI3mjJ87gFObYFe7FdJ7q',
    output='knn_model.pkl',
    quiet=False,
)
knn_model = joblib.load('knn_model.pkl')

# Fetch the book table (Parquet) from Google Drive and read it into a DataFrame.
gdown.download(
    url='https://drive.google.com/uc?id=1E-KjJCjGno41DchF_lr55FKU7JkY-IEf',
    output='merged_df_embeddings.parquet',
    quiet=False,
)
merged_df = pd.read_parquet('merged_df_embeddings.parquet')

# Combine 'description' and 'categories' text, then generate embeddings.
# Missing values are replaced with empty strings first: concatenating a string
# with NaN yields NaN, and such a row would reach the tokenizer as a float
# instead of text.
merged_df['combined_text'] = (
    merged_df['description'].fillna('') + " " + merged_df['categories'].fillna('')
)
# One model forward pass per row; this is the expensive step of the cell.
merged_df['combined_embedding'] = merged_df['combined_text'].apply(generate_embeddings)

# Train the k-NN model on the stacked per-row embeddings.
# Six neighbours are requested so that five remain after recommend_books
# discards one of them.
# NOTE(review): this replaces the `knn_model` loaded from 'knn_model.pkl'
# earlier in this cell.
knn_model = NearestNeighbors(n_neighbors=6)
knn_model.fit(np.stack(merged_df['combined_embedding'].values))

def recommend_books(new_text):
    """Recommend the books whose embeddings are nearest to `new_text`.

    The query is embedded with `generate_embeddings` and looked up in the
    module-level `knn_model`; the matching rows of `merged_df` are returned
    nearest-first.

    As before, one fewer row is returned than the number of neighbours the
    model yields. The difference is which neighbour is left out: the nearest
    one is skipped only when it is an exact match for the query (distance of
    effectively zero, i.e. the query is itself a book in the index). For an
    ordinary free-text query the nearest neighbour is the best recommendation,
    so it is kept and the farthest neighbour is left out instead.

    Parameters:
        new_text: text describing what the reader is looking for.

    Returns:
        DataFrame with the 'Title', 'authors' and 'publishedDate' columns of
        the selected rows.
    """
    new_embedding = generate_embeddings(new_text)
    distances, indices = knn_model.kneighbors([new_embedding])
    neighbor_distances, neighbor_rows = distances[0], indices[0]

    # Same result count as the original slice `indices[0][1:]`.
    keep = max(len(neighbor_rows) - 1, 0)

    # Tolerance absorbs float round-off when the same text is embedded twice.
    # NOTE(review): assumes distinct texts are much farther apart than 1e-4.
    is_query_itself = keep > 0 and np.isclose(neighbor_distances[0], 0.0, atol=1e-4)
    if is_query_itself:
        selected_rows = neighbor_rows[1:]
    else:
        selected_rows = neighbor_rows[:keep]

    recommended_books = merged_df.iloc[selected_rows]
    return recommended_books[['Title', 'authors', 'publishedDate']]

# Example usage: query the recommender with a free-text reading preference.
recommend_books('I want to read something about fantasy and love')

Using device: cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading...
From: https://drive.google.com/uc?id=1tsSItM8O_VeDI3mjJ87gFObYFe7FdJ7q
To: /content/knn_model.pkl
100%|██████████| 417M/417M [00:10<00:00, 40.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1E-KjJCjGno41DchF_lr55FKU7JkY-IEf
To: /content/merged_df_embeddings.parquet
100%|██████████| 546M/546M [00:09<00:00, 58.3MB/s]


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import gdown
import joblib

# This cell requires CUDA: use the GPU when PyTorch can see one, otherwise stop.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    raise ValueError("No GPU found, please ensure you're running on a GPU instance.")

# Load the pretrained 'bert-base-uncased' tokenizer and encoder, then move the
# encoder onto the GPU.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# With two or more visible GPUs, wrap the encoder in DataParallel.
if torch.cuda.device_count() >= 2:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

def generate_embeddings(text):
    """Return a mean-pooled BERT embedding of `text` as a flat NumPy array.

    Steps: tokenize with the module-level `tokenizer` (padding and truncation
    on), move the resulting tensors to `device`, run `model` without gradient
    tracking, average `last_hidden_state` over dim 1, and flatten the result
    to one dimension on the CPU.

    NOTE(review): callers in this notebook pass a single string; for a list of
    strings the final reshape(-1) would concatenate every row into one vector.
    """
    batch = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    batch_on_device = dict((field, tensor.to(device)) for field, tensor in batch.items())

    with torch.no_grad():
        encoded = model(**batch_on_device)

    sentence_vector = encoded.last_hidden_state.mean(dim=1)
    return sentence_vector.cpu().numpy().reshape(-1)

# Fetch the serialized k-NN model from Google Drive and deserialize it.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code; only keep this if the Drive artifact is trusted.
gdown.download(
    url='https://drive.google.com/uc?id=1tsSItM8O_VeDI3mjJ87gFObYFe7FdJ7q',
    output='knn_model.pkl',
    quiet=False,
)
knn_model = joblib.load('knn_model.pkl')

# Fetch the book table (Parquet) from Google Drive and read it into a DataFrame.
gdown.download(
    url='https://drive.google.com/uc?id=1E-KjJCjGno41DchF_lr55FKU7JkY-IEf',
    output='merged_df_embeddings.parquet',
    quiet=False,
)
merged_df = pd.read_parquet('merged_df_embeddings.parquet')

# Define the recommendation function
def recommend_books_1(new_text):
    """Recommend up to five books for `new_text`, favouring well-reviewed ones.

    The query is embedded with `generate_embeddings` and its nearest
    neighbours are fetched from the module-level `knn_model`. Each neighbour's
    distance is multiplied by (1 - normalized review score), so higher-rated
    books move closer, and the neighbours are re-ranked by that adjusted
    distance.

    Fixes relative to the previous version:
      * The re-ranked order is mapped back to row positions of `merged_df`.
        Previously the argsort positions (0..k-1 within the neighbour list)
        were used directly as row positions, so the result always came from
        the first few rows of the table.
      * The top re-ranked candidate is no longer dropped unconditionally.
        Only an exact match for the query (raw distance of effectively zero)
        is removed, and that happens before re-ranking.
      * A table whose review scores are all equal no longer divides by zero;
        the scores then have no effect on the ranking.

    Side effect (unchanged): the 'normalized_score' column of `merged_df` is
    (re)written on every call.

    Parameters:
        new_text: text describing what the reader is looking for.

    Returns:
        DataFrame with the 'Title', 'authors' and 'publishedDate' columns of
        the selected rows, best adjusted match first.
    """
    new_embedding = generate_embeddings(new_text)
    distances, indices = knn_model.kneighbors([new_embedding])
    neighbor_distances = distances[0]
    neighbor_rows = indices[0]

    # Remove the query's own entry, if present, while the raw distances are
    # still available to identify it. Tolerance absorbs float round-off.
    # NOTE(review): assumes distinct texts are much farther apart than 1e-4.
    if len(neighbor_rows) > 1 and np.isclose(neighbor_distances[0], 0.0, atol=1e-4):
        neighbor_distances = neighbor_distances[1:]
        neighbor_rows = neighbor_rows[1:]

    # Normalize the review scores to [0, 1]
    scores = merged_df['review/score']
    min_score = scores.min()
    score_range = scores.max() - min_score
    if score_range > 0:
        merged_df['normalized_score'] = (scores - min_score) / score_range
    else:
        # All scores equal (or all missing): apply no score-based adjustment.
        merged_df['normalized_score'] = 0.0

    # Adjust distances based on review scores (vectorized over the neighbours).
    neighbor_scores = merged_df['normalized_score'].iloc[neighbor_rows].to_numpy()
    adjusted_distances = neighbor_distances * (1 - neighbor_scores)

    # Rank by adjusted distance; rows with a missing score produce NaN and are
    # placed last by argsort. A stable sort keeps the raw nearest-first order
    # among ties.
    ranking = np.argsort(adjusted_distances, kind='stable')

    # Translate ranking positions back into row positions of merged_df.
    top_rows = neighbor_rows[ranking[:5]]
    recommended_books = merged_df.iloc[top_rows]
    return recommended_books[['Title', 'authors', 'publishedDate']]

# Example usage: print the recommendations for a free-text reading preference.
print(recommend_books_1('I want to read something fantasy and love'))
