In [3]:

import sys
from pathlib import Path

# Root directory of the project checkout.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run as-is on a machine that has the repository at this location.
project_root = Path("/home/igor/github-projects/book-review")

# Register the project root on sys.path (once) so the `src` package below
# can be imported from this notebook.
if all(entry != str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

import polars as pl
from src.paths import * 
from src.config import *

# Read the modelling dataset from the feature store directory.
# FEATURE_STORE_DIR is expected to come from one of the star imports above.
model_data = pl.read_parquet(
    FEATURE_STORE_DIR / "model_data.parquet"
)

In [4]:
# Preview the first five rows to check the schema and a few example values.
model_data.head(5)

Title,User_id,score,summary,text,review_length,summary_length,sentiment_score,current_date,review_age_days,text_length,description,authors,publisher,publishedDate,categories,avg_score,count
str,str,f64,str,str,u32,u32,f64,date,str,u32,str,str,str,str,str,f64,u32
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,"""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408,2024-11-11,"""790646400000000000""",457,"""Unknown""","""['Julie Strain']""","""Unknown""","""1996""","""['Comics & Graphic Novels']""",4.0,1
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,"""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876,2024-11-11,"""635558400000000000""",1423,"""Philip Nel takes a fascinating…","""['Philip Nel']""","""A&C Black""","""2005-01-01""","""['Biography & Autobiography']""",4.555556,9
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,"""Essential for every personal a…","""If people become the books the…",1752,47,0.9935,2024-11-11,"""652492800000000000""",1752,"""Philip Nel takes a fascinating…","""['Philip Nel']""","""A&C Black""","""2005-01-01""","""['Biography & Autobiography']""",4.555556,9
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,"""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807,2024-11-11,"""640569600000000000""",3662,"""Philip Nel takes a fascinating…","""['Philip Nel']""","""A&C Black""","""2005-01-01""","""['Biography & Autobiography']""",4.555556,9
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,"""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803,2024-11-11,"""623289600000000000""",1542,"""Philip Nel takes a fascinating…","""['Philip Nel']""","""A&C Black""","""2005-01-01""","""['Biography & Autobiography']""",4.555556,9


In [6]:
# Draw a shuffled 10% random subset so that the following cells are cheaper
# to run; the fixed seed keeps the subset identical between runs.
sample_data = model_data.sample(
    fraction=0.1,
    shuffle=True,
    seed=42,
)

In [8]:
# Render every sampled review as one labelled, newline-separated text block
# (book metadata first, then the review summary, text and score).
combined_texts = [
    (
        "Title: {Title}\n"
        "Description: {description}\n"
        "Authors: {authors}\n"
        "Publisher: {publisher}\n"
        "Published Date: {publishedDate}\n"
        "Categories: {categories}\n"
        "Average Score: {avg_score}\n"
        "Review Summary: {summary}\n"
        "Review Text: {text}\n"
        "Review Score: {score}\n"
    ).format_map(record)
    for record in sample_data.to_dicts()
]

In [9]:
# Generic question templates that are asked about every review context.
questions = [
    "What is the general opinion about this book?",
    "What do readers think about the book's narrative and style?",
    "Could you summarize the feedback for this book?",
    "What are the strong and weak points mentioned in the reviews?",
]

# Cross every context with every question: contexts vary in the outer loop,
# so the pairs for one context are adjacent in the resulting list.
training_pairs = [
    {"question": prompt, "context": context}
    for context in combined_texts
    for prompt in questions
]


In [3]:
import sys
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity

# Root directory of the project checkout, registered on sys.path (once) so
# the `src` package below can be imported.
# NOTE(review): this is a machine-specific absolute path.
project_root = Path("/home/igor/github-projects/book-review")
if all(entry != str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

from src.paths import *
from src.config import *
import polars as pl

# Load the pretrained encoder and its matching tokenizer.
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the dataset and keep a shuffled 0.1% subset (fixed seed, so the
# subset is the same on every run).
model_data = pl.read_parquet(
    FEATURE_STORE_DIR / "model_data.parquet"
)
sample_data = model_data.sample(
    fraction=0.001,
    shuffle=True,
    seed=42,
)

# Render every sampled review as one labelled, newline-separated text block
# (book metadata first, then the review summary and text).
combined_texts = [
    (
        "Title: {Title}\n"
        "Description: {description}\n"
        "Authors: {authors}\n"
        "Publisher: {publisher}\n"
        "Published Date: {publishedDate}\n"
        "Categories: {categories}\n"
        "Average Score: {avg_score}\n"
        "Review Summary: {summary}\n"
        "Review Text: {text}\n"
    ).format_map(record)
    for record in sample_data.to_dicts()
]

# Example questions
questions = [
    "What is the general opinion about this book?",
    "What do readers think about the book's narrative and style?",
    "Could you summarize the feedback for this book?",
    "What are the strong and weak points mentioned in the reviews?",
]

# Function that generates embeddings
def generate_embeddings(texts, batch_size=32):
    """Embed texts with the module-level ``tokenizer`` and ``model`` (CLS pooling).

    The texts are tokenized and encoded in chunks of ``batch_size`` instead of
    one single forward pass, so peak memory no longer grows with the total
    number of texts. Each chunk is padded to its own longest sequence and
    truncated to 512 tokens.

    Args:
        texts: A single string or an iterable of strings to embed.
        batch_size: Maximum number of texts sent through the model at once.

    Returns:
        A tensor with one row per input text, holding the last hidden state
        of the first token of each sequence. For empty input the result has
        shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string must be treated as one text; slicing it below would
    # otherwise split it into chunks of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        # Nothing to encode: return an empty matrix that still concatenates
        # cleanly with real embeddings.
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            # CLS pooling: keep only the first token's hidden state.
            pooled_batches.append(model(**inputs).last_hidden_state[:, 0, :])
    return torch.cat(pooled_batches, dim=0)

# Embed the example questions and every combined review text with the same
# encoder so that both live in the same vector space.
question_embeddings = generate_embeddings(
    questions
)
context_embeddings = generate_embeddings(
    combined_texts
)

# Function that answers a question by picking the most similar context
def get_most_relevant_answer(question_embedding, context_embeddings, contexts, metric="cosine"):
    """Return the context whose embedding scores highest against the question.

    Args:
        question_embedding: Embedding of a single question, shaped ``(dim,)``
            or ``(1, dim)``.
        context_embeddings: Tensor with one embedding row per context.
        contexts: Sequence of context texts, aligned row by row with
            ``context_embeddings``.
        metric: ``"cosine"`` (default, the original behaviour) or ``"dot"``
            for the raw dot product.

    Returns:
        A ``(context, score)`` tuple: the best-scoring context and its score
        as a Python float.

    Raises:
        ValueError: If ``contexts`` is empty, if more than one question
            embedding is passed, if the number of embedding rows does not
            match ``len(contexts)``, or if ``metric`` is unknown.
    """
    if len(contexts) == 0:
        raise ValueError("contexts is empty; there is nothing to rank")

    # Accept a flat (dim,) vector as well as a (1, dim) matrix.
    if question_embedding.dim() == 1:
        question_embedding = question_embedding.unsqueeze(0)
    if question_embedding.shape[0] != 1:
        # With several question rows the similarity would be computed
        # pairwise (or fail to broadcast) instead of ranking the contexts.
        raise ValueError(
            f"expected a single question embedding, got {question_embedding.shape[0]}"
        )
    if context_embeddings.shape[0] != len(contexts):
        raise ValueError(
            f"got {context_embeddings.shape[0]} context embeddings "
            f"for {len(contexts)} contexts"
        )

    if metric == "cosine":
        similarities = cosine_similarity(question_embedding, context_embeddings)
    elif metric == "dot":
        similarities = context_embeddings @ question_embedding[0]
    else:
        raise ValueError(f"unknown metric {metric!r}; use 'cosine' or 'dot'")

    max_index = similarities.argmax().item()
    return contexts[max_index], similarities[max_index].item()

# Try the retrieval with one example question.
question_text = "What is the most loved book?"
question_embedding = generate_embeddings([question_text])

# Look up the best-matching context and report it together with its score.
most_relevant_answer, similarity_score = get_most_relevant_answer(
    question_embedding,
    context_embeddings,
    combined_texts,
)
print(
    f"Question: {question_text}",
    f"Most Relevant Answer: {most_relevant_answer}",
    f"Similarity Score: {similarity_score}",
    sep="\n",
)


: 

In [None]:
def ask_model(question, context, qa_model=None, qa_tokenizer=None):
    """Extract the answer span for ``question`` from ``context``.

    The question/context pair is tokenized (truncated to 512 tokens), run
    through an extractive question-answering model, and the tokens between
    the highest-scoring start position and the best end position at or after
    it are converted back into a string.

    Args:
        question: The question text.
        context: The passage that should contain the answer.
        qa_model: Model whose output exposes ``start_logits`` and
            ``end_logits``. Defaults to the module-level ``model``.
        qa_tokenizer: Tokenizer matching ``qa_model``. Defaults to the
            module-level ``tokenizer``.

    Returns:
        The decoded answer span as a string.

    Raises:
        AttributeError: If the model output has no ``start_logits`` /
            ``end_logits``, i.e. the model is not a question-answering model.
    """
    qa_model = model if qa_model is None else qa_model
    qa_tokenizer = tokenizer if qa_tokenizer is None else qa_tokenizer

    inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = qa_model(**inputs)

    start_scores = getattr(outputs, "start_logits", None)
    end_scores = getattr(outputs, "end_logits", None)
    if start_scores is None or end_scores is None:
        # An encoder loaded with AutoModel only returns hidden states; span
        # extraction needs a model with a question-answering head.
        raise AttributeError(
            "model output has no start_logits/end_logits; ask_model needs a "
            "question-answering model (e.g. loaded with "
            "AutoModelForQuestionAnswering), not a plain encoder"
        )

    start_idx = int(torch.argmax(start_scores[0]))
    # Only search for the end at or after the start; picking both positions
    # independently can place the end before the start and yield an empty answer.
    end_idx = start_idx + int(torch.argmax(end_scores[0, start_idx:])) + 1

    answer_ids = inputs["input_ids"][0][start_idx:end_idx].tolist()
    answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(answer_ids))
    return answer

# Smoke-test ask_model: one generic question against the first combined text.
sample_question = "What is the general opinion about this book?"
sample_context = combined_texts[0]
print(
    ask_model(sample_question, sample_context)
)