In [2]:
import polars as pl
import sys
from pathlib import Path

# Define o caminho para o diretório raiz do projeto
project_root = Path("/home/igor/github-projects/book-review")

# Adiciona o diretório raiz do projeto ao sys.path
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from datetime import datetime
import logging
from src.sentiment_analysis import analyze_sentiment  # Ensure this function is correctly implemented
from src.paths import FEATURE_STORE_DIR, FILTERED_DATA_DIR

# Logger setup for notebook visualization
logger = logging.getLogger("feature_engineering_test")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load filtered data
try:
    reviews = pl.read_csv(project_root / "data/filtered/rating_reviews_filtered.csv")
    books_info = pl.read_csv(project_root / "data/filtered/books_info_filtered.csv")
    logger.info("Filtered data loaded successfully.")
except Exception as e:
    logger.error(f"Error loading filtered data: {e}")


INFO - Filtered data loaded successfully.


In [14]:
# Review and summary length features
try:
    reviews = reviews.with_columns([
        pl.col("text").str.len_chars().alias("review_length"),
        pl.col("summary").str.len_chars().alias("summary_length")
    ])
    logger.info("Review and summary length features calculated successfully.")
except Exception as e:
    logger.error(f"Error in review length calculation: {e}")


INFO - Review and summary length features calculated successfully.


In [15]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length
str,str,f64,str,str,str,u32,u32
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,"""1999-10-23T00:00:00.000""","""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,"""2004-09-21T00:00:00.000""","""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,"""2004-03-09T00:00:00.000""","""Essential for every personal a…","""If people become the books the…",1752,47
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,"""2004-07-25T00:00:00.000""","""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,"""2005-02-10T00:00:00.000""","""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22
…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,"""2003-07-01T00:00:00.000""","""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,"""2012-07-17T00:00:00.000""","""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,"""2012-03-14T00:00:00.000""","""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,"""2007-05-27T00:00:00.000""","""Easier reads of those not well…","""This is a good book but very e…",161,55


In [16]:
try:
    sentiment_scores = analyze_sentiment(reviews["text"].to_list())
    reviews = reviews.with_columns(
        pl.Series(sentiment_scores).alias("sentiment_score")
    )
    logger.info("Sentiment analysis completed successfully.")
except Exception as e:
    logger.error(f"Error in sentiment analysis: {e}")


INFO - Sentiment analysis completed successfully.


In [17]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length,sentiment_score
str,str,f64,str,str,str,u32,u32,f64
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,"""1999-10-23T00:00:00.000""","""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,"""2004-09-21T00:00:00.000""","""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,"""2004-03-09T00:00:00.000""","""Essential for every personal a…","""If people become the books the…",1752,47,0.9935
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,"""2004-07-25T00:00:00.000""","""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,"""2005-02-10T00:00:00.000""","""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803
…,…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,"""2003-07-01T00:00:00.000""","""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36,0.9595
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,"""2012-07-17T00:00:00.000""","""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28,-0.9643
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,"""2012-03-14T00:00:00.000""","""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45,0.875
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,"""2007-05-27T00:00:00.000""","""Easier reads of those not well…","""This is a good book but very e…",161,55,0.6858


In [18]:
try:
    current_date = datetime.now()
    reviews = reviews.with_columns(
        (pl.lit(current_date) - pl.col("time")).dt.days().alias("review_age_days")
    )
    logger.info("Review age calculated successfully.")
except Exception as e:
    logger.error(f"Error in review age calculation: {e}")


ERROR - Error in review age calculation: 'ExprDateTimeNameSpace' object has no attribute 'days'


In [19]:
try:
    current_year = datetime.now().year
    books_info = books_info.with_columns(
        (current_year - pl.col("publishedDate").dt.year()).alias("book_age")
    )
    logger.info("Book age feature calculated successfully.")
except Exception as e:
    logger.error(f"Error in book age calculation: {e}")


ERROR - Error in book age calculation: `year` operation not supported for dtype `str`


In [20]:
try:
    avg_ratings = reviews.groupby("Title").agg(pl.col("score").mean().alias("avg_score"))
    books_info = books_info.join(avg_ratings, on="Title", how="left")
    logger.info("Average rating per book calculated successfully.")
except Exception as e:
    logger.error(f"Error in average rating calculation: {e}")


ERROR - Error in average rating calculation: 'DataFrame' object has no attribute 'groupby'


In [21]:
try:
    author_review_counts = reviews.groupby("Title").count().alias("review_count")
    books_info = books_info.join(author_review_counts, on="Title", how="left")
    logger.info("Review count per book calculated successfully.")
except Exception as e:
    logger.error(f"Error in review count calculation: {e}")


ERROR - Error in review count calculation: 'DataFrame' object has no attribute 'groupby'


In [None]:
try:
    reviews.write_csv(FEATURE_STORE_DIR / "rating_reviews_features.csv")
    books_info.write_csv(FEATURE_STORE_DIR / "books_info_features.csv")
    logger.info("Feature-engineered data saved to feature_store successfully.")
except Exception as e:
    logger.error(f"Error saving feature-engineered data: {e}")
