In [2]:
import polars as pl
import sys
from pathlib import Path

# Define o caminho para o diretório raiz do projeto
project_root = Path("/home/igor/github-projects/book-review")

# Adiciona o diretório raiz do projeto ao sys.path
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from datetime import datetime
import logging
from src.sentiment_analysis import analyze_sentiment  # Ensure this function is correctly implemented
from src.paths import FEATURE_STORE_DIR, FILTERED_DATA_DIR

# Logger setup for notebook visualization
logger = logging.getLogger("feature_engineering_test")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load filtered data
try:
    reviews = pl.read_csv(project_root / "data/filtered/rating_reviews_filtered.csv")
    books_info = pl.read_csv(project_root / "data/filtered/books_info_filtered.csv")
    logger.info("Filtered data loaded successfully.")
except Exception as e:
    logger.error(f"Error loading filtered data: {e}")


INFO - Filtered data loaded successfully.


In [14]:
# Review and summary length features
try:
    reviews = reviews.with_columns([
        pl.col("text").str.len_chars().alias("review_length"),
        pl.col("summary").str.len_chars().alias("summary_length")
    ])
    logger.info("Review and summary length features calculated successfully.")
except Exception as e:
    logger.error(f"Error in review length calculation: {e}")


INFO - Review and summary length features calculated successfully.


In [15]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length
str,str,f64,str,str,str,u32,u32
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,"""1999-10-23T00:00:00.000""","""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,"""2004-09-21T00:00:00.000""","""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,"""2004-03-09T00:00:00.000""","""Essential for every personal a…","""If people become the books the…",1752,47
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,"""2004-07-25T00:00:00.000""","""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,"""2005-02-10T00:00:00.000""","""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22
…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,"""2003-07-01T00:00:00.000""","""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,"""2012-07-17T00:00:00.000""","""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,"""2012-03-14T00:00:00.000""","""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,"""2007-05-27T00:00:00.000""","""Easier reads of those not well…","""This is a good book but very e…",161,55


In [16]:
try:
    sentiment_scores = analyze_sentiment(reviews["text"].to_list())
    reviews = reviews.with_columns(
        pl.Series(sentiment_scores).alias("sentiment_score")
    )
    logger.info("Sentiment analysis completed successfully.")
except Exception as e:
    logger.error(f"Error in sentiment analysis: {e}")


INFO - Sentiment analysis completed successfully.


In [17]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length,sentiment_score
str,str,f64,str,str,str,u32,u32,f64
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,"""1999-10-23T00:00:00.000""","""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,"""2004-09-21T00:00:00.000""","""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,"""2004-03-09T00:00:00.000""","""Essential for every personal a…","""If people become the books the…",1752,47,0.9935
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,"""2004-07-25T00:00:00.000""","""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,"""2005-02-10T00:00:00.000""","""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803
…,…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,"""2003-07-01T00:00:00.000""","""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36,0.9595
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,"""2012-07-17T00:00:00.000""","""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28,-0.9643
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,"""2012-03-14T00:00:00.000""","""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45,0.875
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,"""2007-05-27T00:00:00.000""","""Easier reads of those not well…","""This is a good book but very e…",161,55,0.6858


## date

In [64]:
current_date = datetime.now().strftime("%Y-%m-%d")

In [68]:
# Convert the string to datetime and calculate review_age_days
reviews = reviews.with_columns([
    # Convert the current_date string to datetime type
    pl.lit(current_date).str.to_datetime().cast(pl.Date).alias("current_date")
])
# Convert the string to datetime and calculate review_age_days
#reviews = reviews.with_columns([
#    # Convert the current_date string to datetime type
#    pl.col("time").str.to_datetime().cast(pl.Date).alias("time")
#])

# Calculate the review age in days by subtracting 'time' from 'current_date'
reviews = reviews.with_columns( review_age_days =
    pl.col("current_date") - pl.col("time")
)


In [111]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length,sentiment_score,current_date,review_age_days
str,str,f64,date,str,str,u32,u32,f64,date,str
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,1999-10-23,"""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408,2024-11-11,"""790646400000000000"""
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,2004-09-21,"""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876,2024-11-11,"""635558400000000000"""
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,2004-03-09,"""Essential for every personal a…","""If people become the books the…",1752,47,0.9935,2024-11-11,"""652492800000000000"""
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,2004-07-25,"""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807,2024-11-11,"""640569600000000000"""
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,2005-02-10,"""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803,2024-11-11,"""623289600000000000"""
…,…,…,…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,2003-07-01,"""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36,0.9595,2024-11-11,"""674265600000000000"""
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,2012-07-17,"""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28,-0.9643,2024-11-11,"""388800000000000000"""
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,2012-03-14,"""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45,0.875,2024-11-11,"""399600000000000000"""
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,2007-05-27,"""Easier reads of those not well…","""This is a good book but very e…",161,55,0.6858,2024-11-11,"""551059200000000000"""


In [62]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length,sentiment_score,current_date,review_age_days
str,str,f64,date,str,str,u32,u32,f64,date,duration[ms]
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,1999-10-23,"""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408,2024-11-11,9151d
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,2004-09-21,"""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876,2024-11-11,7356d
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,2004-03-09,"""Essential for every personal a…","""If people become the books the…",1752,47,0.9935,2024-11-11,7552d
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,2004-07-25,"""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807,2024-11-11,7414d
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,2005-02-10,"""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803,2024-11-11,7214d
…,…,…,…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,2003-07-01,"""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36,0.9595,2024-11-11,7804d
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,2012-07-17,"""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28,-0.9643,2024-11-11,4500d
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,2012-03-14,"""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45,0.875,2024-11-11,4625d
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,2007-05-27,"""Easier reads of those not well…","""This is a good book but very e…",161,55,0.6858,2024-11-11,6378d


In [74]:
try:
    avg_ratings = reviews.group_by("Title").agg(pl.col("score").mean().alias("avg_score"))
    books_info = books_info.join(avg_ratings, on="Title", how="left")
    logger.info("Average rating per book calculated successfully.")
except Exception as e:
    logger.error(f"Error in average rating calculation: {e}")


INFO - Average rating per book calculated successfully.


In [76]:
books_info

Title,description,authors,publisher,publishedDate,categories,avg_score
str,str,str,str,str,str,f64
"""Its Only Art If Its Well Hung!""","""Unknown""","""['Julie Strain']""","""Unknown""","""1996""","""['Comics & Graphic Novels']""",4.0
"""Dr. Seuss: American Icon""","""Philip Nel takes a fascinating…","""['Philip Nel']""","""A&C Black""","""2005-01-01""","""['Biography & Autobiography']""",4.555556
"""Wonderful Worship in Smaller C…","""This resource includes twelve …","""['David R. Ray']""","""Unknown""","""2000""","""['Religion']""",5.0
"""Whispers of the Wicked Saints""","""Julia Thomas finds her life sp…","""['Veronica Haddon']""","""iUniverse""","""2005-02""","""['Fiction']""",3.71875
"""Nation Dance: Religion, Identi…","""Unknown""","""['Edward Long']""","""Unknown""","""2003-03-01""","""Uncategorized""",5.0
…,…,…,…,…,…,…
"""The Orphan Of Ellis Island (Ti…","""During a school trip to Ellis …","""['Elvira Woodruff']""","""Scholastic Paperbacks""","""2000-06-01""","""['Juvenile Fiction']""",4.5
"""Red Boots for Christmas""","""Everyone in the village of Fri…","""Unknown""","""Unknown""","""1995""","""['Juvenile Fiction']""",5.0
"""Mamaw""","""Give your Mamaw a useful, beau…","""['Wild Wild Cabbage']""","""Unknown""","""2018-01-17""","""Uncategorized""",5.0
"""The Autograph Man""","""Alex-Li Tandem sells autograph…","""['Zadie Smith']""","""Vintage""","""2003-08-12""","""['Fiction']""",2.5


In [110]:
try:
    author_review_counts = reviews.group_by("Title").len()
    books_info = books_info.join(author_review_counts, on="Title", how="left")
    logger.info("Review count per book calculated successfully.")
except Exception as e:
    logger.error(f"Error in review count calculation: {e}")


INFO - Review count per book calculated successfully.


In [79]:
author_review_counts

Title,count
str,u32
"""Fast Food for the Soul:""",9
"""Teaching Developmental Reading…",1
"""Who's Grace? (John Smyth Myste…",3
"""The adventures of Jerry Muskra…",8
"""The Ice Mask (Severn House Lar…",1
…,…
"""Jackie: A Life in Pictures""",7
"""Flik's Perfect Gift (Disney's …",1
"""Minute for Murder""",1
"""How to Repair and Maintain Ame…",5


In [80]:
books_info

Title,description,authors,publisher,publishedDate,categories,avg_score,count
str,str,str,str,str,str,f64,u32
"""Its Only Art If Its Well Hung!""","""Unknown""","""['Julie Strain']""","""Unknown""","""1996""","""['Comics & Graphic Novels']""",4.0,1
"""Dr. Seuss: American Icon""","""Philip Nel takes a fascinating…","""['Philip Nel']""","""A&C Black""","""2005-01-01""","""['Biography & Autobiography']""",4.555556,9
"""Wonderful Worship in Smaller C…","""This resource includes twelve …","""['David R. Ray']""","""Unknown""","""2000""","""['Religion']""",5.0,4
"""Whispers of the Wicked Saints""","""Julia Thomas finds her life sp…","""['Veronica Haddon']""","""iUniverse""","""2005-02""","""['Fiction']""",3.71875,32
"""Nation Dance: Religion, Identi…","""Unknown""","""['Edward Long']""","""Unknown""","""2003-03-01""","""Uncategorized""",5.0,1
…,…,…,…,…,…,…,…
"""The Orphan Of Ellis Island (Ti…","""During a school trip to Ellis …","""['Elvira Woodruff']""","""Scholastic Paperbacks""","""2000-06-01""","""['Juvenile Fiction']""",4.5,8
"""Red Boots for Christmas""","""Everyone in the village of Fri…","""Unknown""","""Unknown""","""1995""","""['Juvenile Fiction']""",5.0,1
"""Mamaw""","""Give your Mamaw a useful, beau…","""['Wild Wild Cabbage']""","""Unknown""","""2018-01-17""","""Uncategorized""",5.0,1
"""The Autograph Man""","""Alex-Li Tandem sells autograph…","""['Zadie Smith']""","""Vintage""","""2003-08-12""","""['Fiction']""",2.5,4


In [82]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length,sentiment_score,current_date,review_age_days
str,str,f64,date,str,str,u32,u32,f64,date,duration[ms]
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,1999-10-23,"""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408,2024-11-11,9151d
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,2004-09-21,"""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876,2024-11-11,7356d
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,2004-03-09,"""Essential for every personal a…","""If people become the books the…",1752,47,0.9935,2024-11-11,7552d
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,2004-07-25,"""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807,2024-11-11,7414d
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,2005-02-10,"""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803,2024-11-11,7214d
…,…,…,…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,2003-07-01,"""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36,0.9595,2024-11-11,7804d
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,2012-07-17,"""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28,-0.9643,2024-11-11,4500d
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,2012-03-14,"""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45,0.875,2024-11-11,4625d
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,2007-05-27,"""Easier reads of those not well…","""This is a good book but very e…",161,55,0.6858,2024-11-11,6378d


In [104]:
# Convert 'review_age_days' from duration to string for CSV compatibility
reviews = reviews.with_columns(
        pl.col("review_age_days").cast(pl.Int64).cast(pl.Utf8)  # Convert duration to string
    )

In [105]:
reviews

Title,User_id,score,time,summary,text,review_length,summary_length,sentiment_score,current_date,review_age_days
str,str,f64,date,str,str,u32,u32,f64,date,str
"""Its Only Art If Its Well Hung!""","""AVCGYZL8FQQTD""",4.0,1999-10-23,"""Nice collection of Julie Strai…","""This is only for Julie Strain …",457,38,0.9408,2024-11-11,"""790646400000000000"""
"""Dr. Seuss: American Icon""","""A30TK6U7DNS82R""",5.0,2004-09-21,"""Really Enjoyed It""","""I don't care much for Dr. Seus…",1423,17,0.9876,2024-11-11,"""635558400000000000"""
"""Dr. Seuss: American Icon""","""A3UH4UZ4RSVO82""",5.0,2004-03-09,"""Essential for every personal a…","""If people become the books the…",1752,47,0.9935,2024-11-11,"""652492800000000000"""
"""Dr. Seuss: American Icon""","""A2MVUWT453QH61""",4.0,2004-07-25,"""Phlip Nel gives silly Seuss a …","""Theodore Seuss Geisel (1904-19…",3662,47,0.9807,2024-11-11,"""640569600000000000"""
"""Dr. Seuss: American Icon""","""A22X4XUPKF66MR""",4.0,2005-02-10,"""Good academic overview""","""Philip Nel - Dr. Seuss: Americ…",1542,22,0.9803,2024-11-11,"""623289600000000000"""
…,…,…,…,…,…,…,…,…,…,…
"""The Idea of History""","""AI1QNMVF2E3TN""",5.0,2003-07-01,"""R. G. Collingwood's Most Famou…","""Highly Recommended.This book i…",1632,36,0.9595,2024-11-11,"""674265600000000000"""
"""The Idea of History""","""AOFGOUMXLMVZS""",4.0,2012-07-17,"""Thoughtful Critic of History""","""History is not a scientific pr…",12422,28,-0.9643,2024-11-11,"""388800000000000000"""
"""The Idea of History""","""A1SMUB9ASL5L9Y""",4.0,2012-03-14,"""Quite good and ahead of its ti…","""This is pretty interesting. Co…",597,45,0.875,2024-11-11,"""399600000000000000"""
"""The Idea of History""","""A2AQMEKZKK5EE4""",4.0,2007-05-27,"""Easier reads of those not well…","""This is a good book but very e…",161,55,0.6858,2024-11-11,"""551059200000000000"""


In [108]:
from src.utils import save_to_parquet_if_not_exists

# Usage in feature engineering step
try:
    # Paths to save Parquet files
    reviews_path = FEATURE_STORE_DIR / "rating_reviews_features.parquet"
    books_info_path = FEATURE_STORE_DIR / "books_info_features.parquet"
    
    # Save DataFrames if files do not already exist
    save_to_parquet_if_not_exists(reviews, reviews_path, logger)
    save_to_parquet_if_not_exists(books_info, books_info_path, logger)
    
except Exception as e:
    logger.error(f"Error during Parquet file saving: {e}")


INFO - File /home/igor/github-projects/book-review/data/feature_store/rating_reviews_features.parquet already exists, skipping save.
INFO - File /home/igor/github-projects/book-review/data/feature_store/books_info_features.parquet already exists, skipping save.
