In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('punkt')
nltk.download('words')

from tqdm import tqdm

from langdetect import detect

# Roberta Usage
import torch
import scipy
import transformers

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

from transformers import pipeline
from nltk.tokenize import sent_tokenize

In [2]:
# Load the book metadata (used for genre filtering) and the review corpus.
df = pd.read_feather('english-books.feather')  # dataset mapping genres to book_ids
df_reviews = pd.read_csv('df_reviews_large.csv')  # 15 mil dataset containing reviews, book_ids and review_ids

# Distinct values of the 'genres' column.
df_genres = df['genres'].unique()

# Keep the books whose genre string contains 'poetry'.
# na=False treats a missing genre as "no match"; without it str.contains
# yields NaN for those rows and the boolean mask below raises.
filtered_df_poetry = df[df['genres'].str.contains('poetry', na=False)]

# Unique book_ids of the selected books, converted to plain Python ints.
df_poetry_bookids = filtered_df_poetry['book_id'].unique().tolist()
poetry_bookids = [int(book_id) for book_id in df_poetry_bookids]

# Keep only the reviews whose book_id is in that list.
filtered_df_poetry = df_reviews[df_reviews['book_id'].isin(poetry_bookids)]

filtered_df_poetry.to_csv('filtered_df_poetry.csv', header=True, index=False)


Apply English-language filtering to reviews

In [3]:
df = pd.read_csv('filtered_df_poetry.csv')


def detect_language(text):
    """Return the language code reported by `detect` for `text`, or None.

    Any exception raised by `detect` is treated as "language unknown" so a
    single problematic review does not abort the whole pass.
    """
    try:
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running run.
        return None


# Register progress_apply on pandas objects and use it so the (slow)
# per-review detection shows a progress bar.
tqdm.pandas()
df['language'] = df['review_text'].progress_apply(detect_language)

# Filter for English text
df_english = df[df['language'] == 'en']

df_english.to_csv('filtered_df_poetry_ready.csv', header=True, index=False)


Running Sentiment analysis

In [None]:
# Read the language-filtered reviews and replace the index with a fresh
# default one (the previous index is discarded).
df = pd.read_csv("filtered_df_poetry_ready.csv").reset_index(drop=True)

# Sentiment-analysis pipeline backed by the cardiffnlp Twitter RoBERTa checkpoint.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
)

def sentiment_scores_cal(text):
    """Average the classifier `score` over the sentences of one review.

    The text is split with `sent_tokenize`, each sentence is passed to
    `sentiment_classifier`, and the `score` of the first result for every
    sentence is collected. The mean of those scores is returned.

    NOTE(review): only `score` enters the average; the predicted `label` is
    not used. Confirm that averaging scores regardless of label is intended.

    Args:
        text: the review text.

    Returns:
        The mean score as a float, or None when `text` is not a string, is
        blank, produces no sentences, or the classifier raises RuntimeError.
    """
    # Missing reviews arrive as NaN (a float) and blank reviews yield no
    # sentences; both previously escaped the RuntimeError handler
    # (TypeError / ZeroDivisionError) and aborted the whole run.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        sentences = sent_tokenize(text)
        if not sentences:
            return None
        scores = [sentiment_classifier(sentence)[0]['score'] for sentence in sentences]
        return sum(scores) / len(scores)
    except RuntimeError as e:
        # Best effort: report the failure and leave this review unscored.
        print("RuntimeError occurred:", e)
        return None
    

chunk_size = 1000

# Register progress_apply so each chunk shows a labelled progress bar.
tqdm.pandas(desc="Calculating Sentiment Scores")

# Score the reviews chunk by chunk so the CSV is read chunk_size rows at a
# time. The processed chunks are collected in a list and concatenated once,
# instead of re-copying the growing result frame on every iteration.
processed_chunks = [
    chunk.assign(sentiment_score=chunk['review_text'].progress_apply(sentiment_scores_cal))
    for chunk in pd.read_csv("filtered_df_poetry_ready.csv", chunksize=chunk_size)
]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input produced no chunks.
results_df = pd.concat(processed_chunks, ignore_index=True) if processed_chunks else pd.DataFrame()

results_df.to_csv('df_poetryresults_fin.csv', header=True, index=False)

Applying Sentiment weight to Ratings 

In [None]:
df = pd.read_csv('df_poetryresults_fin.csv')

# Number of whitespace-separated words in each review. Missing review text is
# counted as zero words instead of raising on NaN.
df['text_length'] = df['review_text'].fillna('').str.split().str.len()

# Keep only the columns used downstream; .copy() makes the result an
# independent frame so that later column assignments do not trigger
# SettingWithCopyWarning.
columns = ['book_id', 'user_id','review_id', 'rating', 'sentiment_score', 'text_length', 'review_text']
df = df[columns].copy()
df.head()

In [7]:
#Adjust ratings down for differential on rating 5

# (inclusive upper bound on sentiment_score, adjustment per unit of
# weight_sentiment), checked in order. Scores above the last bound use
# _SENTIMENT_TOP_STEP.
_SENTIMENT_STEPS = (
    (0.5, -0.2),
    (0.65, 0.14),
    (0.75, 0.12),
    (0.80, 0.10),
    (0.85, 0.08),
    (0.90, 0.06),
    (0.95, 0.04),
)
_SENTIMENT_TOP_STEP = 0.02


def adjust_rating(rating, sentiment_score, text_length, weight_rating=1.0, weight_sentiment=1.0, weight_length=0.5):
    """Combine a rating with a sentiment score and review length.

    An adjustment is built from two parts and then subtracted from
    `weight_rating * rating`:

    * sentiment: the first band in `_SENTIMENT_STEPS` whose upper bound is
      >= `sentiment_score` contributes `weight_sentiment * step`; scores above
      0.95 contribute `weight_sentiment * 0.02`. A NaN score matches no band
      and contributes nothing.
    * length: reviews with more than 20 words contribute
      `-weight_length * 0.1`.

    NOTE(review): because the adjustment is subtracted, positive steps lower
    the rating and negative steps (score <= 0.5, long reviews) raise it.
    Confirm this sign convention is intended.

    Args:
        rating: the original rating.
        sentiment_score: sentiment score of the review.
        text_length: number of words in the review.
        weight_rating: multiplier applied to `rating`.
        weight_sentiment: multiplier applied to the sentiment step.
        weight_length: multiplier applied to the length step.

    Returns:
        The adjusted rating clamped to [0, 5] and formatted as a string with
        three decimals (e.g. '4.980'). A NaN rating yields 'nan'.
    """
    adjustment = 0.0

    # Adjust for sentiment score ranges based on any rating
    for upper_bound, step in _SENTIMENT_STEPS:
        if sentiment_score <= upper_bound:
            adjustment += weight_sentiment * step
            break
    else:
        # Explicit comparison (not a bare else) so NaN gets no adjustment.
        if sentiment_score > 0.95:
            adjustment += weight_sentiment * _SENTIMENT_TOP_STEP

    # Adjust for text length
    if text_length > 20:
        adjustment -= weight_length * 0.1

    # Apply adjustment to rating with weights
    adjusted_rating = weight_rating * rating - adjustment

    # Clamp to [0, 5] with explicit comparisons: max(0, min(5, x)) would turn
    # a NaN rating into 5, whereas here NaN fails both tests and propagates.
    if adjusted_rating <= 0:
        adjusted_rating = 0.0
    elif adjusted_rating >= 5:
        adjusted_rating = 5.0

    return format(adjusted_rating, '.3f')

weight_rating = 1.0
weight_sentiment = 1.0
weight_length = 0.5

# Compute the adjusted rating for every review. Iterating the three needed
# columns in parallel avoids building a Series per row, which is what
# DataFrame.iterrows does.
adjusted_ratings = [
    adjust_rating(rating, sentiment_score, text_length, weight_rating, weight_sentiment, weight_length)
    for rating, sentiment_score, text_length in zip(df['rating'], df['sentiment_score'], df['text_length'])
]

# Add the adjusted ratings to the DataFrame as a new column
df['adjusted_rating'] = adjusted_ratings

In [None]:
# Persist the per-review adjusted ratings for collaborative filtering.
df.to_csv(
    'adjusted_rating_poetry.csv',
    index=False,
    header=True,
)

In [9]:
# Mean adjusted rating per book_id, rounded to 3 decimals, for content-based
# filtering. adjust_rating returns formatted strings, so cast to float first.
df['adjusted_rating'] = df['adjusted_rating'].astype(float)
average_ratings = (
    df.groupby('book_id')['adjusted_rating']
    .mean()
    .round(3)
)
average_ratings_df = average_ratings.reset_index(name='average_adjusted_rating')

In [None]:
# Write the per-book average adjusted ratings to disk.
average_ratings_df.to_csv(
    'average_adjusted_rating_book_id_poetry.csv',
    index=False,
    header=True,
)