In [1]:
# Necessary Libraries for Text Classification
from dotenv import load_dotenv

import numpy as np
import pandas as pd

import torch
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constant Variable
BOOKS_DATASET_PATH = "./dataset/books_with_categories.csv"
EMOTION_LABELS = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]

In [3]:
# Dataset Setup for Data Exploration
books_dataset = pd.read_csv(BOOKS_DATASET_PATH)

In [4]:
# Setup the pipeline of Text Classification
classifier = pipeline("text-classification", model = "j-hartmann/emotion-english-distilroberta-base", top_k = None, device = "mps")
classifier("I love this!")

Device set to use mps


[[{'label': 'surprise', 'score': 0.4869694709777832},
  {'label': 'neutral', 'score': 0.22344161570072174},
  {'label': 'joy', 'score': 0.14913079142570496},
  {'label': 'anger', 'score': 0.07174032181501389},
  {'label': 'sadness', 'score': 0.04664654657244682},
  {'label': 'disgust', 'score': 0.016297748312354088},
  {'label': 'fear', 'score': 0.005773564800620079}]]

In [5]:
# Test the classifier with the first book description from the dataset
sentences = books_dataset["description"][0].split(".")
predictions = classifier(sentences)

sentences[0], predictions[0]

('A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives',
 [{'label': 'surprise', 'score': 0.7296028733253479},
  {'label': 'neutral', 'score': 0.14038552343845367},
  {'label': 'fear', 'score': 0.0681622177362442},
  {'label': 'joy', 'score': 0.04794244095683098},
  {'label': 'anger', 'score': 0.009156350046396255},
  {'label': 'disgust', 'score': 0.002628473099321127},
  {'label': 'sadness', 'score': 0.0021221598144620657}])

In [6]:
# Extract emotions from descriptions of the books, then include the prediction in each respective columns
def calculate_max_emotion_scores(predictions):
    emotion_scores = {label: [] for label in EMOTION_LABELS}
    
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key = lambda x: x["label"])

        for idx, label in enumerate(EMOTION_LABELS):
            emotion_scores[label].append(sorted_predictions[idx]["score"])

    return {label: np.max(scores) for label, scores in emotion_scores.items()}

In [7]:
books_isbn = []
pred_score = {label: [] for label in  EMOTION_LABELS}

for idx in tqdm(range(len(books_dataset))):
    books_isbn.append(books_dataset["isbn13"][idx])
    
    sentences = books_dataset["description"][idx].split(".")
    predictions = classifier(sentences)
    
    max_emotion_scores = calculate_max_emotion_scores(predictions)
    
    for label in EMOTION_LABELS:
        pred_score[label].append(max_emotion_scores[label])

100%|██████████| 5197/5197 [05:44<00:00, 15.08it/s]


In [8]:
# Convert the predicted scores to a DataFrame
pred_score_df = pd.DataFrame(pred_score)
pred_score_df["isbn13"] = books_isbn

pred_score_df.head(10)

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.051973,0.27359,0.928169,0.932798,0.646216,0.967158,0.729603,9780002005883
1,0.612619,0.348286,0.942528,0.704422,0.887939,0.074825,0.252545,9780002261982
2,0.051973,0.157667,0.972321,0.767237,0.608933,0.074825,0.046931,9780006178736
3,0.351483,0.157667,0.360707,0.251881,0.732687,0.074825,0.046931,9780006280897
4,0.081412,0.184495,0.095043,0.035207,0.925904,0.475881,0.046931,9780006280934
5,0.232225,0.727175,0.038787,0.043376,0.621393,0.10239,0.271903,9780006380832
6,0.538185,0.157667,0.747428,0.872565,0.712194,0.407999,0.07205,9780006470229
7,0.051973,0.157667,0.404496,0.020884,0.608933,0.820282,0.234488,9780006472612
8,0.30067,0.279481,0.915524,0.021228,0.84029,0.354459,0.135615,9780006482079
9,0.051973,0.177926,0.049457,0.032198,0.887411,0.074825,0.047142,9780006483014


In [9]:
# Merge the book dataset with the predicted max emotion scores
books_dataset = pd.merge(books_dataset, pred_score_df, on = "isbn13")

In [10]:
# Save the Emotions Dataset
books_dataset.to_csv("/Users/jkhang/Documents/GitHub/Semantic-Book-Recommender/dataset/books_with_emotions.csv", index = False)