In [1]:
import pandas as pd
import numpy as np
# Read the books CSV (path is relative to the working directory) into a DataFrame.
books = pd.read_csv(filepath_or_buffer="./books_with_categories.csv")

In [6]:
from transformers import pipeline
# Text-classification pipeline around the emotion model.
# - device="cuda": the model is placed on the GPU, so a CUDA device is required.
# - top_k=None: the sample call in this notebook returns a score for every
#   emotion label rather than only the best one.
sentimental_analysis = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    device="cuda",
)

Device set to use cuda


In [7]:
# Quick sanity check: classify one short phrase and display the per-label scores.
sentimental_analysis("i love you")

[[{'label': 'joy', 'score': 0.9639525413513184},
  {'label': 'sadness', 'score': 0.027516931295394897},
  {'label': 'surprise', 'score': 0.0035664099268615246},
  {'label': 'anger', 'score': 0.0025094274897128344},
  {'label': 'neutral', 'score': 0.0012242378434166312},
  {'label': 'fear', 'score': 0.0008826187695376575},
  {'label': 'disgust', 'score': 0.0003477412974461913}]]

In [12]:
# Emotion labels returned by the classifier; this order is also the order in
# which the per-label score lists are kept.
emotion_label = ['joy', 'sadness', 'surprise', 'anger', 'neutral', 'fear', 'disgust']
isbn = []
emotion_score = {label: [] for label in emotion_label}


def calculate_max_emotion_scores(predictions):
    """Return, for each emotion, the highest score seen across all predictions.

    Parameters
    ----------
    predictions : iterable of list of dict
        One entry per classified text. Each entry is a list of
        ``{"label": str, "score": float}`` dicts that contains every label
        listed in ``emotion_label``.

    Returns
    -------
    dict
        Maps each label in ``emotion_label`` to the maximum score found for
        that label, or to ``nan`` when ``predictions`` is empty.

    Raises
    ------
    KeyError
        If a prediction does not contain one of the labels in ``emotion_label``.
    """
    per_emotion_scores = {label: [] for label in emotion_label}
    for prediction in predictions:
        # Look scores up by label. Pairing by position after an alphabetical
        # sort attaches scores to the wrong emotion, because ``emotion_label``
        # is not in alphabetical order.
        score_by_label = {entry["label"]: entry["score"] for entry in prediction}
        for label in emotion_label:
            per_emotion_scores[label].append(score_by_label[label])
    # np.max raises on an empty sequence, so report "no data" as NaN instead.
    return {
        label: np.max(scores) if scores else np.nan
        for label, scores in per_emotion_scores.items()
    }

In [15]:
from tqdm import tqdm

# Reset the accumulators so that re-running this cell does not append a second
# copy of every book.
emotion_label = ['joy', 'sadness', 'surprise', 'anger', 'neutral', 'fear', 'disgust']
isbn = []
emotion_score = {label: [] for label in emotion_label}

# Walk the two columns in lockstep instead of looking rows up with
# ``books[col][i]``; that lookup is label-based and only works while the index
# is exactly 0..n-1.
book_rows = zip(books["isbn13"], books["description"])
for isbn13, description in tqdm(book_rows, total=len(books)):
    isbn.append(isbn13)
    if not isinstance(description, str):
        # pandas represents a missing description as NaN (a float), which has
        # no ``.split``. Record NaN scores rather than aborting the whole run.
        for label in emotion_label:
            emotion_score[label].append(np.nan)
        continue
    # Classify the description sentence by sentence, then keep the strongest
    # score per emotion for this book.
    sentences = description.split('.')
    predictions = sentimental_analysis(sentences)
    max_score = calculate_max_emotion_scores(predictions)
    for label in emotion_label:
        emotion_score[label].append(max_score[label])


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

00%|██████████████████████████████████████████████████████████████████████████████| 5197/5197 [05:23<00:00, 16.05it/s]

In [16]:
# One row per book: a column per emotion, followed by the book's ISBN.
emotion_df = pd.DataFrame(emotion_score).assign(isbn13=isbn)

In [18]:
# Attach the emotion scores to the book table, matching rows on ISBN.
# This cell overwrites ``books``, so running it a second time would merge the
# scores again and produce suffixed duplicates (``joy_x`` / ``joy_y`` ...).
# Dropping any emotion columns that are already present keeps it re-runnable.
emotion_columns = [column for column in emotion_df.columns if column != "isbn13"]
books = (
    books
    .drop(columns=emotion_columns, errors="ignore")
    .merge(emotion_df, on="isbn13")
)

In [19]:
# Save the book table; index=False keeps the DataFrame index out of the file.
books.to_csv(
    path_or_buf="books_with_emotions.csv",
    index=False,
)

In [22]:
# Preview the thumbnail URL column.
books["thumbnail"]

0       http://books.google.com/books/content?id=KQZCP...
1       http://books.google.com/books/content?id=gA5GP...
2       http://books.google.com/books/content?id=FKo2T...
3       http://books.google.com/books/content?id=XhQ5X...
4       http://books.google.com/books/content?id=Kk-uV...
                              ...                        
5192    http://books.google.com/books/content?id=q-tKP...
5193    http://books.google.com/books/content?id=rq6JP...
5194    http://books.google.com/books/content?id=c_7mf...
5195    http://books.google.com/books/content?id=Fv_JP...
5196    http://books.google.com/books/content?id=Vy7Sk...
Name: thumbnail, Length: 5197, dtype: object

In [7]:
# Keep only the books that have a value in the ``authors`` column.
droped_auther = books.loc[books["authors"].notna()]

In [11]:
# ``index`` must be the boolean False. The string "False" is truthy, so pandas
# would still write the DataFrame index as an extra leading column.
droped_auther.to_csv("books_with_emotions.csv", index=False)