In [None]:
!pip install transformers



In [None]:
## Import necessary libraries
from transformers import pipeline
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
## Load the cleaned dataset

df = pd.read_csv('cleaned_books.csv')
df.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,new_title,new_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."


### Zero-shot classification to get the book categories for books with missing categories

In [None]:
## Mapping  for the book categories - Fiction and Non-Fiction to be used in the model
category_mapping = {'Fiction' : "Fiction",
'Juvenile Fiction': "Fiction",
'Biography & Autobiography': "Non-Fiction",
'History': "Non-Fiction",
'Literary Criticism': "Non-Fiction",
'Philosophy': "Non-Fiction",
'Religion': "Non-Fiction",
'Comics & Graphic Novels': "Fiction",
'Drama': "Fiction",
'Juvenile Nonfiction': "Non-Fiction",
'Science': "Non-Fiction",
'Poetry': "Fiction"}

In [None]:
df['simple_category'] = df['categories'].map(category_mapping) ## Replace the categories with the simplified ones

In [None]:
df[df['simple_category'].isna()]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,new_title,new_description,simple_category
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",
5,9780006380832,0006380832,Empires of the Monsoon,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,Empires of the Monsoon: A History of the India...,9780006380832 Until Vasco da Gama discovered t...,
6,9780006470229,000647022X,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0,The Gap Into Madness: Chaos and Order,9780006470229 A new-cover reissue of the fourt...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5158,9788125026600,8125026606,Nietzsche For Beginners,Marc Sautet,,http://books.google.com/books/content?id=fPIv8...,Not only does Nietzsche for Beginners delve in...,2004.0,3.26,192.0,21.0,Nietzsche For Beginners,9788125026600 Not only does Nietzsche for Begi...,
5159,9788171565641,8171565646,Aspects of the Novel,E. M. Forster,English fiction,http://books.google.com/books/content?id=qWU9P...,"Forster's lively, informed originality and wit...",2004.0,3.83,141.0,10.0,Aspects of the Novel,"9788171565641 Forster's lively, informed origi...",
5160,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...,
5161,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...,


In [None]:
## Load the zero-shot classification model
book_categories = [
    'Fiction', 'Non-Fiction'
]

classifier = pipeline("zero-shot-classification",
                      model = "facebook/bart-large-mnli",
                      device = "cuda")

Device set to use cuda


In [None]:
## Classify the books using the zero-shot classification model

def classify_books(description):
  predictions = classifier(description,book_categories)
  max_index = np.argmax(predictions['scores'])
  max_label = predictions['labels'][max_index]
  return max_label

In [None]:
## Check accuracy of the classification model on 300 fiction and non-fiction books
actual_cats = []
predicted_cats = []
fiction_books = df[df['simple_category'] == 'Fiction'].reset_index(drop = True)
nonfiction_books = df[df['simple_category'] == 'Non-Fiction'].reset_index(drop=True)

for i in tqdm(range(0,300)):
  sequence = fiction_books.loc[i,'description']
  predicted_cats += [classify_books(sequence)]
  actual_cats += ['Fiction']




100%|██████████| 300/300 [00:26<00:00, 11.27it/s]


In [None]:
for i in tqdm(range(0,300)):
  sequence = nonfiction_books.loc[i,'description']
  predicted_cats += [classify_books(sequence)]
  actual_cats += ['Non-Fiction']

100%|██████████| 300/300 [00:30<00:00,  9.96it/s]


In [None]:
predictions_df = pd.DataFrame({"actual_categories":actual_cats,"predicted_categories":predicted_cats})
len(predictions_df)

600

In [None]:
predictions_df.head(10)

Unnamed: 0,actual_categories,predicted_categories
0,Fiction,Fiction
1,Fiction,Fiction
2,Fiction,Non-Fiction
3,Fiction,Non-Fiction
4,Fiction,Non-Fiction
5,Fiction,Non-Fiction
6,Fiction,Fiction
7,Fiction,Non-Fiction
8,Fiction,Non-Fiction
9,Fiction,Fiction


In [None]:
predictions_df["correct_prediction"] = np.where(predictions_df["actual_categories"] == predictions_df["predicted_categories"], 1, 0)
predictions_df["correct_prediction"].sum()/len(predictions_df) ## 63% accuracy

np.float64(0.63)

In [None]:
## Find the categories for the books that don't have a category assigned to them
isbns = []
predicted_cats = []

missing_cats = df[df['simple_category'].isna()][["isbn13","description"]].reset_index(drop=True)

In [None]:
for i in tqdm(range(len(missing_cats))):
  sequence = missing_cats["description"]
  predicted_cats += [classify_books(sequence)]
  isbns += [missing_cats["isbn13"][i]]

100%|██████████| 1445/1445 [03:35<00:00,  6.70it/s]


In [None]:
len(predicted_cats)

1445

In [None]:
missing_predicted_df = pd.DataFrame({"isbn13":isbns,"predicted_category":predicted_cats})

In [None]:
df = pd.merge(df,missing_predicted_df, on = "isbn13", how = "left")
df['simple_category'] = np.where(df['simple_category'].isna(),df['predicted_category'], df['simple_category'])
df.drop(columns = ['predicted_category'], inplace = True)

In [None]:
df['simple_category'].isna().sum()

np.int64(0)

In [None]:
df.to_csv('books_with_categories.csv', index = False)

### Sentiment Analysis to assign various tones to the books

In [3]:
df_with_cat= pd.read_csv('books_with_categories.csv')
df_with_cat.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,new_title,new_description,simple_category
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Fiction
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Fiction


In [None]:
## sentiment Analysis using the DistilRoBERTa model

sentiment_classifier = pipeline("text-classification", model = "j-hartmann/emotion-english-distilroberta-base",
                                device = "cuda",
                                top_k = None)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda


In [None]:
sentiment_classifier("I hate this!") ## Check model's performance on a sample text

[[{'label': 'anger', 'score': 0.6189563274383545},
  {'label': 'disgust', 'score': 0.3279281258583069},
  {'label': 'sadness', 'score': 0.025817126035690308},
  {'label': 'neutral', 'score': 0.01455705426633358},
  {'label': 'surprise', 'score': 0.005831682123243809},
  {'label': 'fear', 'score': 0.00426805205643177},
  {'label': 'joy', 'score': 0.0026416885666549206}]]

In [None]:
## Define emotion labels 

emotion_labels = ["joy","anger","disgust","fear", "sadness","surprise","neutral"]

In [None]:
## Define function to calculate the emotion scores for each book description

def calculate_max_emotion_scores(predictions):
  per_emotion_scores = {label: [] for label in emotion_labels}
  for prediction in predictions:
    sorted_predictions = sorted(prediction, key = lambda x: x["label"])
    for index, label in enumerate(emotion_labels):
      per_emotion_scores[label].append(sorted_predictions[index]["score"])
  return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [None]:
## Calculate the emotion scores for each book description line by line
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(df_with_cat))):
  isbn.append(df_with_cat["isbn13"][i])
  sentences = df_with_cat['description'][i].split(".")
  predictions = sentiment_classifier(sentences)
  max_scores = calculate_max_emotion_scores(predictions)
  for label in emotion_labels:
    emotion_scores[label].append(max_scores[label])

  0%|          | 7/5165 [00:00<04:54, 17.54it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 5165/5165 [02:17<00:00, 37.66it/s]


In [9]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df['isbn13'] = isbn

In [10]:
emotions_df.head()

Unnamed: 0,joy,anger,disgust,fear,sadness,surprise,neutral,isbn13
0,0.064134,0.273591,0.928168,0.932797,0.646216,0.967158,0.729603,9780002005883
1,0.612619,0.348284,0.942528,0.704422,0.887939,0.11169,0.252545,9780002261982
2,0.064134,0.104007,0.972321,0.767237,0.549477,0.11169,0.078766,9780006178736
3,0.351484,0.150723,0.360706,0.251881,0.732685,0.11169,0.078766,9780006280897
4,0.081412,0.184495,0.095043,0.040564,0.88439,0.475881,0.078766,9780006280934


In [11]:
final_df = pd.merge(df_with_cat, emotions_df, on = "isbn13")

In [12]:
final_df.to_csv("books_with_emotions.csv", index = False)