In [3]:
# This cell makes sure modules are auto-loaded when you change external python files
%load_ext autoreload
%autoreload 2

This notebook is use to retrieve the categories form the tags of the books. The tags are in the form of a list of strings. The categories are the most common words in the tags. The categories are used to classify the books.

In [4]:
import numpy as np
import pandas as pd

In [5]:
df_books = pd.read_csv('./data/books.csv')
df_ratings = pd.read_csv('./data/ratings.csv')
df_book_tags = pd.read_csv('./data/book_tags.csv')
df_tags = pd.read_csv('./data/tags.csv')
df_to_read = pd.read_csv('./data/to_read.csv')

We keep the tags that are used more than 40 times in the dataset.

In [5]:
tag_used = df_book_tags.tag_id.value_counts()   
tag_used = tag_used[tag_used > 40]
print('We have %d tags used more than 50 times' % len(tag_used))
print(tag_used)

We have 2055 tags used more than 50 times
tag_id
30574    9983
11557    9881
22743    9858
5207     9799
8717     9776
         ... 
7884       41
13479      41
902        41
21855      41
7214       41
Name: count, Length: 2055, dtype: int64


## Process the Tags

We will process the tags by lowercasing them, removing the punctuation, and removing the stop words.

We will group the year tags into decades. For example, 1990s, 1991s, and 1992s will be grouped into the 1990s.

We will use lemmatization to improve the retrieval of categories. Lemmatization will convert, for example, "kids" to "kid" and "liked" to "like," etc.

Finally, we will count the frequency of each word in the tags.


In [6]:
df_tags = pd.read_csv('./data/tags.csv')
df_tags = df_tags[df_tags.tag_id.isin(tag_used.index)]
df_tags.head()

Unnamed: 0,tag_id,tag_name
98,98,02-fantasy
134,134,1
199,199,100-books
206,206,100-books-to-read-in-a-lifetime
228,228,1000-books-before-kindergarten


In [8]:
#lowercase all tags
df_tags['tag_name'] = df_tags['tag_name'].str.lower()
#remove all non-alphanumeric characters
df_tags['tag_name'] = df_tags['tag_name'].str.replace('-', ' ')

In [9]:
df_tags.head()

Unnamed: 0,tag_id,tag_name
98,98,02 fantasy
134,134,1
199,199,100 books
206,206,100 books to read in a lifetime
228,228,1000 books before kindergarten


In [148]:
#if 2000 2001 2002 ect replace with 2000s
for ten in range(1900, 2020, 10):
    for year in range(0, 10):
        i = ten + year
        df_tags['tag_name'] = df_tags['tag_name'].str.replace(f'{i}', f"{ten}s")
        df_tags['tag_name'] = df_tags['tag_name'].str.replace(f'{ten}ss', f"{ten}s")
# df_tags['tag_name'] = df_tags['tag_name'].str.replace(f'{2000}', "2000s")

In [177]:
df_tags.head(100)

Unnamed: 0,tag_id,tag_name
98,98,02 fantasy
134,134,1
199,199,100 books
206,206,100 books to read in a lifetime
228,228,1000 books before kindergarten
...,...,...
933,933,20th century literature
941,941,21st century
969,969,2nd grade
1007,1007,3 star


In [158]:
df_tags.to_csv('tag_categories.csv', index=False)

In [176]:
list_words = []
for tag in df_tags['tag_name']:
    tag = tag.split()
    for word in tag:
        if word not in list_words:
            list_words.append(word)
# list_words

In [203]:
#NLP libraries
import spacy
nlp = spacy.load('en_core_web_sm')
from collections import Counter
import re

In [170]:
example = tag[206]
doc = nlp(example)
print(example,'\n')
for token in doc:
    if token.text != token.lemma_:
        print(token.text,'--->',token.lemma_)

100 books to read in a lifetime 

books ---> book


In [172]:
stop_words = [token.text for token in doc if token.is_stop]
print('Stop words:', stop_words)

Stop words: ['to', 'in', 'a']


In [175]:
words = [token.text for token in doc]

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common()

print(common_words)

[('100', 1), ('books', 1), ('to', 1), ('read', 1), ('in', 1), ('a', 1), ('lifetime', 1)]


In [196]:
df_clean_tags = df_tags.copy()

In [197]:
#remove stop words
for tag in df_tags['tag_name']:
    doc = nlp(tag)
    words = [token.text for token in doc if not token.is_stop]
    df_clean_tags.loc[df_clean_tags.tag_name == tag, 'tag_name'] = ' '.join(words)
df_clean_tags.head()

Unnamed: 0,tag_id,tag_name
98,98,02 fantasy
134,134,1
199,199,100 books
206,206,100 books read lifetime
228,228,1000 books kindergarten


In [198]:
print(len(df_clean_tags['tag_name'].unique()))
df_clean_tags.to_csv('tag_categories.csv', index=False)

1811


In [207]:
def remove_non_latin_characters(sentence):
    # Define a regular expression pattern to match non-Latin characters
    pattern = re.compile(r'[^\u0000-\u007F]+')  # This matches non-ASCII characters
    # Remove non-Latin characters using the pattern
    cleaned_sentence = pattern.sub('', sentence)
    return cleaned_sentence

def process_sentence(sentence):
    # Process the sentence with spaCy
    doc = nlp(sentence)
    # Convert the tokens back to a string
    processed_sentence = ' '.join([token.text for token in doc])
    # Remove non-Latin characters
    cleaned_sentence = remove_non_latin_characters(processed_sentence)
    return cleaned_sentence

# Apply the function to the tag_name column
df_clean_tags['tag_name'] = df_clean_tags['tag_name'].apply(process_sentence)


In [None]:
#remove if 1 character or less
df_clean_tags = df_clean_tags[df_clean_tags['tag_name'].apply(lambda x: len(x) > 1)]

In [None]:
df_clean_tags.to_csv('tag_categories.csv', index=False)

In [212]:
#lemmatize the tags
df_clean_tags['tag_name'] = df_clean_tags['tag_name'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
df_clean_tags.head()

Unnamed: 0,tag_id,tag_name
98,98,02 fantasy
199,199,100 book
206,206,100 book read lifetime
228,228,1000 book kindergarten
236,236,1001


In [214]:
#find the most common words
list_words = dict()
for tag in df_clean_tags['tag_name']:
    tag = tag.split()
    for word in tag:
        if word not in list_words:
            list_words[word] = 1
        else:
            list_words[word] += 1

In [219]:
list_words = {k: v for k, v in sorted(list_words.items(), key=lambda item: item[1], reverse=True)}
list_words 

{'read': 192,
 'book': 167,
 'fiction': 85,
 '2010s': 79,
 'fantasy': 53,
 'romance': 36,
 'novel': 35,
 'thriller': 34,
 'manga': 34,
 'mystery': 34,
 'literature': 30,
 'light': 28,
 'child': 26,
 'graphic': 25,
 'series': 24,
 'history': 22,
 'ya': 22,
 'adult': 21,
 'kid': 21,
 'science': 20,
 'favorite': 19,
 'comic_strip': 19,
 'classic': 18,
 'biography': 17,
 'memoir': 17,
 's': 17,
 'challenge': 16,
 'american': 16,
 'library': 16,
 'author': 16,
 'christian': 16,
 '2000s': 15,
 'paranormal': 15,
 'story': 15,
 'young': 15,
 'historical': 15,
 'kindle': 15,
 'reading': 14,
 'school': 14,
 'childhood': 14,
 'children': 14,
 'crime': 14,
 'gilmore': 14,
 'non': 13,
 'club': 13,
 'own': 13,
 'suspense': 13,
 'grade': 12,
 'release': 12,
 'love': 12,
 'sci': 12,
 'fi': 12,
 '1001': 11,
 'world': 11,
 'winner': 11,
 'contemporary': 11,
 'horror': 11,
 'rory': 11,
 '100': 10,
 'star': 10,
 'picture': 10,
 'comic': 10,
 'food': 10,
 'finish': 10,
 'time': 9,
 'king': 9,
 'modern': 9,

In [220]:
categories = pd.DataFrame(list(list_words.items()), columns=['word', 'count'])

In [222]:
categories.to_csv('categories.csv', index=False)

In [227]:
categories = pd.read_csv('categories.csv')
categories = categories[categories['count'] > 3]
categories.to_csv('categories.csv', index=True)

In [48]:
categories = pd.read_csv('categories.csv')
categories.drop(columns='count', inplace=True)
categories.head()
categories.to_csv('categories.csv', index=False)

In [7]:
from preprocess import preprocess_df
categories = pd.read_csv('./data/new/categories.csv')
tags = pd.read_csv('./data/tags.csv')
print(tags.shape)
tags.head()

(34252, 2)


Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [9]:
tags_processed = tags.copy()

In [10]:
tags_processed = preprocess_df(tags_processed, 'tag_name')
tags_processed.head()

34252


100%|██████████| 34252/34252 [05:22<00:00, 106.34it/s]
100%|██████████| 34252/34252 [00:01<00:00, 26441.99it/s]


Unnamed: 0,tag_id,tag_name
0,0,
1,1,1.0
2,2,10.0
3,3,12.0
4,4,122.0


In [49]:
catego = []
for tag in tags_processed['tag_name']:
    tag_cats = []
    for cat in categories['word']:
        if cat in tag:
            tag_cats.append(cat)
    if len(tag_cats) == 0:
        tag_cats.append('other')
    catego.append(tag_cats)
tags_processed['categories'] = catego
tags_processed.head(30)

Unnamed: 0,tag_id,tag_name,categories
0,0,,[other]
1,1,1,[other]
2,2,10,[other]
3,3,12,[other]
4,4,122,[other]
5,5,166,[other]
6,6,17,[other]
7,7,19,[other]
8,8,2,[other]
9,9,258,[other]


In [50]:
tags_processed.to_csv('./data/new/tags_processed.csv', index=False)

In [10]:
import pandas as pd
cat_small = pd.read_csv('./data/new/categories_small.csv')
tags_processed = pd.read_csv('./data/new/tags_processed.csv')

In [11]:
cat_small.head()

Unnamed: 0,index,word
0,0,fiction
1,1,2010s
2,2,fantasy
3,3,romance
4,4,novel


In [24]:
tags_processed['tag_name'] = tags_processed['tag_name'].str.lower()
tags_processed.dropna(inplace=True)

In [25]:
#change type column to string
tags_processed['categories'] = tags_processed['categories'].apply(lambda x: str(x))
cat_small['word'] = cat_small['word'].apply(lambda x: str(x))

In [26]:
catego = []
for tag in tags_processed['tag_name']:
    tag_cats = []
    print(tag)
    for cat in cat_small['word']:
        if cat in tag:
            tag_cats.append(cat)
    if len(tag_cats) == 0:
        tag_cats.append('other')
    catego.append(tag_cats)
tags_processed['categories'] = catego
tags_processed.head(30)

 
   1
   10
   12
   122
   166
   17
   19
   2
   258
   3
   33
   4
   5
   51
   6
   62
   8
   99
   available raspberrys  
  2000s  
  calif  
  d c  
  dean
  england
  fiction
  fictional
  fictitious
  football
  george
  gr
  history
  imaginary
  italy
  la  
  los
  mass  
  murder
  n y  
  non poetry
  performing
  post
  psychology
  read
  single
  specific
  television
0
0 0 0 0cant find
0 0 bingo
0 4 star rating
0 find 2010s summer 00
0 all2
0 best picture younger
0 boxed
0 c
0 eric s books
0 house lbkc
0 kindle
0 love funny
0 cry moved
0 nonfiction
0 owned
0 paperback
0 pgs 200 299
0 physical
0 plays
0 plc
0 tbr axz usa
00 exploring 1
00 graphic novels read
00 class
00 read 00
00 read 02
0000
0000 buy
00000
0000_notverified
000src su 25 11 3 x
000src w 15 01
001 150 pages
001 ebook
002 read cookbook
002 une bonne selection
007
008 strategy
01
01 alphabet authors
01 distopias
01 ecookbooks
01 folklore
01 irl bookshelf comics
01
01 words
016 sam j miller
01_best boo

Unnamed: 0,tag_id,tag_name,categories
0,0,,[other]
1,1,1,[other]
2,2,10,[other]
3,3,12,[other]
4,4,122,[other]
5,5,166,[other]
6,6,17,[other]
7,7,19,[other]
8,8,2,[other]
9,9,258,[other]


In [27]:
tags_processed.to_csv('./data/new/tags_processed_small_cat.csv', index=False)