In [None]:
import os

import pandas as pd
import requests
from langdetect import DetectorFactory
from langdetect import detect
from nltk import word_tokenize
from nltk.corpus import stopwords
from PIL import Image

In [None]:
# Read the Goodreads export from disk into the working data frame
data_path = 'goodreads_100k_books.csv'
df = pd.read_csv(data_path)

In [None]:
# Lift the column display limit so that wide data frames are shown in full
pd.options.display.max_columns = None

In [None]:
# Preview the first record to see which columns were loaded
df.head(n = 1)

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33


In [None]:
# Report how many records were loaded
print('The number of records is: ', len(df.index))

The number of records is:  100000


In [None]:
# Keep only the records that have a cover image URL
df = df[df['img'].notna()]

In [None]:
# Report how many records still have a cover image
print('The number of records is: ', len(df.index))

The number of records is:  96955


In [None]:
# Change the text data to lower case (except the img column, whose URLs must stay intact)
exclude_column = 'img'


def lowercase_text_column(column):
    """Return column with every string value lower-cased.

    The column named by exclude_column and numeric columns are returned unchanged.
    Non-string values (e.g. NaN) inside a text column are left as they are.
    """
    if column.name == exclude_column or pd.api.types.is_numeric_dtype(column):
        return column
    return column.map(lambda value: value.lower() if isinstance(value, str) else value)


# DataFrame.apply hands over one whole column (a Series) at a time, never a single string,
# so the lower-casing has to be done element by element inside each column
# NOTE(review): the link column also holds URLs and is lower-cased here - confirm that is acceptable
df = df.apply(lowercase_text_column)

In [None]:
# Keep only the records whose genre column mentions 'fantasy' (case-insensitive; missing genres are dropped)
limited_genre = 'fantasy'
is_fantasy = df['genre'].str.contains(limited_genre, case = False, na = False, regex = True)
df = df.loc[is_fantasy]

In [None]:
# Report how many fantasy records remain
print('The number of records is: ', len(df.index))

The number of records is:  15704


In [None]:
# Preview the first record left after the fantasy filter
df.head(n = 1)

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
35,Marion Weinstein,Paperback,Marion Weinstein was one of the first witches ...,"Religion,Wicca,Witchcraft,Spirituality,Nonfict...",https://i.gr-assets.com/images/S/compressed.ph...,1564146383,9780000000000.0,https://goodreads.com/book/show/1001409.Earth_...,224,4.12,14,Earth Magic: A Book of Shadows for Positive Wi...,337


In [None]:
# Report how many columns the data frame has before the genre column is split
print('The number of columns is: ', df.shape[1])

The number of columns is:  13


In [None]:
# Give every genre listed in the comma-separated genre column a column of its own

# Upper bound on the number of splits: the largest comma count found in any row, plus 1
max_cols = df['genre'].str.count(',').max() + 1
# Break the genre strings apart at each comma, one piece per resulting column
split_cols = df['genre'].str.split(',', n = max_cols, expand = True)

# Name the new columns 'genre_' followed by the position number of the column
new_cols = ['genre_' + str(col) for col in split_cols.columns]
split_cols = split_cols.set_axis(new_cols, axis = 1)

# Append the new columns to the data frame and discard the original genre column
df = pd.concat([df, split_cols], axis = 1).drop(columns = ['genre'])

In [None]:
# Report how many columns the data frame has now that the genres are split out
print('The number of columns is: ', df.shape[1])

The number of columns is:  32


In [None]:
# Preview the first record to confirm the new genre_N columns are in place
df.head(n = 1)

Unnamed: 0,author,bookformat,desc,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
35,Marion Weinstein,Paperback,Marion Weinstein was one of the first witches ...,https://i.gr-assets.com/images/S/compressed.ph...,1564146383,9780000000000.0,https://goodreads.com/book/show/1001409.Earth_...,224,4.12,14,Earth Magic: A Book of Shadows for Positive Wi...,337,Religion,Wicca,Witchcraft,Spirituality,Nonfiction,Religion,Paganism,Religion,Occult,Fantasy,Magic,Religion,Goddess,Spirituality,New Age,,,,,


In [None]:
# List the distinct values that occur in the genre_1 column
df['genre_1'].unique()

array(['Wicca', 'Fiction', 'Comics', 'Romance', 'Short Stories',
       'Fantasy', 'Poetry', 'Science Fiction', 'Supernatural',
       'Historical', 'Occult', 'Vampires', 'Young Adult', 'Religion',
       'Classics', 'Mythology', 'Manga', 'Picture Books',
       'Historical Fiction', 'Childrens', 'Arthurian', 'Humor', 'Games',
       'Forgotten Realms', 'World Of Warcraft', 'Art', 'Horror',
       'Cryptozoology', 'Adult Fiction', 'Bande Dessinée',
       'Alternate History', 'European Literature', 'Nonfiction',
       'Heroic Fantasy', 'Space', 'M M Romance', 'Star Wars',
       'Media Tie In', 'Christian', 'Steampunk', 'Role Playing Games',
       'Paranormal', 'Star Trek', 'Graphic Novels', 'Apocalyptic',
       'Sports', 'Angels', 'Sequential Art', 'Gaming', 'Christmas',
       'Novels', 'Mystery', 'Philosophy', '40k', 'Urban Fantasy',
       'Paranormal Romance', 'Crime', 'Audiobook', 'Dragonlance',
       'Romanian Literature', 'Cultural', 'Magical Realism', 'Reference',
       '

In [None]:
# Report the record count before unwanted genres are removed
print('The number of records is: ', len(df.index))

The number of records is:  15704


In [None]:
# Remove fan fiction, graphic mediums and the other unwanted genres from the records

# Genres to exclude, written once each in lower case (the comparison below ignores case)
exclude_genres = ['fan fiction', 'comics', 'sequential art', 'anthologies', 'role playing games', 'manga',
                  'graphic novel', 'graphic novels', 'comic', 'dungeons and dragons', 'webcomic',
                  'children', 'childrens', 'nonfiction', 'video games', 'christmas', 'tv',
                  'historical fiction', 'historical']

# A set gives fast membership tests; lower-casing here keeps the match case-insensitive
# even if someone later adds an entry with capital letters
excluded = {genre.lower() for genre in exclude_genres}

# Flag every row in which any cell, taken as a whole value, equals an excluded genre.
# Values are compared in lower case because the genre names in the data are capitalised
# ('Comics', 'Fan Fiction', ...); non-string cells such as NaN never match.
has_excluded_genre = df.apply(
    lambda column: column.map(lambda value: isinstance(value, str) and value.lower() in excluded)
).any(axis = 1)

# Keep only the rows that were not flagged
df = df[~has_excluded_genre]

In [None]:
# Report how many records survive the genre exclusion
print('The number of records is: ', len(df.index))

The number of records is:  10586


In [None]:
# Preview the first three records left after the genre exclusion
df.head(n = 3)

Unnamed: 0,author,bookformat,desc,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
58,"Chuck Dixon,Scott McDaniel,Karl Story",Paperback,"Growing up as Robin the Boy Wonder, Dick Grays...",https://i.gr-assets.com/images/S/compressed.ph...,1563896133.0,9780000000000.0,https://goodreads.com/book/show/1001530.Nightwing,208,3.95,17,Nightwing: Love and Bullets,357,Sequential Art,Comics,Sequential Art,Graphic Novels,Superheroes,Dc Comics,Dc Comics,Batman,Comics,Comic Book,Graphic Novels Comics,Comics,Superheroes,Fiction,Fantasy,,,,,
67,Eliezer Yudkowsky,ebook,Harry Potter and the Methods of Rationality is...,https://i.gr-assets.com/images/S/compressed.ph...,,,https://goodreads.com/book/show/10016013-harry...,2184,4.4,1414,Harry Potter and the Methods of Rationality,14293,Fantasy,Fiction,Fan Fiction,Philosophy,Science Fiction,Young Adult,Humor,Fantasy,Magic,Unfinished,Adventure,,,,,,,,,
111,Robert Reed,Paperback,The Ship has traveled the universe for longer ...,https://i.gr-assets.com/images/S/compressed.ph...,812566572.0,9780000000000.0,https://goodreads.com/book/show/100208.Marrow,512,3.82,110,Marrow,2051,Science Fiction,Fiction,Space,Space Opera,Mystery,Science Fiction Fantasy,Space,Science Fiction,Dystopia,Science Fiction,Hard Science Fiction,Science Fiction,Aliens,Speculative Fiction,,,,,,


In [None]:
# I want to remove any books that have a language other than English in the description (desc) column

# Define a function to detect the language in the desc column
def detect_language(desc):
    """Return the language code that langdetect reports for desc, or 'unknown'.

    'unknown' is returned when desc is not a string (e.g. NaN for a missing
    description), when it is blank, or when detection raises an error.
    """
    # Nothing to detect for non-string or blank input, so skip the detector entirely
    if not isinstance(desc, str) or not desc.strip():
        return 'unknown'

    try:
        return detect(desc)
    # Catch Exception rather than using a bare except, so that interrupting the kernel
    # (KeyboardInterrupt) during a long apply is not swallowed and reported as 'unknown'
    except Exception:
        return 'unknown'

# Seed langdetect before the first detection so that repeated runs classify the
# same descriptions the same way (its results can otherwise vary between runs)
DetectorFactory.seed = 0

# Detect the language of every description; the result is kept in a standalone Series,
# so no temporary 'language' column has to be added to the data frame and dropped again
language = df.desc.apply(detect_language)

# Select only the rows whose description was detected as 'en' (English)
df = df.loc[language == 'en'].copy()

In [None]:
# Report how many English-language records remain
print('The number of records is: ', len(df.index))

The number of records is:  9676


In [None]:
# Preview the first three records left after the language filter
df.head(n = 3)

Unnamed: 0,author,bookformat,desc,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
58,"Chuck Dixon,Scott McDaniel,Karl Story",Paperback,"Growing up as Robin the Boy Wonder, Dick Grays...",https://i.gr-assets.com/images/S/compressed.ph...,1563896133.0,9780000000000.0,https://goodreads.com/book/show/1001530.Nightwing,208,3.95,17,Nightwing: Love and Bullets,357,Sequential Art,Comics,Sequential Art,Graphic Novels,Superheroes,Dc Comics,Dc Comics,Batman,Comics,Comic Book,Graphic Novels Comics,Comics,Superheroes,Fiction,Fantasy,,,,,
67,Eliezer Yudkowsky,ebook,Harry Potter and the Methods of Rationality is...,https://i.gr-assets.com/images/S/compressed.ph...,,,https://goodreads.com/book/show/10016013-harry...,2184,4.4,1414,Harry Potter and the Methods of Rationality,14293,Fantasy,Fiction,Fan Fiction,Philosophy,Science Fiction,Young Adult,Humor,Fantasy,Magic,Unfinished,Adventure,,,,,,,,,
111,Robert Reed,Paperback,The Ship has traveled the universe for longer ...,https://i.gr-assets.com/images/S/compressed.ph...,812566572.0,9780000000000.0,https://goodreads.com/book/show/100208.Marrow,512,3.82,110,Marrow,2051,Science Fiction,Fiction,Space,Space Opera,Mystery,Science Fiction Fantasy,Space,Science Fiction,Dystopia,Science Fiction,Hard Science Fiction,Science Fiction,Aliens,Speculative Fiction,,,,,,


In [None]:
# Remove all the words from the desc column that are stop words ('the', 'and', 'then', etc.) This makes for easier analysis later

# Collect the NLTK English stop words in a set
stop_words = {word for word in stopwords.words('english')}

# Define a function to select the keywords from each record desc
def extract_keywords(text):
    """Return the keywords of text as one lower-case, space-separated string.

    The text is tokenized with word_tokenize; a token is kept only when it is
    purely alphabetic and its lower-case form is not in stop_words.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)

    return ' '.join(kept)


# Replace every description with its stop-word-free keyword string
df['desc'] = df['desc'].map(extract_keywords)

In [None]:
# Preview the first three records to confirm the descriptions now hold keywords only
df.head(n = 3)

Unnamed: 0,author,bookformat,desc,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
58,"Chuck Dixon,Scott McDaniel,Karl Story",Paperback,growing robin boy wonder dick grayson lived li...,https://i.gr-assets.com/images/S/compressed.ph...,1563896133.0,9780000000000.0,https://goodreads.com/book/show/1001530.Nightwing,208,3.95,17,Nightwing: Love and Bullets,357,Sequential Art,Comics,Sequential Art,Graphic Novels,Superheroes,Dc Comics,Dc Comics,Batman,Comics,Comic Book,Graphic Novels Comics,Comics,Superheroes,Fiction,Fantasy,,,,,
67,Eliezer Yudkowsky,ebook,harry potter methods rationality work harry po...,https://i.gr-assets.com/images/S/compressed.ph...,,,https://goodreads.com/book/show/10016013-harry...,2184,4.4,1414,Harry Potter and the Methods of Rationality,14293,Fantasy,Fiction,Fan Fiction,Philosophy,Science Fiction,Young Adult,Humor,Fantasy,Magic,Unfinished,Adventure,,,,,,,,,
111,Robert Reed,Paperback,ship traveled universe longer crew recall true...,https://i.gr-assets.com/images/S/compressed.ph...,812566572.0,9780000000000.0,https://goodreads.com/book/show/100208.Marrow,512,3.82,110,Marrow,2051,Science Fiction,Fiction,Space,Space Opera,Mystery,Science Fiction Fantasy,Space,Science Fiction,Dystopia,Science Fiction,Hard Science Fiction,Science Fiction,Aliens,Speculative Fiction,,,,,,


In [None]:
# Report the record count after the keyword extraction
print('The number of records is: ', len(df.index))

The number of records is:  9676


In [None]:
# Summarise the 'pages' column (minimum, maximum, quartiles and other descriptors)
df['pages'].describe()

count    9676.000000
mean      259.451840
std       184.749676
min         0.000000
25%       144.000000
50%       255.000000
75%       350.000000
max      5375.000000
Name: pages, dtype: float64

In [None]:
# Report the record count before filtering on page count
print('The number of records is: ', len(df.index))

The number of records is:  9676


In [None]:
# Summarise the 'pages' column again
df['pages'].describe()

count    9676.000000
mean      259.451840
std       184.749676
min         0.000000
25%       144.000000
50%       255.000000
75%       350.000000
max      5375.000000
Name: pages, dtype: float64

In [None]:
# Keep only the records with at most 1000 pages
is_within_page_limit = df['pages'].le(1000)
df = df.loc[is_within_page_limit]

In [None]:
# Report how many records have at most 1000 pages
print('The number of records is: ', len(df.index))

The number of records is:  9636


In [None]:
# Summarise the 'pages' column now that the page-count filter is applied
df['pages'].describe()

count    9636.000000
mean      253.669884
std       149.259148
min         0.000000
25%       144.000000
50%       254.000000
75%       349.000000
max      1000.000000
Name: pages, dtype: float64

In [None]:
# Summarise the 'reviews' column
df['reviews'].describe()

count      9636.000000
mean        497.014529
std        2717.485172
min           0.000000
25%          21.000000
50%          71.000000
75%         235.250000
max      110042.000000
Name: reviews, dtype: float64

In [None]:
# Summarise the 'totalratings' column
df['totalratings'].describe()

count    9.636000e+03
mean     8.696948e+03
std      6.952732e+04
min      1.000000e+00
25%      2.450000e+02
50%      9.670000e+02
75%      3.320250e+03
max      3.099689e+06
Name: totalratings, dtype: float64

- Count: 9,636
- Mean: 8,696.95
- Std (Standard Deviation): 69,527.32
- Min: 1
- 25% (Q1): 245
- 50% (Median or Q2): 967
- 75% (Q3): 3,320.25
- Max: 3,099,689

In [None]:
# Report the record count before filtering on the number of ratings
print('The number of records is: ', len(df.index))

The number of records is:  9636


In [None]:
# Keep only the records with more than 1000 total ratings (we want a solid statistical sample).
# The filter is on the totalratings column, not on the reviews column.
min_total_ratings = 1000
df = df[df.totalratings > min_total_ratings]

In [None]:
# Report how many records have more than 1000 total ratings
print('The number of records is: ', len(df.index))

The number of records is:  4723


In [None]:
# Summarise the 'totalratings' column after the ratings filter
df['totalratings'].describe()

count    4.723000e+03
mean     1.738949e+04
std      9.856632e+04
min      1.001000e+03
25%      1.785000e+03
50%      3.424000e+03
75%      8.541000e+03
max      3.099689e+06
Name: totalratings, dtype: float64

- Count: 4,723
- Mean: 17,389.49
- Std (Standard Deviation): 98,566.32
- Min: 1,001
- 25% (Q1): 1,785
- 50% (Median or Q2): 3,424
- 75% (Q3): 8,541
- Max: 3,099,689

In [None]:
# Preview the first two records left after the ratings filter
df.head(n = 2)

Unnamed: 0,author,bookformat,desc,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
111,Robert Reed,Paperback,ship traveled universe longer crew recall true...,https://i.gr-assets.com/images/S/compressed.ph...,812566572,9780000000000.0,https://goodreads.com/book/show/100208.Marrow,512,3.82,110,Marrow,2051,Science Fiction,Fiction,Space,Space Opera,Mystery,Science Fiction Fantasy,Space,Science Fiction,Dystopia,Science Fiction,Hard Science Fiction,Science Fiction,Aliens,Speculative Fiction,,,,,,
139,"Wu Cheng'en,Arthur Waley,Hu Shih",Paperback,probably popular book history far east classic...,https://i.gr-assets.com/images/S/compressed.ph...,802130860,9780000000000.0,https://goodreads.com/book/show/100237.Monkey,306,4.02,552,Monkey: The Journey to the West,6147,Classics,Fiction,Fantasy,Cultural,China,Fantasy,Mythology,Asian Literature,Chinese Literature,Literature,Cultural,Asia,Novels,Adventure,,,,,,


In [None]:
# Number of records in the reduced sample
len(df.index)

4723

In [None]:
# Flag the records whose img value holds more than one whitespace-separated entry;
# non-string values are flagged False
df['multiple_img'] = [isinstance(url, str) and len(url.split()) > 1 for url in df['img']]

In [None]:
# Show which flag values actually occur
df['multiple_img'].unique()

array([False])

In [None]:
# Remove the helper flag column again
df = df.drop(columns = ['multiple_img'])

In [None]:
# Count the distinct img values (a missing value would count as one value)
df['img'].nunique(dropna = False)

4723

In [None]:
# Preview the first five records of the sample whose covers will be downloaded
df.head(n = 5)

Unnamed: 0,author,bookformat,desc,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18,genre_19
111,Robert Reed,Paperback,ship traveled universe longer crew recall true...,https://i.gr-assets.com/images/S/compressed.ph...,812566572.0,9780000000000.0,https://goodreads.com/book/show/100208.Marrow,512,3.82,110,Marrow,2051,Science Fiction,Fiction,Space,Space Opera,Mystery,Science Fiction Fantasy,Space,Science Fiction,Dystopia,Science Fiction,Hard Science Fiction,Science Fiction,Aliens,Speculative Fiction,,,,,,
139,"Wu Cheng'en,Arthur Waley,Hu Shih",Paperback,probably popular book history far east classic...,https://i.gr-assets.com/images/S/compressed.ph...,802130860.0,9780000000000.0,https://goodreads.com/book/show/100237.Monkey,306,4.02,552,Monkey: The Journey to the West,6147,Classics,Fiction,Fantasy,Cultural,China,Fantasy,Mythology,Asian Literature,Chinese Literature,Literature,Cultural,Asia,Novels,Adventure,,,,,,
160,Beth Fantaskey,Hardcover,one thing find vampire princess whole thing ac...,https://i.gr-assets.com/images/S/compressed.ph...,547393091.0,9780000000000.0,https://goodreads.com/book/show/10025007-jessi...,309,3.9,853,Jessica Rules the Dark Side,9807,Paranormal,Vampires,Young Adult,Romance,Fantasy,Paranormal,Fantasy,Romance,Paranormal Romance,Fantasy,Supernatural,Fantasy,Urban Fantasy,Fiction,Mystery,,,,,
236,Lindsay Buroker,Kindle Edition,adventure starts imperial law enforcer amarant...,https://i.gr-assets.com/images/S/compressed.ph...,,,https://goodreads.com/book/show/10031259-the-e...,324,4.04,1378,The Emperor's Edge,13185,Fantasy,Science Fiction,Steampunk,Mystery,Fiction,Adventure,Fantasy,Magic,Science Fiction,Romance,Adult,Fantasy,Urban Fantasy,,,,,,,
276,Larry Niven,Paperback,phssthpok pak traveling thousand years mission...,https://i.gr-assets.com/images/S/compressed.ph...,345353129.0,9780000000000.0,https://goodreads.com/book/show/100344.Protector,224,4.07,252,Protector,9596,Science Fiction,Fiction,Space,Space Opera,Science Fiction Fantasy,Space,Novels,Science Fiction,Aliens,Science Fiction,Hard Science Fiction,Speculative Fiction,Audiobook,,,,,,,


We want to add the cover images now that the sample is smaller. They can be analyzed later.

In [None]:
def download_images_and_process(df, url_column, output_folder, size = (100, 100), timeout = 30):
    """Download the cover image of every row in df, normalise it and record where it was saved.

    For each row the URL in url_column is fetched and written to
    '{output_folder}/image_{index}.jpg'. That file is then opened, converted to RGB,
    resized to size and saved as '{output_folder}/processed_image_{index}.jpg'.
    The processed path is written into the 'cover_image' column of df for that row,
    so df is modified in place. A row whose download or processing fails is reported
    with a printed message and skipped; the remaining rows are still processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame holding the image URLs; receives the 'cover_image' values.
    url_column : str
        Name of the column that contains the image URL.
    output_folder : str
        Folder for the downloaded and processed images; created if it does not exist.
    size : tuple of int, optional
        (width, height) the processed images are resized to. Defaults to (100, 100).
    timeout : float, optional
        Seconds to wait for the server before a download is given up. Defaults to 30.

    Returns
    -------
    None
    """
    # open() does not create missing folders, so make sure the target exists up front
    if output_folder:
        os.makedirs(output_folder, exist_ok = True)

    total_processed = 0  # Counter for the records processed successfully

    for index, row in df.iterrows():
        image_url = row[url_column]

        # A network problem with one image (timeout, refused connection, malformed URL)
        # must not abort the whole run, so it is reported and the row is skipped
        try:
            response = requests.get(image_url, timeout = timeout)
        except requests.RequestException as e:
            print(f'Failed to download image {index}: {str(e)}')
            continue

        if response.status_code != 200:
            print(f'Failed to download image {index}. Status code: {response.status_code}')
            continue

        try:
            # Downloaded image file path
            file_path = f'{output_folder}/image_{index}.jpg'

            # Save the image file
            with open(file_path, 'wb') as file:
                file.write(response.content)

            # Convert to RGB and resize to a consistent size; the with block closes the
            # downloaded file again once the converted copy has been created
            with Image.open(file_path) as downloaded_image:
                image = downloaded_image.convert('RGB').resize(size)

            # Save the processed image
            processed_file_path = f'{output_folder}/processed_image_{index}.jpg'
            image.save(processed_file_path)

            print(f'Image {index} downloaded and processed successfully. Saved to {processed_file_path}')

            # Update DataFrame with processed image path
            df.at[index, 'cover_image'] = processed_file_path

            # Increment the counter and report the running total
            total_processed += 1
            print(f'Total number of records processed so far: {total_processed}')

        except Exception as e:
            print(f'Error processing image {index}: {str(e)}')

    print(f'Total number of records processed: {total_processed}')

In [None]:
# Column that holds the cover image URLs
url_column = 'img'

# Folder that receives the downloaded and processed cover images
output_folder = 'cover_images'

# Download and process the cover of every record in the sample
download_images_and_process(df = df, url_column = url_column, output_folder = output_folder)

Image 111 downloaded and processed successfully. Saved to cover_images/processed_image_111.jpg
Total number of records processed so far: 1


Image 139 downloaded and processed successfully. Saved to cover_images/processed_image_139.jpg
Total number of records processed so far: 2
Image 160 downloaded and processed successfully. Saved to cover_images/processed_image_160.jpg
Total number of records processed so far: 3
Image 236 downloaded and processed successfully. Saved to cover_images/processed_image_236.jpg
Total number of records processed so far: 4
Image 276 downloaded and processed successfully. Saved to cover_images/processed_image_276.jpg
Total number of records processed so far: 5
Image 286 downloaded and processed successfully. Saved to cover_images/processed_image_286.jpg
Total number of records processed so far: 6
Image 301 downloaded and processed successfully. Saved to cover_images/processed_image_301.jpg
Total number of records processed so far: 7
Image 303 downloaded and processed successfully. Saved to cover_images/processed_image_303.jpg
Total number of records processed so far: 8
Image 309 downloaded and pr

In [None]:
# Write the final data frame to disk without the index column
df.to_csv(path_or_buf = 'book_w_images.csv', index = False)