In [1]:
# Multi-hot encoding is large, so it gets its own notebook to avoid an accidental kernel restart
#Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import os
from fuzzywuzzy import fuzz, process
import ast

In [2]:
# Read the books CSV into a DataFrame
file = 'books_nodup.csv'
books_df = pd.read_csv(filepath_or_buffer=file)

In [3]:
# Peek at the first three raw genre strings
print(books_df['genre'].iloc[:3])

0    history military history civil war american hi...
1            couture fashion historical art nonfiction
2                                     politics history
Name: genre, dtype: object


In [4]:
# Tokenise each whitespace-separated genre string into a list of genre words.
# A missing genre becomes an empty list, and a value that is already a list is
# kept as-is, so re-running this cell does not raise.
books_df['genre'] = books_df['genre'].apply(
    lambda x: x.split() if isinstance(x, str) else (x if isinstance(x, list) else [])
)

In [5]:
# Collect every distinct genre token that occurs in any book
genres = {token for tokens in books_df['genre'] for token in tokens}

In [6]:
# Turn the collection of tokens into an ascending-sorted list
genres = list(genres)
genres.sort()

In [7]:
# Show the full sorted vocabulary of genre tokens
print(str(genres))

['about', 'abuse', 'academia', 'academic', 'academics', 'acceptance', 'accounting', 'action', 'activism', 'adaptations', 'adolescence', 'adoption', 'adult', 'adventure', 'aeroplanes', 'africa', 'african', 'age', 'agriculture', 'ai', 'air', 'aircraft', 'airships', 'albanian', 'alchemy', 'alcohol', 'alexandria', 'algebra', 'algeria', 'algorithms', 'aliens', 'alternate', 'alternative', 'amateur', 'amazon', 'america', 'american', 'americana', 'americans', 'americas', 'amish', 'analysis', 'anarchism', 'ancient', 'and', 'angels', 'anglo', 'angola', 'animal', 'animals', 'anime', 'anthologies', 'anthropology', 'anthropomorphic', 'anti', 'antietam', 'antiquities', 'antisemitism', 'apocalyptic', 'apple', 'appomattox', 'archaeology', 'architecture', 'arithmetic', 'army', 'art', 'arthurian', 'artificial', 'arts', 'asexual', 'asia', 'asian', 'aspergers', 'astrology', 'astronomy', 'atheism', 'atlases', 'atmospheric', 'audiobook', 'australia', 'authors', 'autobiography', 'aviation', 'awards', 'babylo

In [8]:
# Define the genre hierarchy: each key is a broad category and its list holds
# the lowercase genre terms grouped under that category.
# NOTE(review): some terms are listed under more than one category
# ('adventure', 'fantasy', 'thriller', 'historical', 'academic').
# NOTE(review): a few entries contain a space ('science fiction',
# 'fairy tales', 'mythical creatures'); the genre column is tokenised with
# str.split() earlier in this notebook, so no single token can equal them.
genre_hierarchy = {
    'Action': ['adventure', 'combat', 'fantasy', 'horror', 'sci-fi', 'thriller'],
    'Academia': ['academic', 'academics', 'education', 'university', 'campus'],
    'Science': ['astronomy', 'biology', 'chemistry', 'physics', 'geology'],
    'History': ['ancient', 'medieval', 'modern', 'historical', 'prehistory'],
    'Art': ['art', 'architecture', 'arts', 'visual', 'crafts', 'design'],
    'Fiction': ['fantasy', 'science fiction', 'mystery', 'romance', 'thriller'],
    'Non-Fiction': ['biography', 'autobiography', 'memoir', 'self-help', 'essay'],
    'Culture': ['cultural', 'society', 'ethnic', 'religion', 'spirituality'],
    'Entertainment': ['movies', 'tv', 'comics', 'music', 'games', 'sports'],
    'Technology': ['computers', 'programming', 'technology', 'engineering'],
    'Travel': ['travel', 'adventure', 'tourism', 'exploration'],
    'Politics': ['political', 'history', 'government', 'sociology'],
    'Social Issues': ['activism', 'acceptance', 'disability', 'feminism'],
    'Lifestyle': ['health', 'fitness', 'food', 'fashion', 'home'],
    'Mythology': ['mythology', 'legends', 'fairy tales', 'fantasy'],
    'Religion': ['christianity', 'islam', 'hinduism', 'buddhism', 'paganism'],
    'Languages': ['english', 'french', 'spanish', 'german', 'latin'],
    'Geography': ['asia', 'europe', 'africa', 'americas', 'oceania'],
    'Science Fiction': ['cyberpunk', 'dystopia', 'space', 'aliens', 'robots'],
    'Fantasy': ['dragons', 'magic', 'fairies', 'mythical creatures'],
    'Romance': ['love', 'relationship', 'romantic', 'erotic'],
    'Mystery': ['detective', 'thriller', 'whodunit', 'crime'],
    'Historical Fiction': ['historical', 'biographical', 'period'],
    'Young Adult': ['teen', 'yaoi', 'yuri'],
    'Hobbies': ['gardening', 'knitting', 'beading', 'cooking', 'collecting'],
    'Animals': ['dogs', 'cats', 'birds', 'wildlife'],
    'Education': ['teaching', 'learning', 'academic', 'curriculum']
}


In [9]:
# Invert the hierarchy into a flat term -> category lookup.
# A term listed under several categories keeps the category that comes last
# in genre_hierarchy, because later entries overwrite earlier ones.
term_to_category = {
    term: category
    for category, terms in genre_hierarchy.items()
    for term in terms
}

In [10]:
# Function to map genres to categories
def map_genres(genre_list):
    """Return the set of categories that a book's genre tokens belong to.

    A token contributes every category of ``genre_hierarchy`` that lists it,
    so a term shared by several categories (e.g. 'fantasy') sets all of them
    rather than only the single category stored in ``term_to_category``.

    Parameters
    ----------
    genre_list : iterable of str
        Genre tokens of one book.

    Returns
    -------
    set of str
        Category names; includes 'Unknown' when at least one token is not a
        key of ``term_to_category``. Empty for an empty ``genre_list``.
    """
    tokens = set(genre_list)
    categories = {
        category
        for category, terms in genre_hierarchy.items()
        if not tokens.isdisjoint(terms)
    }
    # Tokens the hierarchy does not list are still reported as 'Unknown'.
    if any(token not in term_to_category for token in tokens):
        categories.add('Unknown')
    return categories

In [11]:
# Derive a set-valued 'category' column from each book's genre tokens
books_df['category'] = books_df['genre'].map(map_genres)

In [12]:
# Show the first five rows, now including the 'category' column
print(books_df.head(5))

                                               title  \
0  Between Two Fires: American Indians in the Civ...   
1                           Fashion Sourcebook 1920s   
2                                         Hungary 56   
3  All-American Anarchist: Joseph A. Labadie and ...   
4  The Human Equation: Building Profits by Puttin...   

                             author  \
0              Laurence M. Hauptman   
1  Charlotte Fiell,Emmanuelle Dirix   
2                     Andy Anderson   
3              Carlotta R. Anderson   
4                   Jeffrey Pfeffer   

                                                desc  \
0  Reveals that several hundred thousand Indians ...   
1  Fashion Sourcebook - 1920s is the first book i...   
2  The seminal history and analysis of the Hungar...   
3  "All-American Anarchist" chronicles the life a...   
4  Why is common sense so uncommon when it comes ...   

                                               genre    rating  \
0  [history, military, hi

In [13]:
# Get all unique categories as a sorted list.
# Sorting fixes the column order of the encoded output; iterating a set of
# strings directly can yield a different order in every kernel session.
all_categories = sorted({cat for cats in books_df['category'] for cat in cats})

In [14]:
# Start from an all-zero indicator table: one row per book, one column per category
multi_hot_df = pd.DataFrame(data=0, columns=all_categories, index=books_df.index)

In [17]:
# Populate the multi-hot encoding DataFrame.
# Flags are written by position into a plain 2-D array and the frame is rebuilt
# once, instead of issuing one label-based `.loc` assignment per book.
# Rows of books_df['category'] and multi_hot_df are paired positionally.
column_position = {name: pos for pos, name in enumerate(multi_hot_df.columns)}
flags = multi_hot_df.to_numpy(copy=True)
for row_position, categories in enumerate(books_df['category']):
    for name in categories:
        flags[row_position, column_position[name]] = 1
multi_hot_df = pd.DataFrame(flags, index=multi_hot_df.index, columns=multi_hot_df.columns)

In [18]:
# Attach the indicator columns to the right of the book columns
result_df = pd.concat(objs=[books_df, multi_hot_df], axis='columns')

In [19]:
# Remove the set-valued 'category' helper column from the combined frame
result_df = result_df.drop(labels=['category'], axis='columns')

In [20]:
# Show the combined frame (book columns followed by the indicator columns)
print(str(result_df))

                                                   title  \
0      Between Two Fires: American Indians in the Civ...   
1                               Fashion Sourcebook 1920s   
2                                             Hungary 56   
3      All-American Anarchist: Joseph A. Labadie and ...   
4      The Human Equation: Building Profits by Puttin...   
...                                                  ...   
89516                                     The Sea Inside   
89517                                  A Horse for Angel   
89518  A Faith Worth Sharing: A Lifetime of Conversat...   
89519  A Volcano Beneath the Snow: John Brown's War A...   
89520  Paranormal Nation: Why America Needs Ghosts, U...   

                                 author  \
0                  Laurence M. Hauptman   
1      Charlotte Fiell,Emmanuelle Dirix   
2                         Andy Anderson   
3                  Carlotta R. Anderson   
4                       Jeffrey Pfeffer   
...                  

In [21]:
# Write the encoded table to disk without the row index
result_df.to_csv(path_or_buf='encoded.csv', index=False)