# The task

The "goodbooks" dataset contains six million ratings for the ten
thousand most popular books. It includes data such as books marked
"to-read" by users, book metadata (author, year, etc.), and
tags/shelves/genres. For this task, we would like you to cluster similar
books by considering both interaction data (user ratings) and metadata.
Although the dataset includes some metadata (e.g., author and title),
feel free to collect additional data from the web or generate new data.
Additionally, please propose a method to automatically label the
clusters in a way that reflects their semantic content. We encourage
creativity in this process and are not setting strict guidelines for
clustering.

Link to the dataset: https://github.com/zygmuntz/goodbooks-10k

What to Deliver
a) Presentation: A 10-minute presentation where you explain your solution.
b) Demo: A demonstration showing the clusters in a low-dimensional space.

# 1) Exploration

Covered in the *quick_look.ipynb* notebook.

# 2) Clustering

In [1]:
#!pip install umap-learn

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import re


def load_goodbooks_data(base_path):
    """Load the Goodbooks-10k CSV files and return cleaned books, ratings and tags.

    Reads ``books.csv``, ``ratings.csv``, ``book_tags.csv`` and ``tags.csv``
    from ``base_path``. Tag names are normalised (hyphens, digits and
    whitespace removed, lower-cased), compound tags are split into their
    parts, synonyms are mapped onto one canonical spelling and
    non-informative shelf names are dropped.

    Args:
        base_path: Directory containing the four CSV files.

    Returns:
        A ``(books, ratings, tags)`` tuple. ``books`` is indexed by
        ``book_id``; ``ratings`` is returned as read; ``tags`` has one row per
        ``(goodreads_book_id, tag_name)`` with the summed ``count`` and
        ``count_normalized`` (the tag's share of that book's remaining tag
        count).
    """

    # Synonym -> canonical spelling
    default_mapping = {
        # Children's books
        'childrens': 'childrens',
        'children': 'childrens',
        'childrensbooks': 'childrens',
        'kids': 'childrens',
        'kidsbooks': 'childrens',

        # Young Adult
        'youngadult': 'youngadult',
        'ya': 'youngadult',
        'newadult': 'youngadult',

        # Science Fiction
        'scifi': 'scifi',
        'sciencefiction': 'scifi',

        # Classics
        'classics': 'classics',
        'classic': 'classics',

        # Humour
        'humour': 'humour',
        'humor': 'humour',

        # History
        'history': 'history',
        'historical': 'history',
        'historic': 'history',

        # Audiobook
        'audiobook': 'audiobook',
        'audiobooks': 'audiobook',
        'audio': 'audiobook',
        'audible': 'audiobook',

        # Ebook
        'ebook': 'ebook',
        'ebooks': 'ebook',
        'kindle': 'ebook',

        # Favourite
        'favourite': 'favourite',
        'favourites': 'favourite',
        'favorite': 'favourite',
        'favorites': 'favourite',
        'alltimefavorites': 'favourite',
        'favoritebooks': 'favourite',
        'shelfarifavorites': 'favourite',

        # Graphicnovel
        'graphicnovel': 'graphicnovel',
        'graphicnovels': 'graphicnovel',

        # Memoir
        'memoir': 'memoir',
        'memoirs': 'memoir',

        # Fiction ('literaryfiction' is handled by the compound splits below,
        # which take precedence over this table)
        'fiction': 'fiction',
        'generalfiction': 'fiction',
        'toreadfiction': 'fiction',

        # Fantasy
        'fantasy': 'fantasy',
        'epicfantasy': 'fantasy',

        # Biography
        'biography': 'biography',
        'biographies': 'biography',
    }

    # Compound tags that are replaced by several simpler tags
    default_compound_splits = {
        'historicalfiction': ['history', 'fiction'],
        'adultfiction': ['adult', 'fiction'],
        'scififantasy': ['scifi', 'fantasy'],
        'fantasyscifi': ['scifi', 'fantasy'],
        'paranormalromance': ['paranormal', 'romance'],
        'contemporaryromance': ['contemporary', 'romance'],
        'historicalromance': ['history', 'romance'],
        'urbanfantasy': ['urban', 'fantasy'],
        'literaryfiction': ['literary', 'fiction'],
        'romanticfantasy': ['romance', 'fantasy'],
        'psychologicalthriller': ['psychological', 'thriller'],
        'crimefiction': ['crime', 'fiction'],
        'darkfantasy': ['dark', 'fantasy'],
        'mysterythriller': ['mystery', 'thriller'],
        'contemporaryfiction': ['contemporary', 'fiction'],
        'christianfiction': ['christian', 'fiction'],
        'yafiction': ['youngadult', 'fiction'],
        'audiobiography': ['audio', 'biography'],
    }

    # Tags that carry no information about the book's content
    default_exclude_tags = {
        'books', 'literature', 'default', 'favourite', # general
        'toread', 'currentlyreading', 'readin', 'read', 'reads', 'reread', # read related
        'owned', 'booksiown', 'ownedbooks', 'iown', 'ownit', 'have', # own related
        'library', 'bookclub', 'mylibrary', 'mybooks', # library related
        'tobuy', 'wishlist', # buy related
        'maybe', 'borrowed', 'thcentury', '', 'dnf', 'sf', 's', #others
        'audiobook', 'ebook', #format
        'didnotfinish', 'unfinished', 'didntfinish', 'finished', # finish related
    }

    def normalize_and_split_tag(tag):
        """Return the list of canonical tag names a raw tag name stands for."""
        tag = str(tag).replace('-', '')
        tag = ''.join(c for c in tag if not c.isdigit())
        tag = ''.join(tag.split()).lower()

        # Split compounds first, then canonicalise every part, so that e.g.
        # the 'audio' half of 'audiobiography' becomes 'audiobook' and is
        # caught by the exclusion list like any other format tag.
        parts = default_compound_splits.get(tag, [tag])
        return [default_mapping.get(part, part) for part in parts]

    # Load the data
    books = pd.read_csv(f"{base_path}/books.csv")
    ratings = pd.read_csv(f"{base_path}/ratings.csv")
    tags = pd.read_csv(f"{base_path}/book_tags.csv")
    tag_names = pd.read_csv(f"{base_path}/tags.csv")

    # Store original statistics
    original_tag_count = len(tag_names['tag_name'].unique())

    # Normalise each distinct tag name once (instead of once per book/tag row),
    # expand split tags into one row per part and drop the excluded ones.
    tag_names = tag_names[['tag_id', 'tag_name']].copy()
    tag_names['tag_name'] = tag_names['tag_name'].map(normalize_and_split_tag)
    tag_names = tag_names.explode('tag_name')
    tag_names = tag_names[~tag_names['tag_name'].isin(default_exclude_tags)]

    # Attach the cleaned names to the per-book tag counts
    tags = tags.merge(tag_names, on='tag_id')[['goodreads_book_id', 'tag_name', 'count']]

    # Several raw tags can collapse onto the same canonical name: add them up
    tags = tags.groupby(['goodreads_book_id', 'tag_name'])['count'].sum().reset_index()

    # Express every count as a share of the book's total tag count
    tags['count_normalized'] = (
        tags['count'] / tags.groupby('goodreads_book_id')['count'].transform('sum')
    )

    # Set index for books DataFrame
    books = books.set_index('book_id')

    # Print statistics
    print(f"\nFinal tag statistics:")
    print(f"Original number of unique tags: {original_tag_count}")
    print(f"Number of unique tags after processing: {len(tags['tag_name'].unique())}")
    print('\n')

    return books, ratings, tags


def preprocess_ratings(ratings_df, books_df, n_components=100):
    """Derive latent rating factors and per-book rating statistics.

    Args:
        ratings_df: Ratings with ``book_id``, ``user_id`` and ``rating`` columns.
        books_df: Books table; its index supplies the row order of both outputs.
        n_components: Upper bound on the number of SVD components; it is
            clamped to ``min(n_books, n_users) - 1``.

    Returns:
        ``(latent_factors, rating_stats)``: a NumPy array with one row per
        entry of ``books_df.index`` and a DataFrame on the same index with
        ``rating_mean``, ``rating_std`` and ``rating_count`` (0 where a book
        has no ratings).
    """
    # Local import: scipy is not imported at the top of the file.
    from scipy.sparse import csr_matrix

    # A dense books x users table is almost entirely zeros and very large, so
    # build the matrix in sparse form (TruncatedSVD accepts sparse input).
    # csr_matrix would *sum* repeated (book, user) pairs, hence keep only the
    # last rating of each pair.
    pairs = ratings_df.drop_duplicates(subset=['book_id', 'user_id'], keep='last')

    book_rows = books_df.index.get_indexer(pairs['book_id'])  # -1: book not in books_df
    user_cols, user_ids = pd.factorize(pairs['user_id'], sort=True)  # -1: missing user id
    usable = (book_rows >= 0) & (user_cols >= 0)

    rating_values = pairs['rating'].fillna(0).to_numpy(dtype=float)
    user_item_matrix = csr_matrix(
        (rating_values[usable], (book_rows[usable], user_cols[usable])),
        shape=(len(books_df.index), len(user_ids)),
    )

    # Compute latent factors
    svd = TruncatedSVD(n_components=min(n_components, min(user_item_matrix.shape) - 1))
    latent_factors = svd.fit_transform(user_item_matrix)

    # Rating statistics in a single pass; books without ratings get zeros
    per_book = ratings_df.groupby('book_id')['rating'].agg(['mean', 'std', 'count'])
    rating_stats = (
        per_book.reindex(books_df.index)
        .rename(columns={'mean': 'rating_mean', 'std': 'rating_std', 'count': 'rating_count'})
        .fillna(0)
    )

    return latent_factors, rating_stats


def process_metadata(books_df, tags_df, n_tags=200, n_tag_components=20, n_authors=200, n_author_components=20):
    """Build tag, author and numerical feature tables aligned with ``books_df``.

    Args:
        books_df: Books table. Must contain ``authors`` and the numerical
            columns listed below; if it has a ``goodreads_book_id`` column,
            that column is used to translate tag rows onto the index.
            Missing ``authors`` values are filled in place with
            ``'Unknown Author'``.
        tags_df: Tag table with ``goodreads_book_id``, ``tag_name`` and
            ``count_normalized`` columns.
        n_tags: Number of most common tags to keep.
        n_tag_components: Upper bound on SVD components for the tag matrix.
        n_authors: Number of most frequent authors to keep.
        n_author_components: Upper bound on SVD components for the author matrix.

    Returns:
        ``(tag_matrix_reduced, authors_reduced, numerical_features)``, three
        DataFrames indexed like ``books_df``. None of them is scaled here.
    """

    def _clamp_components(requested, n_features):
        # Keep the component count strictly below the feature count when possible
        return max(1, min(requested, n_features - 1))

    # ---- Tags ----
    tag_agg = tags_df.groupby(['goodreads_book_id', 'tag_name'])['count_normalized'].sum().reset_index()

    # Tag rows are keyed by goodreads_book_id, whereas books_df is indexed by
    # its own index (book_id in main). Translate the key before aligning;
    # reindexing goodreads ids with book ids attaches tags to the wrong books.
    if 'goodreads_book_id' in books_df.columns:
        to_index = pd.Series(
            books_df.index.to_numpy(),
            index=books_df['goodreads_book_id'].to_numpy(),
        )
        to_index = to_index[~to_index.index.duplicated()]
        tag_agg['book_key'] = tag_agg['goodreads_book_id'].map(to_index)
        tag_agg = tag_agg.dropna(subset=['book_key'])
        tag_agg['book_key'] = tag_agg['book_key'].astype(books_df.index.dtype)
    else:
        # No translation column available: assume the ids already match the index
        tag_agg['book_key'] = tag_agg['goodreads_book_id']

    # Get top n most common tags
    top_tags = tag_agg.groupby('tag_name')['count_normalized'].sum().nlargest(n_tags).index
    tag_agg_filtered = tag_agg[tag_agg['tag_name'].isin(top_tags)]

    # Books x tags matrix; books without any kept tag become all-zero rows
    tag_matrix = tag_agg_filtered.pivot(
        index='book_key',
        columns='tag_name',
        values='count_normalized'
    )
    tag_matrix = tag_matrix.reindex(books_df.index, fill_value=0).fillna(0)

    # Reduce dimensionality of tag matrix using TruncatedSVD
    tag_svd = TruncatedSVD(n_components=_clamp_components(n_tag_components, tag_matrix.shape[1]))
    tag_reduced = tag_svd.fit_transform(tag_matrix)

    tag_matrix_reduced = pd.DataFrame(
        tag_reduced,
        index=tag_matrix.index,
        columns=[f'tag_component_{i}' for i in range(tag_reduced.shape[1])]
    )

    # ---- Authors ----
    books_df['authors'] = books_df['authors'].fillna('Unknown Author')

    # One row per (book, author). Strip the blank that follows each comma so
    # "A, B" and "B" refer to the same author B.
    author_rows = books_df['authors'].str.split(',').explode().str.strip()
    top_authors = author_rows.value_counts().nlargest(n_authors).index

    # One-hot encode only the kept authors, then collapse back to one row per book
    kept_author_rows = author_rows[author_rows.isin(top_authors)]
    authors = pd.get_dummies(kept_author_rows).groupby(level=0).sum()
    authors = authors.reindex(books_df.index, fill_value=0).fillna(0)

    # Reduce dimensionality of author matrix using TruncatedSVD
    author_svd = TruncatedSVD(n_components=_clamp_components(n_author_components, authors.shape[1]))
    author_reduced = author_svd.fit_transform(authors)

    authors_reduced = pd.DataFrame(
        author_reduced,
        index=authors.index,
        columns=[f'author_component_{i}' for i in range(author_reduced.shape[1])]
    )

    # ---- Numerical features ----
    numerical_cols = ['average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count', 'original_publication_year']
    numerical_features = books_df[numerical_cols].copy()

    # Median for rating/year, zero for the count columns
    defaults = {
        'average_rating': numerical_features['average_rating'].median(),
        'ratings_count': 0,
        'work_ratings_count': 0,
        'work_text_reviews_count': 0,
        'original_publication_year': numerical_features['original_publication_year'].median()
    }
    numerical_features = numerical_features.fillna(value=defaults)

    # Log transform the heavy-tailed count columns
    log_columns = ['ratings_count', 'work_ratings_count', 'work_text_reviews_count']
    numerical_features[log_columns] = np.log1p(numerical_features[log_columns])

    return tag_matrix_reduced, authors_reduced, numerical_features


def combine_features(latent_factors, rating_stats, tag_matrix, authors, numerical_features, active_features):
    """Concatenate the selected feature groups, scale them and embed them in 3D.

    Args:
        latent_factors: Array or DataFrame of rating latent factors; an array
            is wrapped in a DataFrame on ``rating_stats.index``.
        rating_stats: Per-book rating statistics.
        tag_matrix: Tag features.
        authors: Author features.
        numerical_features: Numerical book features.
        active_features: Mapping from group name (``'latent_factors'``,
            ``'rating_stats'``, ``'tags'``, ``'authors'``, ``'numerical'``) to
            a number; a group is used when its value is ``> 0``. The value
            acts as an on/off flag only. ``None`` selects every group.

    Returns:
        ``(reduced_features, combined, index)``: the 3-component UMAP
        embedding, the scaled combined feature DataFrame and its index.

    Raises:
        ValueError: If no feature group is selected.
    """
    if not isinstance(latent_factors, pd.DataFrame):
        latent_factors = pd.DataFrame(
            latent_factors,
            index=rating_stats.index,
            columns=[f'latent_factor_{i}' for i in range(latent_factors.shape[1])],
        )

    base_features = {
        'latent_factors': latent_factors,
        'rating_stats': rating_stats,
        'tags': tag_matrix,
        'authors': authors,
        'numerical': numerical_features
    }

    if active_features is None:
        # No selection given: use everything
        selected = dict(base_features)
    else:
        selected = {
            name: df for name, df in base_features.items()
            if name in active_features and active_features[name] > 0
        }

    if not selected:
        raise ValueError(
            "No feature group is active; enable at least one of "
            f"{list(base_features)} in active_features."
        )

    # Combine features (column labels become (group, column) pairs)
    combined = pd.concat(selected, axis=1)
    combined = combined.fillna(0)

    # RobustScaler: median/IQR based, so outliers distort the scale less
    scaler = RobustScaler()
    combined = pd.DataFrame(
            scaler.fit_transform(combined),
            index=combined.index,
            columns=combined.columns
    )

    # Use UMAP with 3 components
    reducer = umap.UMAP(
        n_neighbors=30,
        min_dist=0.2,
        n_components=3,
        metric='euclidean'
    )

    reduced_features = reducer.fit_transform(combined)
    return reduced_features, combined, combined.index


def cluster_books(features, min_clusters=5, max_clusters=30, step=1):
    """Run KMeans for a range of cluster counts and return the best labelling.

    Each candidate ``k`` is scored with ``silhouette - davies_bouldin``
    (silhouette is better when higher, Davies-Bouldin when lower); the labels
    of the highest-scoring run are returned.

    Args:
        features: Feature matrix to cluster.
        min_clusters: Smallest number of clusters to try.
        max_clusters: Largest number of clusters to try (inclusive).
        step: Increment between tried cluster counts.

    Returns:
        Array of cluster labels, one per row of ``features``.

    Raises:
        ValueError: If the range of cluster counts is empty.
    """
    candidate_ks = range(min_clusters, max_clusters + 1, step)
    if len(candidate_ks) == 0:
        raise ValueError(
            f"No cluster counts to try: min_clusters={min_clusters}, "
            f"max_clusters={max_clusters}, step={step}"
        )

    best_score = None
    best_clusters = None

    for k in candidate_ks:
        kmeans = KMeans(
            n_clusters=k,
            init='k-means++',
            n_init=15
        )
        clusters = kmeans.fit_predict(features)

        # Combine metrics (higher is better)
        combined_score = (
            silhouette_score(features, clusters)
            - davies_bouldin_score(features, clusters)
        )

        # Keep the labels that were actually scored. Re-fitting the winning
        # model would yield a different, unevaluated clustering because KMeans
        # is randomly initialised. Strict '>' keeps the first of tied scores.
        if best_score is None or combined_score > best_score:
            best_score = combined_score
            best_clusters = clusters

    return best_clusters


def get_distinctive_cluster_labels(books_df, clusters, tags_df, cluster_index):
    """Label every cluster as "<top tag> <second tag> <median year>".

    Args:
        books_df: Books table containing ``original_publication_year``. If it
            has a ``goodreads_book_id`` column, that column is used to look up
            tags; otherwise the index is used.
        clusters: Cluster id per book, in the order of ``cluster_index``.
        tags_df: Tag table with ``goodreads_book_id``, ``tag_name`` and
            ``count`` columns.
        cluster_index: Index labels (into ``books_df``) of the clustered books.

    Returns:
        Dict mapping each cluster id in ``0 .. clusters.max()`` to its label.
    """

    clustered_books = books_df.loc[cluster_index].copy()
    clustered_books['cluster'] = clusters

    # Runner-up tags containing one of these words are considered too generic.
    # NOTE(review): substring match, so e.g. 'nonfiction' is filtered as well.
    exclude_generic = ['fiction', 'classics']
    generic_pattern = '|'.join(exclude_generic)

    def get_cluster_theme(cluster_id):
        """Build the label of one cluster from its tags and publication years."""
        cluster_books = clustered_books[clustered_books['cluster'] == cluster_id]

        # tags_df is keyed by goodreads_book_id, which differs from the
        # books_df index when that index is book_id.
        if 'goodreads_book_id' in cluster_books.columns:
            tag_keys = cluster_books['goodreads_book_id']
        else:
            tag_keys = cluster_books.index

        cluster_tags = tags_df[tags_df['goodreads_book_id'].isin(tag_keys)]
        # nlargest returns the tags sorted by descending total count
        top_tags = cluster_tags.groupby('tag_name')['count'].sum().nlargest(10)

        primary_tag = top_tags.index[0] if len(top_tags) > 0 else 'Unknown'

        secondary_tag = 'Unknown'
        if len(top_tags) > 1:
            runners_up = top_tags.iloc[1:]
            specific = runners_up[~runners_up.index.str.contains(generic_pattern, case=False)]
            # If every runner-up is generic, fall back to the most common one
            # rather than failing on an empty selection.
            secondary_tag = specific.index[0] if len(specific) > 0 else runners_up.index[0]

        # Time period: median publication year, shown as a whole number
        median_year = cluster_books['original_publication_year'].median()
        year_text = 'Unknown' if pd.isna(median_year) else str(int(round(median_year)))

        return " ".join([primary_tag.title(), secondary_tag.title(), year_text])

    # Generate labels for all clusters
    return {
        cluster_id: get_cluster_theme(cluster_id)
        for cluster_id in range(clusters.max() + 1)
    }

    
def visualize_clusters_3d_with_labels(reduced_features, clusters, books_df, cluster_index, cluster_labels):
    """Build an interactive 3D scatter plot of the books, coloured by cluster.

    Args:
        reduced_features: Three-column embedding, one row per clustered book.
        clusters: Cluster id per row of ``reduced_features``.
        books_df: Books table providing ``title``, ``authors`` and
            ``average_rating`` for the hover text.
        cluster_index: Index labels (into ``books_df``) of the plotted books.
        cluster_labels: Mapping from cluster id to its textual label.

    Returns:
        The plotly figure.
    """
    plotted_books = books_df.loc[cluster_index]

    viz_df = pd.DataFrame(
        reduced_features,
        index=cluster_index,
        columns=['UMAP1', 'UMAP2', 'UMAP3'],
    )
    viz_df['Cluster'] = clusters
    # Hover columns, aligned on the book index
    for shown_name, source_column in (
        ('Title', 'title'),
        ('Author', 'authors'),
        ('Rating', 'average_rating'),
    ):
        viz_df[shown_name] = plotted_books[source_column]

    # Legend entry per cluster: semantic label plus cluster size
    cluster_sizes = pd.Series(clusters).value_counts()
    legend_entries = {
        cluster_id: f"{cluster_labels[cluster_id]} (n={cluster_sizes[cluster_id]})"
        for cluster_id in cluster_sizes.index
    }
    viz_df['Cluster_Label'] = viz_df['Cluster'].map(legend_entries)

    fig = px.scatter_3d(
        viz_df,
        x='UMAP1',
        y='UMAP2',
        z='UMAP3',
        color='Cluster_Label',
        hover_data=['Title', 'Author', 'Rating'],
        title='3D Book Clusters Visualization with Semantic Labels',
        color_discrete_sequence=px.colors.qualitative.Set3,
        opacity=0.6,
    )

    axis_titles = {
        'xaxis_title': 'UMAP Dimension 1',
        'yaxis_title': 'UMAP Dimension 2',
        'zaxis_title': 'UMAP Dimension 3',
    }
    # Legend anchored in the top-left corner of the figure
    legend_position = {'yanchor': "top", 'y': 0.99, 'xanchor': "left", 'x': 0.01}
    fig.update_layout(
        scene=axis_titles,
        width=1200,
        height=800,
        legend=legend_position,
    )

    return fig

    
# End-to-end pipeline: load, featurise, cluster, label and plot
def main(base_path, active_features=None):
    """Run the whole clustering pipeline and print the tag-based cluster labels.

    Args:
        base_path: Directory containing the Goodbooks-10k CSV files.
        active_features: Mapping from feature-group name
            (``'latent_factors'``, ``'rating_stats'``, ``'tags'``,
            ``'authors'``, ``'numerical'``) to a number; groups with a value
            ``> 0`` are used. ``None`` enables every group.

    Returns:
        ``(books, reduced_features, clusters, plots, common_index,
        cluster_labels)`` where ``plots`` is a dict holding the 3D figure
        under ``'cluster_3d'``.
    """
    if active_features is None:
        # Passing None on to combine_features would fail; default to all groups
        active_features = {
            'latent_factors': 1,
            'rating_stats': 1,
            'tags': 1,
            'authors': 1,
            'numerical': 1,
        }

    # Load data
    books, ratings, tags = load_goodbooks_data(base_path)

    # Process ratings data
    latent_factors, rating_stats = preprocess_ratings(ratings, books)

    # Process metadata (tags, authors, numerical columns)
    tag_matrix, authors, numerical_features = process_metadata(books, tags)

    # Combine features
    reduced_features, combined_features, common_index = combine_features(
        latent_factors,
        rating_stats,
        tag_matrix,
        authors,
        numerical_features,
        active_features
    )

    # Perform clustering on the 3D embedding
    clusters = cluster_books(reduced_features)
    print(f"\n\n\nNumber of clusters: {len(np.unique(clusters))}")

    # Generate cluster labels
    cluster_labels = get_distinctive_cluster_labels(books, clusters, tags, common_index)

    # Create visualizations with semantic labels
    cluster_3d = visualize_clusters_3d_with_labels(
        reduced_features, clusters, books,
        common_index, cluster_labels
    )

    # Print cluster descriptions
    print("\nCluster Descriptions:")
    for cluster_id, label in cluster_labels.items():
        size = (clusters == cluster_id).sum()
        print(f"\nCluster {cluster_id} ({size} books): {label}")

    return books, reduced_features, clusters, {
        'cluster_3d': cluster_3d,
    }, common_index, cluster_labels

In [None]:
# Final configuration: interaction factors + tags + authors
active_features = {'latent_factors': 1, 'rating_stats': 0, 'tags': 1, 'authors': 1, 'numerical': 0}

base_path = "."
books, reduced_features, clusters, plots, common_index, labels = main(base_path, active_features)
plots['cluster_3d'].show()

# For every cluster, list its ten books with the most ratings
clustered_books = books.loc[common_index]
for cluster_id, label in labels.items():
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {label}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())


Final tag statistics:
Original number of unique tags: 34252
Number of unique tags after processing: 31627




In this final solution, we can see fairly meaningful clusters. A few examples that should be familiar even to someone who isn't interested in literature: cluster 1 contains the Harry Potter series and similar books, cluster 4 literally looks like a list of required classic literature for high school, cluster 10 contains mainly self-help books, cluster 12 horror books, cluster 13 children's books and cluster 19 manga.

The most challenging part was evaluation. While it is fairly straightforward to evaluate the shape and density of clusters mathematically, it is much harder to express mathematically whether the clusters make semantic sense. Therefore, it was necessary to evaluate every run manually.

*PS: The TF-IDF experiments with the title feature mentioned in the presentation were removed, as they were increasing the computational complexity while not improving the results at all.*

# 3) Labelling

In the previous part, we already tried tag-based labelling. However, the most common tags are not very informative, so we will experiment with LLM-based labelling instead. I will take a sample of books from each cluster (the ten with the most ratings) and prompt ChatGPT via its API to reply with a brief description and a label.

In [None]:
# !pip install openai==0.28
# !pip install --upgrade typing_extensions

In [None]:
# One prompt fragment per cluster: "<title> by <authors>" for its ten most-rated books
prompt_strings = []
clustered_books = books.loc[common_index]
for cluster_id in labels:
    books_in_cluster = clustered_books[clusters == cluster_id]
    example_books = books_in_cluster.nlargest(10, 'ratings_count')
    books_string = [
        f"{title} by {authors}"
        for title, authors in zip(example_books['title'], example_books['authors'])
    ]
    prompt_strings.append(", ".join(books_string))

# prompt_strings

In [None]:
import openai

# Take the key from the OPENAI_API_KEY environment variable so it never has to
# be pasted into the notebook; fall back to the placeholder when it is not set.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "placeholder")

# Function to communicate with the API
def get_llm_description(prompt, model="gpt-3.5-turbo", temperature=0.15, max_tokens=200):
    """Send ``prompt`` as a single user message and return the model's reply.

    Any exception is caught and reported in the returned string
    (``"An error occurred: ..."``) instead of being raised.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
        first_choice = completion['choices'][0]
        return first_choice['message']['content']
    except Exception as err:
        return f"An error occurred: {err}"

In [None]:
# Get cluster descriptions
# Ask the LLM for a short description of every cluster (same order as prompt_strings)
prompt_instruction = "I will give you the names of 10 books and your task is to concisely describe the group of books. Do not include the name of any of the authors. Do not include the name of any of the authors! Describe using genres or period when they were written that describes majority of them. Use 8 words at maximum per group, not per book. Here is the group: "
cluster_descriptions = [
    get_llm_description(prompt_instruction + books_in_cluster)
    for books_in_cluster in prompt_strings
]

# print(cluster_descriptions)

In [None]:
# Get cluster labels
# Ask the LLM for a label of at most two words per cluster (same order as prompt_strings)
prompt_instruction = "I will give you the names of 10 books and your task is to concisely assign a label to the group. The label should be based on the genre of majority of the books in the group. Use 2 words at maximum per group, not per book. Here is the group: "
cluster_labels = [
    get_llm_description(prompt_instruction + books_in_cluster)
    for books_in_cluster in prompt_strings
]

# print(cluster_labels)

In [None]:
plots['cluster_3d'].show()

# LLM label and description per cluster, followed by its ten most-rated books.
# cluster_labels / cluster_descriptions are lists in the iteration order of `labels`.
clustered_books = books.loc[common_index]
for i, cluster_id in enumerate(labels):
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {cluster_labels[i]} - {cluster_descriptions[i]}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())

The LLM-generated labels and descriptions seem to be much more descriptive than the tag-based ones, which also makes the manual evaluation of the clusters' semantic quality somewhat easier. Some prompt engineering might improve them further, but I'm satisfied with these and will call this my final solution.

# Archive of some auxiliary experiments
## Determining the number of components for the ratings (user–book) SVD

In [None]:
books_df, ratings_df, _ = load_goodbooks_data(base_path)

# Books x users rating table; unrated cells and books without ratings become 0
ratings_wide = ratings_df.pivot(index='book_id', columns='user_id', values='rating')
user_item_matrix = ratings_wide.fillna(0).reindex(books_df.index, fill_value=0)

# Fit a large SVD once and read off the explained variance of each component
svd = TruncatedSVD(n_components=1000)
svd.fit(user_item_matrix)

# Running total of the explained variance ratio over the components
cumulative_variance_ratio = svd.explained_variance_ratio_.cumsum()

plt.plot(cumulative_variance_ratio)
plt.title('Explained Variance vs Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()

In [None]:
# Find number of components for desired explained variance
# Smallest number of components whose cumulative explained variance reaches the target
target_variance = 0.5
reached = cumulative_variance_ratio >= target_variance
if reached.any():
    # argmax gives the position of the first True; positions are 0-based
    n_components = np.argmax(reached) + 1
    print(f"Number of components needed for {target_variance*100}% variance: {n_components}")
else:
    # On an all-False mask argmax returns 0, which would wrongly report one component
    n_components = None
    print(
        f"{target_variance*100}% variance is not reached with "
        f"{len(cumulative_variance_ratio)} components "
        f"(maximum: {cumulative_variance_ratio[-1]*100:.1f}%)"
    )

## One feature at a time testing

In [None]:
# Ablation: rating latent factors only
active_features = {'latent_factors': 1, 'rating_stats': 0, 'tags': 0, 'authors': 0, 'numerical': 0}

base_path = "."
books, reduced_features, clusters, plots, common_index, labels = main(base_path, active_features)
plots['cluster_3d'].show()

# For every cluster, list its ten books with the most ratings
clustered_books = books.loc[common_index]
for cluster_id, label in labels.items():
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {label}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())

In [None]:
# Ablation: per-book rating statistics only
active_features = {'latent_factors': 0, 'rating_stats': 1, 'tags': 0, 'authors': 0, 'numerical': 0}

base_path = "."
books, reduced_features, clusters, plots, common_index, labels = main(base_path, active_features)
plots['cluster_3d'].show()

# For every cluster, list its ten books with the most ratings
clustered_books = books.loc[common_index]
for cluster_id, label in labels.items():
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {label}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())

In [None]:
# Ablation: tag features only
active_features = {'latent_factors': 0, 'rating_stats': 0, 'tags': 1, 'authors': 0, 'numerical': 0}

base_path = "."
books, reduced_features, clusters, plots, common_index, labels = main(base_path, active_features)
plots['cluster_3d'].show()

# For every cluster, list its ten books with the most ratings
clustered_books = books.loc[common_index]
for cluster_id, label in labels.items():
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {label}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())

In [None]:
# Ablation: author features only
active_features = {'latent_factors': 0, 'rating_stats': 0, 'tags': 0, 'authors': 1, 'numerical': 0}

base_path = "."
books, reduced_features, clusters, plots, common_index, labels = main(base_path, active_features)
plots['cluster_3d'].show()

# For every cluster, list its ten books with the most ratings
clustered_books = books.loc[common_index]
for cluster_id, label in labels.items():
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {label}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())

In [None]:
# Ablation: numerical book features only
active_features = {'latent_factors': 0, 'rating_stats': 0, 'tags': 0, 'authors': 0, 'numerical': 1}

base_path = "."
books, reduced_features, clusters, plots, common_index, labels = main(base_path, active_features)
plots['cluster_3d'].show()

# For every cluster, list its ten books with the most ratings
clustered_books = books.loc[common_index]
for cluster_id, label in labels.items():
    books_in_cluster = clustered_books[clusters == cluster_id]
    most_rated = books_in_cluster.nlargest(10, 'ratings_count')
    print(f"\nCluster {cluster_id}: {label}")
    print("Example books:")
    print(most_rated[['title', 'authors']].to_string())