In [1]:
import ast
import base64
import glob
import html
import os
from os import path

import matplotlib.pyplot as plt
import multidict as multidict
import numpy as np
import pandas as pd
import pdfkit
from nltk.corpus import stopwords
from openai import OpenAI
from PyPDF2 import PdfMerger
from scipy.interpolate import make_interp_spline
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

import module

# Query OpenAI for title and description of each cluster

In [167]:
# Read the key from the environment so the secret never has to be typed into
# (or committed with) the notebook. An unset variable yields '', the same value
# that was previously hard-coded here.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
client = OpenAI(api_key=OPENAI_API_KEY)

# Cluster Atlas from titles and description

Making one page per cluster with wordclouds and titles

In [68]:
encoding = 'utf-8'

# embedding of books
embedding_i = pd.read_csv('data/internal_embedding.csv', encoding=encoding, encoding_errors='replace')
embedding_i = module.eval_as(embedding_i)

# clusters
clusters_i = pd.read_csv('data/clusters_i.csv', encoding=encoding, encoding_errors='replace')

# Top 100 loaned books in each cluster.
# The sort must be descending: with the default ascending order, head(100)
# returns the 100 rows with the LOWEST frequency_norm of every cluster.
# NOTE(review): assumes a larger frequency_norm means more loans - TODO confirm.
# The resulting Series is indexed by cluster id and is aligned with clusters_i's
# row index on assignment, so row i receives the titles of cluster i.
clusters_i['top_titles'] = (
    embedding_i
    .sort_values('frequency_norm', ascending=False)
    .groupby('cluster')
    .head(100)
    .groupby('cluster')
    .title.apply(list)
)

## Loan frequency of top titles per year

In [70]:
# Plotting loans of top titles in each cluster
frequencies = []

# Get only top titles' loan frequency
# NOTE(review): the row index of clusters_i is used as the cluster id here - TODO confirm
for cluster_id, row in clusters_i.iterrows():

    top_titles = row.top_titles
    books = embedding_i[(embedding_i.cluster == cluster_id) & (embedding_i.title.isin(top_titles))]
    cluster_frequencies = books['yearly_frequency'].aggregate(module.sum_loan_times)
    frequencies.append(cluster_frequencies)

cluster_yearly_time_df = pd.DataFrame(frequencies).fillna(0)
sorted_columns = sorted(cluster_yearly_time_df.columns)
# Put the columns in the same (sorted) order as the tick labels; otherwise the
# values of a row are plotted in first-seen column order under sorted labels.
cluster_yearly_time_df = cluster_yearly_time_df[sorted_columns]
x = np.arange(len(sorted_columns))
# Dense grid for the smoothed curve; identical for every cluster, so built once
x_new = np.linspace(0, len(sorted_columns) - 1, 300)

# savefig does not create missing directories
os.makedirs('exp/loans', exist_ok=True)

for cluster_id, row in cluster_yearly_time_df.iterrows():

    plt.figure(figsize=(5, 3), dpi=300)

    # Cubic spline through the yearly totals to get a smooth area chart
    spl = make_interp_spline(x, row, k=3)
    y_new = spl(x_new)

    plt.fill_between(x_new, y_new, color=module.CUSTOM_COLOURS[10])
    plt.title(f'Cluster {cluster_id}: Loans per year')
    plt.xlabel('Year')
    plt.ylabel('Number of Loans')
    # 30% headroom above the peak; fall back to 1 so an all-zero row does not
    # request identical lower and upper limits
    peak = max(row)
    plt.ylim(0, peak + (peak*0.3) if peak > 0 else 1)

    plt.xticks(x, sorted_columns, rotation=45)  # Rotate x-axis labels for readability
    plt.tight_layout()

    # Save the plot as an image
    plt.savefig(f'exp/loans/cluster_{cluster_id}_loans.png')
    plt.close()

## Wordclouds

In [101]:
# Generate wordclouds for each cluster in top four languages

feature_frequencies = []

# List of available stopwords
english_stop_words = list(stopwords.words('english'))
german_stop_words = list(stopwords.words('german'))
italian_stop_words = list(stopwords.words('italian'))
french_stop_words = list(stopwords.words('french'))

# Stop-word list per value of the `lang` column; other languages get none
stop_words_by_lang = {
    'eng': english_stop_words,
    'ger': german_stop_words,
    'ita': italian_stop_words,
    'fre': french_stop_words,
}

wc = WordCloud(
        mode = "RGBA",
        color_func=lambda *args, **kwargs: (0, 0, 0),
        font_path = path.join('Lato-Regular.ttf'),
        # mask=mask,
        normalize_plurals=False,
        prefer_horizontal= 1,
        margin=10,
        background_color=None,
        # background_color='black',
        # max_words=max_words,
        min_font_size= 5,
        max_font_size= 100,
        # collocation_threshold = 20,
        relative_scaling = 1,
    )

# to_file does not create missing directories
os.makedirs('exp/wc', exist_ok=True)

# Cluster -1 is excluded from the word clouds
clusters = embedding_i[embedding_i.cluster != -1].groupby('cluster')

for cluster_id, cluster in clusters:
    nr_langs = 4
    # value_counts() orders languages by number of titles, so head() keeps the
    # most common ones. groupby('lang').size() is ordered alphabetically and
    # would keep the first four language codes instead.
    langs = cluster['lang'].value_counts().head(nr_langs).index.tolist()
    lang_titles = cluster[cluster['lang'].isin(langs)].groupby('lang')

    cluster_frequencies = []

    for lang, lang_group in lang_titles:
        # Remove stop words for main languages
        stop_words = stop_words_by_lang.get(lang)

        # Apply stop words removal and tfidf vectorizer.
        # Missing titles are dropped so that join() only sees strings.
        titles = ' '.join(lang_group['title'].dropna().astype(str))
        # NOTE(review): all titles form ONE document, so the idf part is the same
        # for every term and the scores are normalised term counts - confirm intended.
        vectorizer = TfidfVectorizer(stop_words=stop_words)
        tfidf_matrix = vectorizer.fit_transform([titles])
        feature_names = vectorizer.get_feature_names_out()
        lang_frequencies = dict(zip(feature_names, tfidf_matrix.toarray()[0]))

        # Generate word cloud
        wc.generate_from_frequencies(lang_frequencies)
        wc.to_file(path.join("exp/wc/", f"{cluster_id}_{lang}.png"))

        cluster_frequencies.append(lang_frequencies)

    feature_frequencies.append(cluster_frequencies)


In [179]:
# Trim each per-language dictionary in place to its top_n best-scoring words
top_n = 100
for cluster_freq in feature_frequencies:
    for position in range(len(cluster_freq)):
        # Highest score first (stable, so ties keep their original order)
        ranked = sorted(cluster_freq[position].items(), key=lambda pair: pair[1], reverse=True)
        # Drop words scoring 0, then cut the list off after top_n entries
        kept = [(word, score) for word, score in ranked if score > 0][:top_n]
        cluster_freq[position] = dict(kept)


In [180]:
clusters_i['wc_frequencies'] = feature_frequencies

# Number of decimal places kept for every score
decimal_places = 4


def _round_scores(cluster_frequencies):
    """Return one cluster's per-language dictionaries with every score rounded."""
    rounded = []
    for lang_frequencies in cluster_frequencies:
        rounded.append({word: round(score, decimal_places) for word, score in lang_frequencies.items()})
    return rounded


clusters_i['wc_frequencies'] = clusters_i['wc_frequencies'].apply(_round_scores)

# Get cluster title from word frequencies

In [181]:
def get_title_from_frequencies(frequencies):
    """Ask the chat model for a subject label describing one cluster.

    `frequencies` is interpolated as-is into the prompt. Returns the text of the
    first choice, or None (after printing the error) when anything goes wrong.
    """
    try:
        instruction = "Based on the following terms and tf-idf scores, suggest a subject classification (up to 10 words) for the associated cluster of books. Give me the response in format: Subject: subject"
        prompt_messages = [
            {"role": "user", "content": instruction},
            {"role": "user", "content": f"These are the terms and frequencies: {frequencies}"},
        ]
        reply = client.chat.completions.create(model="gpt-4", messages=prompt_messages)
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
def extract_titles(text):
    """Pull the subject line out of a model reply.

    Only the first paragraph (text before the first blank line) is considered.
    A leading 'Subject:' label is removed and surrounding whitespace is stripped.

    Non-string input - e.g. the None returned for a failed request - is handed
    back unchanged instead of raising on `.split`.
    """
    if not isinstance(text, str):
        return text

    prefix = 'Subject:'
    title = text.split('\n\n')[0].strip()
    if title.startswith(prefix):
        # strip() removes the space that follows the label in "Subject: ..."
        return title[len(prefix):].strip()
    return title


# Requesting the subjects of all clusters in a single prompt does not work reliably.
# The number of input clusters doesn't correspond to the number of subjects in the output,
# and the model then returns seemingly random subjects such as:
#  Cluster 1: History and Geography
#  Cluster 2: Advanced Mathematics and Algorithms
#  Cluster 3: Food Science and Culinary Concepts
#  Cluster 4: Psychological Theory and Therapy
    
def get_subject_vocabulary(df):
    """Ask the chat model for subject labels of all clusters in one request.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``cluster`` and ``wc_frequencies``; one row per cluster.

    Returns
    -------
    str or None
        The text of the first choice, or None (after printing the error) when
        the request or the prompt construction fails.
    """
    try:
        messages = [{
            "role": "user",
            "content": "Based on the following terms and frequencies, suggest a meaningful subject classification (up to 10 words) for the associated cluster of books. Give me the response in format cluster output"
        }]

        # One delimited message per cluster. Each one has to be appended,
        # otherwise the model only ever sees the instruction above.
        for _, row in df.iterrows():
            cluster_id = row['cluster']
            frequencies = row['wc_frequencies']
            messages.append({
                "role": "user",
                "content": f"START Cluster {cluster_id}:{frequencies}. END Cluster {cluster_id}"
            })

        # A single request once the whole conversation has been assembled
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=messages
        )
        response = completion.choices[0].message.content
        print(f"Response: {response}")

        return response

    except Exception as e:
        print(f"An error occurred: {e}")
        # There is no usable completion on this path
        return None



In [182]:
# One chat request per cluster row, fed with that row's word frequencies
response = clusters_i['wc_frequencies'].apply(get_title_from_frequencies)

In [191]:
# Store the raw model reply, then derive the cleaned subject line from it
clusters_i['subject_output'] = response
subject_replies = clusters_i['subject_output']
clusters_i['cluster_subject'] = subject_replies.apply(extract_titles)

In [189]:
def get_description_from_titles(cluster_title, titles):
    """Ask the chat model for a short description of one cluster.

    Both arguments are interpolated as-is into the prompt. Returns the text of
    the first choice, or None (after printing the error) when anything goes wrong.
    """
    try:
        instruction = "Based on the following cluster subject and book titles, suggest a description (up to 100 words) for the associated cluster of books. Give me the response in format: Description: description"
        prompt_messages = [
            {"role": "user", "content": instruction},
            {"role": "user", "content": f"This is the cluster subject : {cluster_title}"},
            {"role": "user", "content": f"These are the book titles : {titles}"},
        ]
        reply = client.chat.completions.create(model="gpt-4", messages=prompt_messages)
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    

def extract_descriptions(text):
    """Pull the description out of a model reply.

    Only the first paragraph (text before the first blank line) is considered.
    A leading 'Description:' label is removed and surrounding whitespace is stripped.

    Non-string input - e.g. the None returned for a failed request - is handed
    back unchanged instead of raising on `.split`.
    """
    if not isinstance(text, str):
        return text

    start = 'Description:'
    description = text.split('\n\n')[0].strip()
    if description.startswith(start):
        # strip() removes the space that follows the label in "Description: ..."
        return description[len(start):].strip()
    return description

In [192]:
# One chat request per cluster, built from its subject line and its top titles
clusters_i['description_output'] = [
    get_description_from_titles(subject, titles)
    for subject, titles in zip(clusters_i['cluster_subject'], clusters_i['top_titles'])
]


In [194]:
# Reduce each raw reply to the description text
description_replies = clusters_i['description_output']
clusters_i['cluster_description'] = description_replies.apply(extract_descriptions)

## Plot the clusters on the mapping

In [None]:
# Create map for each cluster with cluster in red and other clusters in blue

df = embedding_i

# savefig does not create missing directories
os.makedirs('exp/maps', exist_ok=True)

unique_clusters = np.unique(df['cluster'])
unique_clusters = unique_clusters[unique_clusters != -1]


for cluster_number in unique_clusters:
    plt.figure(figsize=(10, 5), dpi=300)

    # One boolean mask splits the points into "this cluster" and "everything else"
    in_cluster = (df['cluster'] == cluster_number).to_numpy()
    cluster_points = df.loc[in_cluster, ['x', 'y']]
    other_points = df.loc[~in_cluster, ['x', 'y']]

    # Centre of the highlighted cluster; used to position the label
    cluster_x = cluster_points['x'].mean()
    cluster_y = cluster_points['y'].mean()

    # Scatter plot for the current cluster with a specified color
    plt.scatter(cluster_points['x'], cluster_points['y'], s=5, c=module.CUSTOM_COLOURS[0])
    plt.scatter(other_points['x'], other_points['y'], s=2, c=module.CUSTOM_COLOURS[10])

    # Text label for the cluster number, shifted 20 data units above the centre
    plt.text(cluster_x , cluster_y + 20, str(cluster_number), fontsize=16, ha='center')
    plt.axis('off')
    plt.savefig(f'exp/maps/cluster_{cluster_number}_map.png')
    plt.close()



## Generate html and pdf for Cluster Atlas

In [4]:
css_directory = os.path.abspath('export.css')


def _png_as_base64(image_path):
    """Return the file at image_path base64-encoded, or None (with a notice) if it is missing."""
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        return None
    # The context manager closes the handle as soon as the bytes are read
    with open(image_path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


def generate_pdf_for_clusters(clusters):
    """Write one PDF report per cluster row to exp/pdfs/cluster_<id>_report.pdf.

    Parameters
    ----------
    clusters : pandas.DataFrame
        Needs the columns ``cluster``, ``cluster_subject``,
        ``cluster_description`` and ``top_titles`` (a sequence of titles).

    Each report contains the subject and description, the loans chart and the
    map of the cluster (when the image files exist), up to four word clouds,
    and the top titles laid out in two columns on a new page.
    Subject, description and titles are HTML-escaped before they are inserted.
    """
    pdf_directory = "exp/pdfs"
    loans_directory = "exp/loans"
    maps_directory = "exp/maps"

    # pdfkit writes straight to the target path; make sure the folder exists
    os.makedirs(pdf_directory, exist_ok=True)

    options = {
        'enable-local-file-access': True,
        "page-size": "A4",
        "user-style-sheet": css_directory,
        'encoding': "UTF-8"
    }

    for _, cluster in clusters.iterrows():
        cluster_id = cluster['cluster']
        # Text coming from the model or the catalogue may contain &, < or >
        title = html.escape(str(cluster['cluster_subject']))
        description = html.escape(str(cluster['cluster_description']))
        top_titles = cluster['top_titles']

        # Sorted so the word clouds appear in the same order on every run
        image_pattern = f"exp/wc/{cluster_id}_*.png"
        image_paths = sorted(os.path.abspath(image) for image in glob.glob(image_pattern))

        pdf_filename = f"{pdf_directory}/cluster_{cluster_id}_report.pdf"
        loans_path = f"{loans_directory}/cluster_{cluster_id}_loans.png"
        maps_path = f"{maps_directory}/cluster_{cluster_id}_map.png"

        # Create an HTML template for each cluster
        html_template = f"""
        <html>
        <head>
            <meta charset="UTF-8">
            <title>{cluster_id} - {title}</title>
            <link rel="stylesheet" type="text/css" href="{css_directory}">
        </head>
        <body>
        <div class='page-wrapper'>
                
        <h1>{cluster_id} - {title}</h1>
        <p>{description}</p>

        <div class="row">
        """

        # Loans chart and map share the first row; a missing image is skipped
        for figure_path in (loans_path, maps_path):
            data_uri = _png_as_base64(figure_path)
            if data_uri is not None:
                html_template += '<div class="loan-figure"><img src="data:image/png;base64,{0}"></div>'.format(data_uri)

        html_template += """
        </div>
        
        <div class=wordclouds>
        """

        # Add images in a row
        for image_path in image_paths[:4]:
            data_uri = _png_as_base64(image_path)
            if data_uri is not None:
                html_template += '<img class="wordcloud-image" src="data:image/png;base64,{0}">'.format(data_uri)

        html_template += """
        </div>

        <div class="page-break"></div> <!-- Page break to ensure titles start on a new page -->
        <div class=row>
        """

        # Add the top titles in two columns; an empty half produces no column
        half_point = len(top_titles) // 2
        for column_titles in (top_titles[:half_point], top_titles[half_point:]):
            if not column_titles:
                continue
            html_template += '<div class="column">'
            for top_title in column_titles:
                html_template += f"<p>{html.escape(str(top_title))}</p>"
            html_template += '</div>'

        html_template += """
        </div>
        </body>
        </html>
        """

        # Generate PDF for the cluster
        pdfkit.from_string(html_template, pdf_filename, options=options)

# Reload the saved atlas table and render the reports from it.
# literal_eval turns the stored text of each top_titles cell back into a Python
# object (presumably the list of titles - verify against the saved file).
clusters_i = pd.read_csv(
    'exp/cluster_atlas_full.csv',
    encoding='utf-8',
    encoding_errors='replace',
)
clusters_i['top_titles'] = clusters_i['top_titles'].apply(ast.literal_eval)
generate_pdf_for_clusters(clusters_i)


In [5]:
# Directory containing HTML files
html_directory = "exp/html"

# Path of the merged PDF
output_pdf = "exp/cluster_atlas_full.pdf"

pdf_directory = "exp/pdfs"
# Collect only the per-cluster reports (cluster_<id>_report.pdf); any other PDF
# in the folder would break the numeric sort key below
pdf_files = glob.glob(os.path.join(pdf_directory, "cluster_*_report.pdf"))

# Sort the PDF files by their cluster number
pdf_files = sorted(pdf_files, key=lambda x: int(os.path.splitext(os.path.basename(x))[0].split('_')[1]))

# Merge the individual PDFs into one PDF file
merger = PdfMerger()
try:
    for pdf_file in pdf_files:
        merger.append(pdf_file)

    # Save the merged PDF
    merger.write(output_pdf)
finally:
    # Release the merger's file handles even when an append or the write fails
    merger.close()


In [198]:
# clusters_i.to_csv('exp/cluster_atlas_full.csv', index=False)