In [18]:
import os
import random
import spacy
import ast
import glob
import spacy
import openai
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import networkx as nx
import hdbscan
import module


from collections import defaultdict
from umap.umap_ import UMAP
from itertools import combinations
from matplotlib.colors import ListedColormap
from tqdm.auto import tqdm
from langdetect import detect, DetectorFactory
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import gaussian_kde


# Input

In [3]:
# Read both loan tables from the experiment folder (pre first, then post).
pre, post = (pd.read_csv(path) for path in ('exp/pre.csv', 'exp/post.csv'))

# Run each frame through module.eval_as, which (per the original note)
# converts the lists and dictionaries held in the columns.
pre, post = (module.eval_as(frame) for frame in (pre, post))

In [4]:
# Display a single randomly drawn row of `pre`.
pre.sample(n=1)

Unnamed: 0,Z30_CALL_NO,frequency,users,users_weighed,yearly_frequency,yearly_loan_time,title,is_outlier,yearly_frequency_norm,frequency_norm
19018,Ca-VIV 70-4170,3,"[U-176, U-443]","[U-176, U-443, U-443]","{2014: 2, 2019: 1}","{2014: 2.0, 2019: 1.0}","Elogio accademico dei Vivarini, primi padri de...",False,"{2016: 0.0, 2018: 0.0, 2015: 0.0, 2019: 0.0135...",0.004032


# Embedding 

In [16]:
# Identity tokenizer for tfidf
def identity_tokenizer(doc):
    """Return ``doc`` exactly as received.

    Handed to ``TfidfVectorizer`` as both its tokenizer and its preprocessor,
    so each document reaches the vectorizer untouched.
    """
    return doc

# TFIDF and Embedding for a df
def embedding(df, random_state=None):
    """Project each row's ``users_weighed`` tokens to 2-D and attach the result.

    Pipeline: TF-IDF over the ``users_weighed`` column (documents are passed
    through untouched by the identity tokenizer), UMAP down to two components,
    min-max scaling to the range 100-900, truncation to integers, a call to
    ``module.plot_embedding`` and finally attachment of the coordinates as
    ``x`` and ``y`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``users_weighed`` column.
        NOTE(review): each entry is presumably a list of user ids - confirm.
    random_state : int or None, optional
        Seed forwarded to UMAP. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an integer to make the layout reproducible
        across kernel restarts.

    Returns
    -------
    pandas.DataFrame
        A new frame (built with ``DataFrame.assign``) carrying integer ``x``
        and ``y`` columns; the input frame is left unmodified.
    """
    tfidf = TfidfVectorizer(
        tokenizer=identity_tokenizer,
        preprocessor=identity_tokenizer,
        token_pattern=None,
    )
    users_weighed_tfidf = tfidf.fit_transform(df['users_weighed'])

    # NOTE(review): 'hamming' is applied to real-valued TF-IDF weights here;
    # confirm this metric is intended rather than e.g. 'cosine'.
    reducer = UMAP(
        n_components=2,
        metric='hamming',
        n_neighbors=100,
        random_state=random_state,
    )
    # Named `coords` so the local no longer shadows the function's own name.
    coords = reducer.fit_transform(users_weighed_tfidf)

    # Rescale both axes to 100-900, then truncate to integers.
    scaler = MinMaxScaler(feature_range=(100, 900))
    coords = scaler.fit_transform(coords).astype(int)

    module.plot_embedding(coords, '')

    return df.assign(x=coords[:, 0], y=coords[:, 1])

def clustering(df, min_cluster_size=20, cluster_selection_epsilon=5):
    """Cluster the 2-D embedding with HDBSCAN and attach the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``x`` and ``y`` coordinate columns.
    min_cluster_size : int, optional
        Forwarded to ``hdbscan.HDBSCAN``; defaults to the value previously
        hard-coded here (20).
    cluster_selection_epsilon : float, optional
        Forwarded to ``hdbscan.HDBSCAN``: a distance threshold, clusters below
        this value will be merged. Defaults to the previously hard-coded 5.

    Returns
    -------
    tuple
        ``(labelled_df, hdb)`` where ``labelled_df`` is a new frame with a
        ``cluster`` column holding ``hdb.labels_`` and ``hdb`` is the fitted
        HDBSCAN object. The input frame is not modified, so re-running the
        cell never sees a half-updated frame.
    """
    hdb = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
    )

    points = df[['x', 'y']].to_numpy()
    hdb.fit(points)

    # assign() returns a fresh frame instead of writing into the caller's one.
    return df.assign(cluster=hdb.labels_), hdb

# clustering() needs the `x`/`y` columns that embedding() adds. Compute the
# embedding only when those columns are missing, so the cell works on a fresh
# kernel without recomputing the projection on every re-run.
if not {'x', 'y'}.issubset(pre.columns):
    pre = embedding(pre)
if not {'x', 'y'}.issubset(post.columns):
    post = embedding(post)

pre, pre_hdb = clustering(pre)
post, post_hdb = clustering(post)





In [19]:
# Plot both clusterings with their fitted HDBSCAN objects (the first call used
# to reference the misspelled name `pre_hbd`).
# NOTE(review): the recorded output of this cell is an AttributeError saying
# `module` has no `plot_clustering`; confirm the helper exists in `module` and
# that the kernel has loaded the current version of it.
module.plot_clustering(pre, pre_hdb)
module.plot_clustering(post, post_hdb)

AttributeError: module 'module' has no attribute 'plot_clustering'

# Analysis

In [None]:
# Call-number prefix -> subject category.
# Multi-letter prefixes ('Kat', 'Per') are listed before their single-letter
# counterparts ('K', 'P') so that a first-match prefix scan resolves them
# correctly.
subject_dict = {
    'A': 'Manuals',
    'B': 'Italian Art',
    'C': 'Italian Artists',
    'D': 'Rome',
    'E': 'Italian Topography',
    'F': 'Travel Literature',
    'G': 'Sources',
    'H': 'Iconography',
    'J': 'Ornament',
    'Kat': 'Catalogues',
    'K': 'Commemorative and Collected Writings',
    'L': 'Congress Publications',
    'M': 'Art in General',
    'N': 'Architecture',
    'O': 'Sculpture',
    'Per': 'Periodicals',
    'P': 'Painting',
    'Q': 'Manuscript Illumination',
    'R': 'Graphic Arts',
    'S': 'Applied Arts',
    'T': 'Collecting Art, Museum Studies',
    'U': 'Registers of Artistic Monuments',
    'V': 'Cultural Institutions',
    'W': 'Non-Italian Artists',
    'X': 'European Topography',
    'Y': 'World Topography',
    'Z': 'Related Disciplines',
}

# Colour palette used to build the subject colormap.
custom_colours = [
    "#EA522B", "#EFD4D1", "#2A4978", "#8BDBE1", "#ECA19D",
    "#B48E36", "#EB84D6", "#B8BFCE", "#FAC73B", "#91C5E4",
    "#6E8EAC", "#D2E7E0", "#DAA47F", "#ECD096", "#6C9686",
    "#E6E10F", "#9D4B37", "#A3B49D", "#BDC920", "#DBE3E5",
    "#6A8B8D", "#EFB3D1", "#F6A1B4", "#5499C7", "#1C2833",
    "#F0B27A", "#2E7F7F", "#CB4335", "#4A235A",
]

# Discrete colormap built from the palette above.
custom_cmap = ListedColormap(colors=custom_colours)

def get_subject(call_number, subjects=None):
    """Map a call number to its subject category by prefix.

    Parameters
    ----------
    call_number : str
        Call number to classify. Any non-string value (``None``, NaN, ...)
        is classified as ``'Unknown'`` instead of raising.
    subjects : mapping of str to str, optional
        Prefix -> category table. Defaults to the module-level
        ``subject_dict``.

    Returns
    -------
    str
        The category of the longest matching prefix, or ``'Unknown'`` when no
        prefix matches.
    """
    if subjects is None:
        subjects = subject_dict

    # Missing values read from CSV arrive as float NaN and have no startswith.
    if not isinstance(call_number, str):
        return 'Unknown'

    # Try longer prefixes first so 'Kat'/'Per' win over 'K'/'P' regardless of
    # the mapping's insertion order (sorted() is stable for equal lengths).
    for key in sorted(subjects, key=len, reverse=True):
        if call_number.startswith(key):
            return subjects[key]
    return 'Unknown'

# No `df` is defined anywhere in this notebook, so tag the two loaded frames
# instead; assign() returns new frames, which keeps the cell safe to re-run.
pre = pre.assign(cat=pre['Z30_CALL_NO'].apply(get_subject))
post = post.assign(cat=post['Z30_CALL_NO'].apply(get_subject))


In [None]:
def small_multiple_subject(df):
    """Draw a grid of small scatter plots, one panel per subject category.

    Each panel shows the ``x``/``y`` positions of the rows whose ``cat`` equals
    that subject, coloured with one entry of ``custom_cmap``. Unused panels in
    the last grid row are removed and the figure is shown with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cat``, ``x`` and ``y``.

    Returns
    -------
    None
        Nothing is drawn when ``df`` has no categories.
    """
    unique_subjects = df['cat'].unique()
    n_subjects = len(unique_subjects)
    if n_subjects == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    num_cols = 6
    num_rows = (n_subjects - 1) // num_cols + 1  # ceil(n_subjects / num_cols)

    # squeeze=False keeps `axes` two-dimensional even for a single row, so the
    # [row, col] indexing below works for six or fewer subjects as well.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(20, 15), dpi=300, squeeze=False
    )

    # Spread the subjects over the colormap's 0..1 range; with a single
    # subject the denominator would be zero, so clamp it to 1.
    colour_span = max(n_subjects - 1, 1)

    for i, subject in enumerate(unique_subjects):
        ax = axes[i // num_cols, i % num_cols]
        ax.set_title(f'{subject}')

        rows = df[df['cat'] == subject]
        colour = custom_cmap(i / colour_span)

        # Pass the RGBA tuple as a one-row 2-D sequence: a bare 4-tuple is
        # ambiguous for scatter when the panel happens to hold four points.
        ax.scatter(rows['x'].to_list(), rows['y'].to_list(), s=2, c=[colour])

        ax.axis("off")

    # Remove the unused panels at the end of the grid.
    for i in range(n_subjects, num_rows * num_cols):
        fig.delaxes(axes[i // num_cols, i % num_cols])

    plt.tight_layout()
    plt.subplots_adjust(top=0.85)

    plt.show()