In [None]:
import pandas as pd
import json
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import altair as alt

In [None]:
# Pixel dimensions of the main scatter panel; the marginal histograms reuse them.
WIDTH, HEIGHT = 400, 400

# Number of K-Means clusters used for the final author grouping.
N_CLUSTER = 12

# Column of `author_profiles` that drives the dropdown filter and colour
# encoding, together with the label shown next to the dropdown.
SELECTION, SELECTION_LABEL = 'cluster', 'Cluster'

# Presumably the affiliation codes covered by this analysis; not referenced by
# the other cells visible here -- TODO confirm it is still needed.
AFFILIATIONS = ["UGM"]

In [None]:
# Load the author records. `data` is a dict keyed by SINTA author id; later
# cells read the 'name_inputted', 'publications', 'subjects' and
# 'affiliation_sinta' entries of each record.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII characters decode identically on every platform instead of
# depending on the locale's default encoding.
with open("../data/processed/UGM_BIO/biology.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Flatten the nested author -> publications structure into one record per
# publication, each remembering which author (SINTA id) it belongs to.
publication_records = (
    {"author": sinta_id, "title": item['bib']['title']}
    for sinta_id, value in data.items()
    for item in value['publications']
)

# Key every record by a running integer (0, 1, 2, ...) so the mapping can be
# turned into a DataFrame with one row per publication.
text_vis = dict(enumerate(publication_records))

In [None]:
# One row per publication: the integer keys of `text_vis` become the index and
# the record fields ('author', 'title') become the columns.
df = pd.DataFrame.from_dict(text_vis, orient="index")
df

In [None]:
# Helper used to label clusters and authors with their most characteristic terms
def get_top_words(titles, stop_words='english', top_n=5):
    """Return the `top_n` terms with the highest total TF-IDF weight in `titles`.

    Parameters
    ----------
    titles : iterable of str
        Documents to analyse; a fresh TF-IDF model is fitted on exactly these.
    stop_words : str or list of str, default 'english'
        Passed straight through to `TfidfVectorizer(stop_words=...)`.
    top_n : int, default 5
        Maximum number of terms to return.

    Returns
    -------
    list
        Terms ordered from highest to lowest summed TF-IDF weight.
    """
    term_model = TfidfVectorizer(stop_words=stop_words)
    weights = term_model.fit_transform(titles)
    vocabulary = term_model.get_feature_names_out()

    # Sum each term's weight over all documents, then rank columns descending.
    ranked_columns = weights.toarray().sum(axis=0).argsort()[::-1]
    return [vocabulary[column] for column in ranked_columns[:top_n]]

# Extra stop words added on top of scikit-learn's built-in English list.
# Despite the variable name, the list mixes Indonesian function words with a
# few English/place-name terms ('analysis', 'effect', 'java', ...) that are
# excluded from the vocabulary as well.
indonesian_stop_words = [
    'dan', 'yang', 'untuk', 'dari', 'dengan', 'pada', 'adalah', 'ke', 'di', 'sebagai', 'ini',
    'itu', 'oleh', 'dalam', 'atau', 'juga', 'tersebut', 'sangat', 'agar', 'bisa', 'karena', 'terhadap',
    'pengaruh', 'berdasarkan', 'indonesia', 'isolated', 'based', 'daerah', 'indonesian', 'yogyakarta',
    'java', 'analysis', 'effect', 'using', "sp"
]
english_stop_words = set(TfidfVectorizer(stop_words='english').get_stop_words())
combined_stop_words = list(english_stop_words.union(indonesian_stop_words))

# Step 1: Text Preprocessing and Vectorization
# The vocabulary and IDF weights are learned from the individual titles.
vectorizer = TfidfVectorizer(stop_words=combined_stop_words)
tfidf_matrix = vectorizer.fit_transform(df['title'])

# Step 2: Aggregation
# Concatenate all titles of an author into one document (one row per author),
# then project it with the vectorizer fitted on the individual titles.
author_profiles = df.groupby('author')['title'].agg(' '.join).reset_index()
author_tfidf_matrix = vectorizer.transform(author_profiles['title'])

# Step 3: Dimensionality Reduction (optional)
# With the default svd_solver='auto', PCA may select the randomized SVD solver
# for wide matrices such as this one; pinning random_state keeps the projection
# (and everything clustered from it) reproducible across runs. The seed matches
# the one used for the KMeans sweep.
pca = PCA(n_components=3, random_state=42)
reduced_tfidf_matrix = pca.fit_transform(author_tfidf_matrix.toarray())

# Share of variance captured by each of the three components (used in axis titles)
explained_variance = pca.explained_variance_ratio_

In [None]:
# Step 1: Calculate metrics for a range of k values
wcss = []
silhouette_scores = []
davies_bouldin_scores = []
box_size = 150
font_size = 10

# Sweep k = 2..20, but never beyond n_samples - 1: silhouette_score and
# davies_bouldin_score both require fewer clusters than samples, so a small
# author set would otherwise raise part-way through the loop.
n_authors = reduced_tfidf_matrix.shape[0]
k_values = list(range(2, min(20, n_authors - 1) + 1))

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    clusters = kmeans.fit_predict(reduced_tfidf_matrix)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(reduced_tfidf_matrix, clusters))
    davies_bouldin_scores.append(davies_bouldin_score(reduced_tfidf_matrix, clusters))

# Create dataframes for plotting
wcss_df = pd.DataFrame({'k': k_values, 'WCSS': wcss})
silhouette_df = pd.DataFrame({'k': k_values, 'Silhouette Score': silhouette_scores})
davies_bouldin_df = pd.DataFrame({'k': k_values, 'Davies-Bouldin Index': davies_bouldin_scores})


def _metric_line_chart(frame, metric, chart_title):
    """Build a small line chart of the column `metric` against `k`.

    `frame` must contain the columns 'k' and `metric`; `metric` is also used
    as the y-axis title and in the tooltip. Sizes come from `box_size` and
    `font_size` defined at the top of this cell.
    """
    return alt.Chart(frame).mark_line(point=True).encode(
        x=alt.X('k', title='Number of Clusters (k)',
                axis=alt.Axis(labelFontSize=font_size, titleFontSize=font_size)),
        y=alt.Y(metric, title=metric,
                axis=alt.Axis(labelFontSize=font_size, titleFontSize=font_size)),
        tooltip=['k', metric]
    ).properties(
        title=alt.TitleParams(chart_title, fontSize=font_size + 2),
        height=box_size,
        width=box_size
    )


# One panel per metric: elbow (WCSS), silhouette and Davies-Bouldin
elbow_plot = _metric_line_chart(wcss_df, 'WCSS', 'Elbow Method for Optimal k')
silhouette_plot = _metric_line_chart(silhouette_df, 'Silhouette Score', 'Silhouette Score for Optimal k')
davies_bouldin_plot = _metric_line_chart(davies_bouldin_df, 'Davies-Bouldin Index', 'Davies-Bouldin Index for Optimal k')

# Display the plots side by side
cluster_plot = (elbow_plot | silhouette_plot | davies_bouldin_plot)

# Persist the figure as a standalone HTML file, creating the folder if needed
outfile = Path("../figures/01_clusterplot_publication_titles.html")
outfile.parent.mkdir(exist_ok=True, parents=True)
cluster_plot.save(outfile)
cluster_plot

In [None]:
# Step 4: Clustering
# random_state is pinned (same seed as the k-sweep) so that the cluster labels,
# the silhouette score shown in the figure title and the saved HTML are
# reproducible after a kernel restart.
num_clusters = N_CLUSTER
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(reduced_tfidf_matrix)

# Adding cluster labels and the first two principal components to the DataFrame
author_profiles['cluster'] = clusters
author_profiles['PCA1'] = reduced_tfidf_matrix[:, 0]
author_profiles['PCA2'] = reduced_tfidf_matrix[:, 1]

# Step 5: Evaluation
silhouette_avg = silhouette_score(reduced_tfidf_matrix, clusters)

# Step 6: Enrich the DataFrame using the mapping dictionary
# `author` holds the SINTA id, which is the key into `data`.
author_profiles['author_name'] = author_profiles['author'].map(lambda sid: data[sid]['name_inputted'])
author_profiles['subjects'] = author_profiles['author'].map(lambda sid: ", ".join(data[sid]['subjects']))
author_profiles['sinta_id'] = author_profiles['author']
author_profiles['affiliation_sinta'] = author_profiles['author'].map(lambda sid: data[sid]['affiliation_sinta']['name'])

# Step 7: Visualization with Altair

# Most characteristic words per cluster: TF-IDF fitted on the concatenated
# title strings of the authors that belong to the cluster.
cluster_words = {
    cluster: ', '.join(get_top_words(titles, stop_words=combined_stop_words, top_n=5))
    for cluster, titles in author_profiles.groupby('cluster')['title']
}

# Most characteristic words per author. `author_profiles` has exactly one row
# per author (it comes from a groupby on 'author'), so each author's corpus is
# the single concatenated title string of that row -- no per-author filtering
# of the whole frame is needed.
author_words = {
    author: ', '.join(get_top_words([title], stop_words=combined_stop_words, top_n=5))
    for author, title in zip(author_profiles['author'], author_profiles['title'])
}

# Attach the word summaries for use in the tooltips
author_profiles['top_words_author'] = author_profiles['author'].map(author_words)
author_profiles['top_words_cluster'] = author_profiles['cluster'].map(cluster_words)

In [None]:
# Distinct values of the selection column, offered in the dropdown
options = list(author_profiles[SELECTION].unique())

# Interval selection bound to the scales: enables pan/zoom on the panels
resize = alt.selection_interval(bind='scales')

source = author_profiles

base = alt.Chart(source)

labels = [str(option) + ' ' for option in options]

# Dropdown with one entry per value plus a final "All" entry (value None)
input_dropdown = alt.binding_select(options=options + [None],
                                    labels=labels + ['All '],
                                    name=f'{SELECTION_LABEL} ')

selection = alt.selection_point(fields=[SELECTION],
                                bind=input_dropdown)

# Selected points keep their categorical colour, everything else is greyed out
color = alt.condition(
    selection,
    alt.Color(f'{SELECTION}:N').legend(None),
    alt.value('lightgray')
)

# NOTE: both parameters are registered through `add_params`; the previous
# `add_selection` calls are deprecated in Altair 5, the API this notebook
# already relies on (`selection_point`, `add_params`).

# Main panel: authors in the PCA1/PCA2 plane
scatter = base.mark_circle(size=75).encode(
    x=alt.X('PCA1', title=None),
    y=alt.Y('PCA2', title=f'PCA2 ({explained_variance[1]*100:.2f}% variance)'),
    color=color,
    tooltip=['author_name', 'affiliation_sinta', 'cluster', 'top_words_cluster', 'top_words_author', 'subjects', "sinta_id"]
).add_params(
    selection,
    resize
).properties(
    height=HEIGHT,
    width=WIDTH
)

# Stand-in legend: one row of circles per selection value
legend = base.mark_circle(size=75).encode(
    alt.Y(f'{SELECTION}:N', title=SELECTION_LABEL).axis(orient='right'),
    color=color
)

# Marginal histogram of PCA2, placed to the right of the scatter
chart2 = base.mark_bar().encode(
    x=alt.X('count()', title='Author Count'),
    y=alt.Y('PCA2:Q', title="").bin(maxbins=18),
    color=color
).add_params(
    selection,
    resize
).properties(
    height=HEIGHT,
    width=50
)

# Marginal histogram of PCA1 at the bottom (count axis reversed so bars hang down)
chart3 = base.mark_bar().encode(
    x=alt.X('PCA1:Q', bin=alt.Bin(maxbins=18), title=f'PCA1 ({explained_variance[0]*100:.2f}% variance)'),
    y=alt.Y('count()', title='Author Count', scale=alt.Scale(reverse=True)),
    color=color
).add_params(
    selection,
    resize
).properties(
    height=50,
    width=WIDTH
)

# Vertical line at PCA1 = 0
vertical_line = base.mark_rule(color='gray').encode(
    x=alt.datum(0)
).properties(
    width=WIDTH
)

# Horizontal line at PCA2 = 0
horizontal_line = base.mark_rule(color='gray').encode(
    y=alt.datum(0)
).properties(
    height=HEIGHT
)

# Combine scatter and chart3 vertically, and chart2 and legend horizontally.
# `+` binds tighter than `|`, so this is (vconcat | (chart2 + rule)) | legend.
combined_chart = alt.vconcat(
    scatter + vertical_line + horizontal_line,
    chart3 + vertical_line
).resolve_scale(
    color='independent'
) | chart2 + horizontal_line | legend

# Format silhouette score to two decimal places
silhouette_avg_formatted = f'{silhouette_avg:.2f}'

# Add title to the combined chart and center it
final_chart = combined_chart.properties(
    title=alt.TitleParams(
        text=f'Silhouette Score: {silhouette_avg_formatted}',
        anchor='middle',
        align='center'
    )
)

final_chart

In [None]:
# Write the interactive PCA figure to a standalone HTML file.
outfile = Path("../figures/01_PCA_publication_titles.html")

# Make sure the target folder exists before saving (no error if it already does).
figures_dir = outfile.parent
figures_dir.mkdir(parents=True, exist_ok=True)

final_chart.save(outfile)