In [None]:
import pandas as pd
import json
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import altair as alt

In [None]:
# Load the biology dataset into `data`. JSON files are UTF-8, so pin the encoding
# rather than relying on the platform default codec (which is not UTF-8 everywhere
# and would garble or reject non-ASCII characters in the records).
with open("../data/processed/UGM/biology.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Flatten the nested {sinta_id: {"publications": [...]}} mapping into one record per
# publication. Each record remembers which author (SINTA id) it belongs to.
publication_records = (
    {"author": sinta_id, "title": publication['bib']['title']}
    for sinta_id, author_entry in data.items()
    for publication in author_entry['publications']
)

# Key the records by a running integer (0, 1, 2, ...) so that every record can
# become one row when the mapping is turned into a DataFrame.
text_vis = dict(enumerate(publication_records))

In [None]:
# One row per publication: the outer integer keys of `text_vis` become the index and
# the inner "author" / "title" keys become the columns.
df = pd.DataFrame.from_dict(text_vis, orient="index")
df

In [None]:
# Function to get the most common words per cluster
def get_top_words(titles, stop_words='english', top_n=5):
    """Return the terms with the highest summed TF-IDF weight across `titles`.

    Parameters
    ----------
    titles : iterable of str
        Documents to rank terms over (e.g. the titles belonging to one cluster).
    stop_words : str or list of str, default 'english'
        Forwarded unchanged to ``TfidfVectorizer(stop_words=...)``.
    top_n : int, default 5
        Maximum number of terms to return.

    Returns
    -------
    list
        Up to `top_n` vocabulary terms, ordered from highest to lowest total
        TF-IDF weight. Empty when no term survives tokenization and stop-word
        removal.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    try:
        tfidf_matrix = vectorizer.fit_transform(titles)
    except ValueError as exc:
        # scikit-learn signals "nothing left after stop-word removal" with a
        # ValueError whose message starts with "empty vocabulary". Treat only that
        # case as "no top words"; any other ValueError is a real problem.
        if "empty vocabulary" not in str(exc):
            raise
        return []

    feature_array = vectorizer.get_feature_names_out()
    # Total weight of each term over all documents, then indices sorted descending.
    total_weights = tfidf_matrix.toarray().sum(axis=0)
    ranked_indices = total_weights.argsort()[::-1]
    return [feature_array[i] for i in ranked_indices[:top_n]]

# Extra stop words layered on top of scikit-learn's built-in English list.
# Despite the variable name, the list mixes Indonesian function words with
# place names and generic English title words (e.g. 'analysis', 'effect').
indonesian_stop_words = [
    'dan', 'yang', 'untuk', 'dari', 'dengan', 'pada', 'adalah', 'ke', 'di',
    'sebagai', 'ini', 'itu', 'oleh', 'dalam', 'atau', 'juga', 'tersebut',
    'sangat', 'agar', 'bisa', 'karena', 'terhadap', 'pengaruh', 'berdasarkan',
    'indonesia', 'yogyakarta', 'isolated', 'java', 'based', 'daerah',
    'indonesian', 'analysis', 'gunungkidul', 'medan', 'effect', 'using', "sp",
]

# Ask a throwaway vectorizer for scikit-learn's English stop words, then merge
# both collections into the plain list that is passed as `stop_words=` later on.
english_stop_words = set(TfidfVectorizer(stop_words='english').get_stop_words())
combined_stop_words = list(english_stop_words | set(indonesian_stop_words))

# Fixed seed shared by PCA and KMeans. Without it KMeans starts from different
# random centroids on every run, so cluster ids, colours and the silhouette score
# change between executions of the notebook.
RANDOM_STATE = 42

# Step 1: Text Preprocessing and Vectorization
# The vocabulary and IDF weights are learned from the individual publication titles.
vectorizer = TfidfVectorizer(stop_words=combined_stop_words)
tfidf_matrix = vectorizer.fit_transform(df['title'])

# Step 2: Aggregation
# Combine all titles for each author into a single string, then project that
# per-author document onto the vocabulary fitted in step 1.
author_profiles = df.groupby('author')['title'].apply(' '.join).reset_index()
author_tfidf_matrix = vectorizer.transform(author_profiles['title'])

# Step 3: Dimensionality Reduction (optional)
# PCA needs a dense array; random_state only matters if the randomized solver is
# selected, but pinning it keeps the embedding reproducible in that case too.
pca = PCA(n_components=3, random_state=RANDOM_STATE)
reduced_tfidf_matrix = pca.fit_transform(author_tfidf_matrix.toarray())

# Calculate explained variance
explained_variance = pca.explained_variance_ratio_

# Step 4: Clustering
# n_init is set explicitly because its default differs between scikit-learn
# releases; 10 restarts matches the long-standing historical default.
num_clusters = 12
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(reduced_tfidf_matrix)

# Adding cluster labels to the DataFrame
author_profiles['cluster'] = clusters
author_profiles['PCA1'] = reduced_tfidf_matrix[:, 0]
author_profiles['PCA2'] = reduced_tfidf_matrix[:, 1]

# Step 5: Evaluation
# The score is computed in the full 3-component PCA space that KMeans clustered,
# not only on the two components stored above.
silhouette_avg = silhouette_score(reduced_tfidf_matrix, clusters)
print(f'Silhouette Score: {silhouette_avg}')

# Step 6: Enrich the DataFrame using the mapping dictionary
# The 'author' column holds the keys of `data`, so each value can be used to look
# up that author's display name and subject list.
author_ids = author_profiles['author']
author_profiles['author_name'] = author_ids.apply(lambda sid: data[sid]['name_inputted'])
author_profiles['subjects'] = author_ids.apply(lambda sid: ", ".join(data[sid]['subjects']))
# Keep the raw id under an explicit column name for the chart tooltip.
author_profiles['sinta_id'] = author_ids

# Step 7: Visualization with Altair

# Get the most common words for each cluster. groupby walks the frame once and
# hands over each cluster's titles, instead of re-filtering the whole frame with a
# boolean mask for every label.
cluster_words = {
    cluster: ', '.join(get_top_words(titles, stop_words=combined_stop_words, top_n=5))
    for cluster, titles in author_profiles.groupby('cluster', sort=False)['title']
}

# Same ranking per author, computed over that author's combined titles.
author_words = {
    author: ', '.join(get_top_words(titles, stop_words=combined_stop_words, top_n=5))
    for author, titles in author_profiles.groupby('author', sort=False)['title']
}

# Attach both word summaries as columns so they can be shown in the chart tooltip.
author_profiles['top_words_author'] = author_profiles['author'].map(author_words)
author_profiles['top_words_cluster'] = author_profiles['cluster'].map(cluster_words)

# Scatter plot of the authors on the first two principal components. The axis
# titles report how much variance each component explains.
x_axis = alt.X('PCA1', title=f'PCA1 ({explained_variance[0]*100:.2f}% variance)')
y_axis = alt.Y('PCA2', title=f'PCA2 ({explained_variance[1]*100:.2f}% variance)')

# Columns shown when hovering over a point.
tooltip_fields = ['author_name', 'cluster', 'top_words_cluster', 'top_words_author', 'subjects', "sinta_id"]

chart = (
    alt.Chart(author_profiles)
    .mark_circle(size=100)
    .encode(
        x=x_axis,
        y=y_axis,
        color='cluster:N',  # ":N" = nominal, so each cluster gets a distinct colour
        tooltip=tooltip_fields,
    )
    .properties(
        title='Authors Clustering based on Publication Titles',
        width=600,
        height=600,
    )
    .interactive()  # enable pan and zoom
)

chart.show()

In [None]:
# Write the interactive chart to an HTML file, creating the target folder (and any
# missing parents) first; an already existing folder is not an error.
outfile = Path("../figures/01_PCA_publication_titles.html")
figures_dir = outfile.parent
figures_dir.mkdir(parents=True, exist_ok=True)
chart.save(outfile)