In [None]:
# init
import json
import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import spdiags
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [None]:
# Read the PubMed and Scopus `selected.csv` tables.
pubmedSelected = pd.read_csv('pubmed/selected.csv')
scopusSelected = pd.read_csv('scopus/selected.csv')

# Pull the class labels and each feature column out as its own Series.
# Columns are named <BOAW|BON|BONA>_<SR|SFS|TFRF>.

# PubMed
pubmedClass = pubmedSelected['class']
pubmed_boaw_sr, pubmed_bon_sr, pubmed_bona_sr = (
    pubmedSelected[col] for col in ('BOAW_SR', 'BON_SR', 'BONA_SR')
)
pubmed_boaw_sfs, pubmed_bon_sfs, pubmed_bona_sfs = (
    pubmedSelected[col] for col in ('BOAW_SFS', 'BON_SFS', 'BONA_SFS')
)
pubmed_boaw_tfrf, pubmed_bon_tfrf, pubmed_bona_tfrf = (
    pubmedSelected[col] for col in ('BOAW_TFRF', 'BON_TFRF', 'BONA_TFRF')
)

# Scopus
scopusClass = scopusSelected['class']
scopus_boaw_sr, scopus_bon_sr, scopus_bona_sr = (
    scopusSelected[col] for col in ('BOAW_SR', 'BON_SR', 'BONA_SR')
)
scopus_boaw_sfs, scopus_bon_sfs, scopus_bona_sfs = (
    scopusSelected[col] for col in ('BOAW_SFS', 'BON_SFS', 'BONA_SFS')
)
scopus_boaw_tfrf, scopus_bon_tfrf, scopus_bona_tfrf = (
    scopusSelected[col] for col in ('BOAW_TFRF', 'BON_TFRF', 'BONA_TFRF')
)

In [None]:
# TF
def tf(features):
    """Return the raw term-count (TF) matrix of a text column.

    Missing entries in ``features`` are replaced by a single space before
    vectorisation. A fresh ``CountVectorizer`` is fitted on every call and
    the resulting sparse document-term count matrix is returned.
    """
    documents = features.fillna(' ')
    return CountVectorizer().fit_transform(documents)

# Example:
# tfx = tf(pubmed_bona_sr)
# print(tfx)

# TF-IDF
def tf_idf(features):
    """Return the TF-IDF matrix of a text column.

    Missing entries in ``features`` are replaced by a single space before
    vectorisation. A fresh ``TfidfVectorizer`` (default settings) is fitted
    on every call and the resulting sparse document-term matrix is returned.
    """
    documents = features.fillna(' ')
    return TfidfVectorizer().fit_transform(documents)

# Example:
# idfx = tf_idf(scopus_bon_sr)
# print(idfx)

# TF-IDF-ICF
def tf_idf_icf(features, classes):
    """Weight documents by TF-IDF-ICF (TF-IDF scaled by inverse class frequency).

    For every vocabulary term ``t`` the inverse class frequency is
    ``icf(t) = 1 + ln(C / c_t)``, where ``C`` is the number of distinct class
    labels and ``c_t`` is the number of classes that have at least one
    document containing ``t``. Each TF-IDF column is multiplied by its ICF.

    Parameters
    ----------
    features : pandas.Series
        One text document per row; vectorised by ``tf_idf``.
    classes : array-like
        Class label of each document, aligned row-by-row with ``features``.

    Returns
    -------
    scipy sparse matrix
        TF-IDF-ICF weights, one row per document and one column per term.
    """
    # count tf-idf
    tfIdf = tf_idf(features)

    # one-hot encode the class labels: (n_documents, n_classes)
    clsVec = LabelBinarizer()
    cls = clsVec.fit_transform(classes)

    # Count classes from the fitted labels rather than cls.shape[1]:
    # LabelBinarizer collapses one- and two-class problems into ONE column,
    # which would under-count the classes and give terms that only occur in
    # the "negative" class a class count of zero (division by zero below).
    totalClass = len(clsVec.classes_)
    if cls.shape[1] == 1:
        if totalClass == 2:
            # The single column flags classes_[1]; prepend its complement so
            # classes_[0] gets an indicator column of its own.
            cls = np.hstack([1 - cls, cls])
        else:
            # Single class: the binarizer emits all zeros, yet every document
            # belongs to that one class.
            cls = np.ones_like(cls)

    # Entry (t, c) of tfIdf.T @ cls is positive iff some document of class c
    # contains term t (TF-IDF weights are non-negative). Counting the positive
    # entries per row gives the number of classes each term occurs in.
    # `@` is used because `*` is only a matrix product for legacy spmatrix types.
    classFeature = np.asarray((tfIdf.T @ cls) > 0).sum(axis=1).astype(np.float64).ravel()

    # count icf (vectorised; natural logarithm)
    icf = 1.0 + np.log(totalClass / classFeature)

    # scale every TF-IDF column by its ICF via a diagonal matrix so the
    # result stays sparse
    totalFeature = icf.shape[0]
    tfIdfIcf = tfIdf @ spdiags(icf, 0, totalFeature, totalFeature)
    return tfIdfIcf

# Example:
# icfx = tf_idf_icf(pubmed_bon_sr, pubmedClass)
# print(icfx)

In [None]:
# TF-IDF-ICF weights for the PubMed BONA_SR column, using the PubMed class labels.
pubmed_bona_sr_tfIdfIcf = tf_idf_icf(features=pubmed_bona_sr, classes=pubmedClass)

def spherical_kmeans(X, n_clusters, max_iter=300, random_state=None):
    """Cluster row vectors by direction and return a cluster label per row.

    Rows of ``X`` are scaled to unit L2 norm and then clustered with
    scikit-learn's ``KMeans`` (random initialisation). The fitted centroids
    are not re-projected onto the unit sphere, so this approximates spherical
    k-means rather than implementing it exactly.

    Parameters
    ----------
    X : array-like or scipy sparse matrix, shape (n_samples, n_features)
        Input vectors, e.g. a TF-IDF(-ICF) document-term matrix.
    n_clusters : int
        Number of clusters to form.
    max_iter : int, default 300
        Maximum number of k-means iterations.
    random_state : int, RandomState instance or None, default None
        Seed for the random centroid initialisation; pass an int for
        reproducible cluster assignments.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        For every row of ``X``, the index of its nearest fitted cluster centre.
    """
    # L2-normalise every row. Unlike `X / np.linalg.norm(X, ...)`, `normalize`
    # accepts scipy sparse matrices and leaves all-zero rows (e.g. empty
    # documents) as zeros instead of dividing by zero and producing NaN.
    X_normalized = normalize(X, norm='l2', axis=1)

    # Initialize the cluster centers randomly and fit
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='random',
        max_iter=max_iter,
        random_state=random_state,
    )
    kmeans.fit(X_normalized)

    # For each sample, the index of the closest cluster centre, i.e. the
    # sample's cluster label (not the sample closest to each centre).
    cluster_labels, _ = pairwise_distances_argmin_min(X_normalized, kmeans.cluster_centers_)

    return cluster_labels
 
# Cluster the PubMed BONA_SR TF-IDF-ICF vectors.
# A small dense array can be used instead for quick experiments, e.g.:
# X = np.array([[1, 2, 0], [0, 1, 0], [0, 2, 1], [-1, 0, 2], [0, -1, 0], [2, 0, -1]])
n_clusters = 4

cluster_indices = spherical_kmeans(X=pubmed_bona_sr_tfIdfIcf, n_clusters=n_clusters)
cluster_indices