In [1]:
# init
import pandas as pd
import numpy as np
from tqdm import tqdm
# import nltk
# nltk.download()

In [None]:
import glob
import os

In [30]:
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import spdiags

In [3]:
from sklearn.cluster import KMeans
# from spherecluster import SphericalKMeans
from matplotlib import pyplot as plt

In [9]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_mutual_info_score

Import Data

In [None]:
# import pubmed
pubmed = []
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt"))

for docPath in tqdm(docList):
    # get doc file name
    docName = os.path.basename(docPath).split('.')[0]
    className = docName[:3]
    
    with open(docPath, encoding="utf8") as doc:
        # insert [class, docs, feature]
        pubmed.append([className, docName, doc.read().replace('\n', ' ')])

# # make dataframe
# dataframe = pd.DataFrame(data=pubmed, columns=['class', 'document', 'feature']) 

# # export pubmed raw
# dataframe.to_csv('pubmed/raw.csv', index=False)

In [None]:
# import scopus
scopus = []
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/scopus/", "*"))

for docPath in tqdm(docList):
    # get doc file name
    docName = os.path.basename(docPath)
    className = docName.split('-')[0]
    
    with open(docPath, encoding="utf8") as doc:
        # insert [class, docs, feature]
        scopus.append([className, docName, doc.read().replace('\n', ' ')])

# make dataframe
dataframe = pd.DataFrame(data=scopus, columns=['class', 'document', 'feature']) 

# export pubmed raw
dataframe.to_csv('scopus/raw.csv', index=False)

Dataframe Raw

In [None]:
# read csv
pubmedRaw = pd.read_csv('pubmed/raw.csv')
scopusRaw = pd.read_csv('scopus/raw.csv')

# get feature
pubmedFeatures = pubmedRaw.loc[:, 'feature']
scopusFeatures = scopusRaw.loc[:, 'feature']
scopusFeatures

Preprocesing

In [7]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# cleaning
def cleaning(features):
    result = []
    for feature in features:
        regex = re.sub(r'[^a-zA-Z\s]', '', feature)
        result.append(regex)
    return result

# case folding
def caseFolding(features):
    result = []
    for feature in features:
        lower = feature.lower()
        result.append(lower)
    return result

# tokenization
def tokenization(features):
    result = []
    for feature in features:
        token = word_tokenize(feature)
        result.append(token)
    return result

# stopwords removal
def stopWords(features):
    result = []
    stopWords = set(stopwords.words('english'))
    for token in features:
        cleanedFeature = [feature for feature in token if feature not in stopWords]
        result.append(cleanedFeature)
    return result

In [35]:
# preprocessing
def preprocessing(features):
    clean = cleaning(features)
    case = caseFolding(clean)
    token = tokenization(case)
    preprocessed = stopWords(token)
    return preprocessed
    
pubmedPreprocessed = preprocessing(pubmedFeatures)
scopusPreprocessed = preprocessing(scopusFeatures)
# print(scopusPreprocessed[0])

# export pubmed clean
for i in range(len(pubmedPreprocessed)):
    pubmedRaw.loc[i, 'feature'] = ' '.join(pubmedPreprocessed[i])
pubmedRaw.to_csv('pubmed/clean.csv', index=False)

# export scopus clean
for i in range(len(scopusPreprocessed)):
    scopusRaw.loc[i, 'feature'] = ' '.join(scopusPreprocessed[i])
scopusRaw.to_csv('scopus/clean.csv', index=False)

Daraframe Clean

In [None]:
# read csv
pubmedClean = pd.read_csv('pubmed/clean.csv')
scopusClean = pd.read_csv('scopus/clean.csv')

# get feature
pubmedFeatures = pubmedClean.loc[:, 'feature']
scopusFeatures = scopusClean.loc[:, 'feature']
scopusClean

Feature Forming

In [None]:
from textblob import TextBlob

def featureForming(features, dataframe):
    # BOAW
    dataframe.rename(columns={'feature': 'BOAW'}, inplace=True)
    for i in tqdm(range(len(features))):
        # BON
        dataframe.loc[i, 'BON'] = ' '.join(TextBlob(features[i]).noun_phrases)
        # BONA
        dataframe.loc[i, 'BONA'] = ' '.join([word for (word, tag) in TextBlob(features[i]).tags if tag[:2]=='NN' or tag[:2]=='JJ'])

featureForming(pubmedFeatures, pubmedClean)
featureForming(scopusFeatures, scopusClean)
# print(scopusClean)

pubmedClean.to_csv('pubmed/formed.csv', index=False)
scopusClean.to_csv('scopus/formed.csv', index=False)

Dataframe Formed

In [7]:
# read csv
pubmedFormed = pd.read_csv('pubmed/formed.csv')
scopusFormed = pd.read_csv('scopus/formed.csv')

# get feature
# pubmed
pubmedClass = pubmedFormed.loc[:, 'class']
pubmed_boaw = pubmedFormed.loc[:, 'BOAW']
pubmed_bon = pubmedFormed.loc[:, 'BON']
pubmed_bona = pubmedFormed.loc[:, 'BONA']

# scopus
scopusClass = scopusFormed.loc[:, 'class']
scopus_boaw = scopusFormed.loc[:, 'BOAW']
scopus_bon = scopusFormed.loc[:, 'BON']
scopus_bona = scopusFormed.loc[:, 'BONA']
scopusClass

0             CONCRETE
1             CONCRETE
2             CONCRETE
3             CONCRETE
4             CONCRETE
             ...      
2795    TECTONICPLATES
2796    TECTONICPLATES
2797    TECTONICPLATES
2798    TECTONICPLATES
2799    TECTONICPLATES
Name: class, Length: 2800, dtype: object

Feature Selection

In [7]:
# SFS
def symbolic(features, classes):
    # get tf weights
    tfVec = CountVectorizer()
    tf = tfVec.fit_transform(features)

    # define weights into dataframe
    featureName = tfVec.get_feature_names_out()
    featureWeight = tf.todense()
    df = pd.DataFrame(featureWeight, columns=featureName)
    df.loc[:, 'class'] = classes
    # return df

    # count mean & standard deviation
    mean = df.groupby('class').mean().reset_index()
    std = df.groupby('class').std().reset_index()

    totalSm = []
    for col in tqdm(mean.columns):
        if col != 'class':
            # count interval per features
            interval = []
            for row in range(len(mean)):
                meanA = mean.loc[row, col]
                stdA = std.loc[row, col]
                interval.append([meanA - stdA, meanA + stdA])
            
            # count similarity per feature
            similarity = 0
            for itvA in interval:
                # count similarity per class
                smClass = []
                for itvB in interval:
                    if itvA != itvB:
                        smClass.append((min(itvA[1], itvB[1]) - max(itvA[0], itvB[0])) / (itvB[1] - itvB[0]))

                # count total similarity
                similarity += np.nansum(smClass)
            totalSm.append(similarity)

    # count average total similarity
    avgTotalSm = np.mean(totalSm)

    # select feature that totalSm > avgTotalSm
    selected = []
    for i in range(len(totalSm)):
        if totalSm[i] > avgTotalSm:
            selected.append(df.columns[i])
    # return len(selected)
    return selected

# sfx = symbolic(pubmedBona, pubmedClass)
# print(sfx)

In [18]:
# TF-RF
def tf_rf(features, classes):
    # get tf weights
    tfVec = CountVectorizer()
    tf = tfVec.fit_transform(features)

    # get classes weights
    clsVec = LabelBinarizer()
    cls = clsVec.fit_transform(classes)
    # return cls, cls.shape

    # if has only 1 class, add negative class
    if cls.shape[1] == 1:
        cls = np.append(1 - cls, cls, axis=1)

    # count class contain feature (1 if contain, 0 if not, then convert to float)
    featureDoc = ((cls.T * tf) > 0).astype(np.float64)
    # return featureDoc.shape

    for classC in range(cls.shape[1]):
        # featureDoc.sum(axis=0) - featureDoc[classC] is the sum of all rows except classC
        featureDoc[classC] /= np.maximum(1., featureDoc.sum(axis=0) - featureDoc[classC])

    # count rf
    rf = np.mean(np.log2(2 + featureDoc, out=featureDoc), axis=0)

    # count tf-rf & transform to sparse matrix
    totalFeature = rf.shape[0]
    tfRf = tf * spdiags(rf, 0, totalFeature, totalFeature)

    # define weights into dataframe
    featureName = tfVec.get_feature_names_out()
    featureWeight = tfRf.todense()
    df = pd.DataFrame(featureWeight, columns=featureName)
    df.loc[:, 'class'] = classes
    # return df

    # count weight by class
    mean = df.groupby('class').mean()
    result = (mean/mean.max()).sum()
    # return result

    # select minimum weighted features
    selected = []
    for i in tqdm(range(len(result))):
        if result[i] > np.mean(result):
            selected.append(df.columns[i])
    # return len(selected)
    return selected

# rfx = tf_rf(pubmedBona, pubmedClass)
# print(rfx)

In [9]:
# Symbolic Relevance
def symbolic_relevance(features, classes):
    # get sfs
    sfs = symbolic(features, classes)
    # get tf-rf
    tfrf = tf_rf(features, classes)

    # intersect features
    selected = list(np.intersect1d(sfs, tfrf))
    print('sfs:', len(sfs), 'tfrf:', len(tfrf), 'selected:', len(selected))

    # update features
    token = tokenization(features)
    result = []
    for row in tqdm(token):
        words = [word for word in row if word in selected]
        result.append(words)
    return result

# srx = symbolic_relevance(scopusBona, scopusClass)
# print(srx)

In [None]:
# pubmed
pubmed_boaw_sr = symbolic_relevance(pubmed_boaw, pubmedClass)
pubmed_bon_sr = symbolic_relevance(pubmed_bon, pubmedClass)
pubmed_bona_sr = symbolic_relevance(pubmed_bona, pubmedClass)

# scopus
scopus_boaw_sr = symbolic_relevance(scopus_boaw, scopusClass)
scopus_bon_sr = symbolic_relevance(scopus_bon, scopusClass)
scopus_bona_sr = symbolic_relevance(scopus_bona, scopusClass)

In [12]:
# save csv
# pubmed
for i in tqdm(range(len(pubmedFormed))):
    # BOAW
    pubmedFormed.loc[i, 'BOAW_SR'] = ' '.join(pubmed_boaw_sr[i])
    # BON
    pubmedFormed.loc[i, 'BON_SR'] = ' '.join(pubmed_bon_sr[i])
    # BONA
    pubmedFormed.loc[i, 'BONA_SR'] = ' '.join(pubmed_bona_sr[i])

pubmedFormed.to_csv('pubmed/selected.csv', index=False)

# scopus
for i in tqdm(range(len(scopusFormed))):
    # BOAW
    scopusFormed.loc[i, 'BOAW_SR'] = ' '.join(scopus_boaw_sr[i])
    # BON
    scopusFormed.loc[i, 'BON_SR'] = ' '.join(scopus_bon_sr[i])
    # BONA
    scopusFormed.loc[i, 'BONA_SR'] = ' '.join(scopus_bona_sr[i])

scopusFormed.to_csv('scopus/selected.csv', index=False)

100%|██████████| 4000/4000 [00:00<00:00, 4562.78it/s]
100%|██████████| 2800/2800 [00:00<00:00, 5388.66it/s]


In [4]:
# read csv
pubmedSelected = pd.read_csv('pubmed/selected.csv')
scopusSelected = pd.read_csv('scopus/selected.csv')

# get feature
# pubmed
pubmedClass = pubmedSelected.loc[:, 'class']
pubmed_boaw = pubmedSelected.loc[:, 'BOAW']
pubmed_bon = pubmedSelected.loc[:, 'BON']
pubmed_bona = pubmedSelected.loc[:, 'BONA']
pubmed_boaw_sr = pubmedSelected.loc[:, 'BOAW_SR']
pubmed_bon_sr = pubmedSelected.loc[:, 'BON_SR']
pubmed_bona_sr = pubmedSelected.loc[:, 'BONA_SR']

# scopus
scopusClass = scopusSelected.loc[:, 'class']
scopus_boaw = scopusSelected.loc[:, 'BOAW']
scopus_bon = scopusSelected.loc[:, 'BON']
scopus_bona = scopusSelected.loc[:, 'BONA']
scopus_boaw_sr = scopusSelected.loc[:, 'BOAW_SR']
scopus_bon_sr = scopusSelected.loc[:, 'BON_SR']
scopus_bona_sr = scopusSelected.loc[:, 'BONA_SR']
pubmed_bon_sr

0       amounts temporal type tissue patients diagnosi...
1       criteria type diagnosis patients memory charac...
2       synthesis cglucose tissue synthesis co cglucos...
3       pattern type observations patients type dat co...
4       blood rate bodies amino acids blood rate bodie...
                              ...                        
3995    major complex genes outcome null alleles assoc...
3996    cohort patients course patients clinical exami...
3997    main purpose paper trace describe principal cl...
3998    virus inhibit cell antigen pathway recombinant...
3999    progression usefulness lymphocyte purpose usef...
Name: BON_SR, Length: 4000, dtype: object

Term Weighting

In [5]:
# TF
def tf(features):
    # get tf weights & transform to sparse matrix
    tfVec = CountVectorizer()
    tf = tfVec.fit_transform(features.fillna(' '))
    return tf

# tfx = tf(bona)
# print(tfx)

# TF-IDF
def tf_idf(features):
    # get tf-idf weights & transform to sparse matrix
    tfIdfVec = TfidfVectorizer()
    tfIdf = tfIdfVec.fit_transform(features.fillna(' '))
    return tfIdf

# idfx = tf_idf(scopusBonSr)
# print(idfx)

# TF-IDF-ICF
def tf_idf_icf(features, classes):
    # count tf-idf
    tfIdf = tf_idf(features)

    # get classes weights
    clsVec = LabelBinarizer()
    cls = clsVec.fit_transform(classes)

    # count total class
    totalClass = cls.shape[1]

    # count class contain feature (1 if contain, then convert to float, and sum per class)
    classFeature = ((cls.T * tfIdf) > 0).astype(np.float64).sum(axis=0)

    # count icf
    icf = []
    for featureId in range(tfIdf.shape[1]):
        icf.append(1 + math.log(totalClass / classFeature[featureId]))
    icf = np.array(icf)

    # get tf-idf-icf weights & transform to sparse matrix
    totalFeature = icf.shape[0]
    tfIdfIcf = tfIdf * spdiags(icf, 0, totalFeature, totalFeature)
    return tfIdfIcf

# icfx = tf_idf_icf(pubmedBonSr, pubmedClass)
# print(icfx)

In [6]:
# term weighting

# pubmed
# BOAW > SR > TF
pubmed_boaw_sr_tf = tf(pubmed_boaw_sr)
# BOAW > SR > TF-IDF
pubmed_boaw_sr_tfIdf = tf_idf(pubmed_boaw_sr)
# BOAW > SR > TF-IDF-ICF
pubmed_boaw_sr_tfIdfIcf = tf_idf_icf(pubmed_boaw_sr, pubmedClass)
# BOAW > TF-IDF
pubmed_boaw_tfIdf = tf_idf(pubmed_boaw)
# BOAW > TF-IDF-ICF
pubmed_boaw_tfIdfIcf = tf_idf_icf(pubmed_boaw, pubmedClass)

# BON > SR > TF
pubmed_bon_sr_tf = tf(pubmed_bon_sr)
# BON > SR > TF-IDF
pubmed_bon_sr_tfIdf = tf_idf(pubmed_bon_sr)
# BON > SR > TF-IDF-ICF
pubmed_bon_sr_tfIdfIcf = tf_idf_icf(pubmed_bon_sr, pubmedClass)
# BON > TF-IDF
pubmed_bon_tfIdf = tf_idf(pubmed_bon)
# BON > TF-IDF-ICF
pubmed_bon_tfIdfIcf = tf_idf_icf(pubmed_bon, pubmedClass)

# BONA > SR > TF
pubmed_bona_sr_tf = tf(pubmed_bona_sr)
# BONA > SR > TF-IDF
pubmed_bona_sr_tfIdf = tf_idf(pubmed_bona_sr)
# BONA > SR > TF-IDF-ICF
pubmed_bona_sr_tfIdfIcf = tf_idf_icf(pubmed_bona_sr, pubmedClass)
# BONA > TF-IDF
pubmed_bona_tfIdf = tf_idf(pubmed_bona)
# BONA > TF-IDF-ICF
pubmed_bona_tfIdfIcf = tf_idf_icf(pubmed_bona, pubmedClass)


# scopus
# BOAW > SR > TF
scopus_boaw_sr_tf = tf(scopus_boaw_sr)
# BOAW > SR > TF-IDF
scopus_boaw_sr_tfIdf = tf_idf(scopus_boaw_sr)
# BOAW > SR > TF-IDF-ICF
scopus_boaw_sr_tfIdfIcf = tf_idf_icf(scopus_boaw_sr, scopusClass)
# BOAW > TF-IDF
scopus_boaw_tfIdf = tf_idf(scopus_boaw)
# BOAW > TF-IDF-ICF
scopus_boaw_tfIdfIcf = tf_idf_icf(scopus_boaw, scopusClass)

# BON > SR > TF
scopus_bon_sr_tf = tf(scopus_bon_sr)
# BON > SR > TF-IDF
scopus_bon_sr_tfIdf = tf_idf(scopus_bon_sr)
# BON > SR > TF-IDF-ICF
scopus_bon_sr_tfIdfIcf = tf_idf_icf(scopus_bon_sr, scopusClass)
# BON > TF-IDF
scopus_bon_tfIdf = tf_idf(scopus_bon)
# BON > TF-IDF-ICF
scopus_bon_tfIdfIcf = tf_idf_icf(scopus_bon, scopusClass)

# BONA > SR > TF
scopus_bona_sr_tf = tf(scopus_bona_sr)
# BONA > SR > TF-IDF
scopus_bona_sr_tfIdf = tf_idf(scopus_bona_sr)
# BONA > SR > TF-IDF-ICF
scopus_bona_sr_tfIdfIcf = tf_idf_icf(scopus_bona_sr, scopusClass)
# BONA > TF-IDF
scopus_bona_tfIdf = tf_idf(scopus_bona)
# BONA > TF-IDF-ICF
scopus_bona_tfIdfIcf = tf_idf_icf(scopus_bona, scopusClass)
# pubmed_bona_sr_tfIdfIcf

Clustering

In [None]:
# K-Means++
def kmeans_plus(data):
    cluster = {
        'label': []
    }
    
    for k in tqdm(range(2, 11)):
        kmp = KMeans(n_clusters=k, init = 'k-means++')
        kmp.fit(data)
        cluster['label'].append(kmp.labels_)
    return cluster

# kmpx = kmeansPlus(pubmed_bona_sr_tfIdfIcf)
# print(kmpx)

# # Spherical K-Means
# def spher_kmeans(data):
#     label = []
#     for k in tqdm(range(2, 11)):
#         skm = SphericalKMeans(n_clusters=k)
#         skm.fit(data)
#         label = skm.labels_

# skmx = spherKmeans(3, bona_sr_tfidf)
# print(skmx)

In [42]:
# get K-Means++ clusters
# pubmed
# BOAW > SR > TF > K-Means++
pubmed_boaw_sr_tf_kmeansPlus = kmeans_plus(pubmed_boaw_sr_tf)
# BOAW > SR > TF-IDF > K-Means++
pubmed_boaw_sr_tfIdf_kmeansPlus = kmeans_plus(pubmed_boaw_sr_tfIdf)
# BOAW > SR > TF-IDF-ICF > K-Means++
pubmed_boaw_sr_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_boaw_sr_tfIdfIcf)
# BOAW > TF-IDF > K-Means++
pubmed_boaw_tfIdf_kmeansPlus = kmeans_plus(pubmed_boaw_tfIdf)
# BOAW > TF-IDF-ICF > K-Means++
pubmed_boaw_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_boaw_tfIdfIcf)

# BON > SR > TF > K-Means++
pubmed_bon_sr_tf_kmeansPlus = kmeans_plus(pubmed_bon_sr_tf)
# BON > SR > TF-IDF > K-Means++
pubmed_bon_sr_tfIdf_kmeansPlus = kmeans_plus(pubmed_bon_sr_tfIdf)
# BON > SR > TF-IDF-ICF > K-Means++
pubmed_bon_sr_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bon_sr_tfIdfIcf)
# BON > TF-IDF > K-Means++
pubmed_bon_tfIdf_kmeansPlus = kmeans_plus(pubmed_bon_tfIdf)
# BON > TF-IDF-ICF > K-Means++
pubmed_bon_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bon_tfIdfIcf)

# BONA > SR > TF > K-Means++
pubmed_bona_sr_tf_kmeansPlus = kmeans_plus(pubmed_bona_sr_tf)
# BONA > SR > TF-IDF > K-Means++
pubmed_bona_sr_tfIdf_kmeansPlus = kmeans_plus(pubmed_bona_sr_tfIdf)
# BONA > SR > TF-IDF-ICF > K-Means++
pubmed_bona_sr_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bona_sr_tfIdfIcf)
# BONA > TF-IDF > K-Means++
pubmed_bona_tfIdf_kmeansPlus = kmeans_plus(pubmed_bona_tfIdf)
# BONA > TF-IDF-ICF > K-Means++
pubmed_bona_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bona_tfIdfIcf)


# scopus
# BOAW > SR > TF > K-Means++
scopus_boaw_sr_tf_kmeansPlus = kmeans_plus(scopus_boaw_sr_tf)
# BOAW > SR > TF-IDF > K-Means++
scopus_boaw_sr_tfIdf_kmeansPlus = kmeans_plus(scopus_boaw_sr_tfIdf)
# BOAW > SR > TF-IDF-ICF > K-Means++
scopus_boaw_sr_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_boaw_sr_tfIdfIcf)
# BOAW > TF-IDF > K-Means++
scopus_boaw_tfIdf_kmeansPlus = kmeans_plus(scopus_boaw_tfIdf)
# BOAW > TF-IDF-ICF > K-Means++
scopus_boaw_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_boaw_tfIdfIcf)

# BON > SR > TF > K-Means++
scopus_bon_sr_tf_kmeansPlus = kmeans_plus(scopus_bon_sr_tf)
# BON > SR > TF-IDF > K-Means++
scopus_bon_sr_tfIdf_kmeansPlus = kmeans_plus(scopus_bon_sr_tfIdf)
# BON > SR > TF-IDF-ICF > K-Means++
scopus_bon_sr_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bon_sr_tfIdfIcf)
# BON > TF-IDF > K-Means++
scopus_bon_tfIdf_kmeansPlus = kmeans_plus(scopus_bon_tfIdf)
# BON > TF-IDF-ICF > K-Means++
scopus_bon_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bon_tfIdfIcf)

# BONA > SR > TF > K-Means++
scopus_bona_sr_tf_kmeansPlus = kmeans_plus(scopus_bona_sr_tf)
# BONA > SR > TF-IDF > K-Means++
scopus_bona_sr_tfIdf_kmeansPlus = kmeans_plus(scopus_bona_sr_tfIdf)
# BONA > SR > TF-IDF-ICF > K-Means++
scopus_bona_sr_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bona_sr_tfIdfIcf)
# BONA > TF-IDF > K-Means++
scopus_bona_tfIdf_kmeansPlus = kmeans_plus(scopus_bona_tfIdf)
# BONA > TF-IDF-ICF > K-Means++
scopus_bona_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bona_tfIdfIcf)
# pubmed_bona_sr_tfIdfIcf_kmeansPlus

{'label': [array([0, 0, 0, ..., 0, 1, 0]), array([1, 1, 1, ..., 1, 0, 1]), array([2, 1, 1, ..., 1, 3, 1]), array([0, 4, 4, ..., 4, 2, 4]), array([4, 1, 1, ..., 1, 2, 1]), array([2, 3, 6, ..., 3, 0, 6]), array([2, 6, 6, ..., 6, 1, 5]), array([3, 1, 7, ..., 1, 0, 7]), array([1, 9, 3, ..., 3, 2, 3])]}


In [None]:
# get Spherical K-Means clusters
# Pubmed


Performance Analysis

In [None]:
# Silhouette Score
def silhouette(document, label):
    # count score
    score = []
    for k in tqdm(range(len(label))):
        silhouetteScore = silhouette_score(document, label[k])
        score.append(silhouetteScore)
    return score

# sx = silhouette(pubmed_bona_sr_tfIdfIcf, kmpx)
# print(sx)

# Purity
def purity(classes, label):
    # encode class
    clsVec = LabelEncoder()
    cls = clsVec.fit_transform(classes)
    # add 1 to all list (to compare with label)
    cls = cls + 1
    
    # count score
    score = []
    for k in tqdm(range(len(label))):
        contingencyMatrix = contingency_matrix(cls, label[k])
        purityScore = np.sum(np.amax(contingencyMatrix, axis=0)) / np.sum(contingencyMatrix)
        score.append(purityScore)
    return score

# px = purity(pubmedClass, kmpx)
# print(px)

# AMI
def ami(classes, label):
    # encode class
    clsVec = LabelEncoder()
    cls = clsVec.fit_transform(classes)
    # add 1 to all list (to compare with label)
    cls = cls + 1
    
    # count score
    score = []
    for k in tqdm(range(len(label))):
        amiScore = adjusted_mutual_info_score(classes, label[k])
        score.append(amiScore)
    return score

# ax = ami(pubmedClass, kmpx)
# print(ax)