In [1]:
# init
import pandas as pd
import numpy as np
from tqdm import tqdm
# import nltk
# nltk.download()

In [None]:
import glob
import os

In [3]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import spdiags

In [3]:
from sklearn.cluster import KMeans
# from spherecluster import SphericalKMeans
from matplotlib import pyplot as plt

In [4]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_mutual_info_score

Import Data

In [None]:
# import pubmed
pubmed = []
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt"))

for docPath in tqdm(docList):
    # get doc file name
    docName = os.path.basename(docPath).split('.')[0]
    className = docName[:3]
    
    with open(docPath, encoding="utf8") as doc:
        # insert [class, docs, feature]
        pubmed.append([className, docName, doc.read().replace('\n', ' ')])

# make dataframe
dataframe = pd.DataFrame(data=pubmed, columns=['class', 'document', 'feature']) 

# export pubmed raw
dataframe.to_csv('pubmed/raw.csv', index=False)

In [None]:
# import scopus
scopus = []
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/scopus/", "*"))

for docPath in tqdm(docList):
    # get doc file name
    docName = os.path.basename(docPath)
    className = docName.split('-')[0]
    
    with open(docPath, encoding="utf8") as doc:
        # insert [class, docs, feature]
        scopus.append([className, docName, doc.read().replace('\n', ' ')])

# make dataframe
dataframe = pd.DataFrame(data=scopus, columns=['class', 'document', 'feature']) 

# export pubmed raw
dataframe.to_csv('scopus/raw.csv', index=False)

Dataframe Raw

In [6]:
# read csv
pubmedRaw = pd.read_csv('pubmed/raw.csv')
scopusRaw = pd.read_csv('scopus/raw.csv')

# get feature
pubmedFeatures = pubmedRaw.loc[:, 'feature']
scopusFeatures = scopusRaw.loc[:, 'feature']
scopusFeatures

613

Preprocesing

In [4]:
# cleaning
def cleaning(features):
    result = []
    for feature in features:
        regex = re.sub(r'[^a-zA-Z\s]', '', feature)
        result.append(regex)
    return result

# case folding
def caseFolding(features):
    result = []
    for feature in features:
        lower = feature.lower()
        result.append(lower)
    return result

# tokenization
def tokenization(features):
    result = []
    for feature in features:
        token = word_tokenize(feature)
        result.append(token)
    return result

# stopwords removal
def stopWords(features):
    result = []
    stopWords = set(stopwords.words('english'))
    for token in features:
        cleanedFeature = [feature for feature in token if feature not in stopWords]
        result.append(cleanedFeature)
    return result

In [35]:
# preprocessing
def preprocessing(features):
    clean = cleaning(features)
    case = caseFolding(clean)
    token = tokenization(case)
    preprocessed = stopWords(token)
    return preprocessed
    
pubmedPreprocessed = preprocessing(pubmedFeatures)
scopusPreprocessed = preprocessing(scopusFeatures)
# print(scopusPreprocessed[0])

# export pubmed clean
for i in range(len(pubmedPreprocessed)):
    pubmedRaw.loc[i, 'feature'] = ' '.join(pubmedPreprocessed[i])
pubmedRaw.to_csv('pubmed/clean.csv', index=False)

# export scopus clean
for i in range(len(scopusPreprocessed)):
    scopusRaw.loc[i, 'feature'] = ' '.join(scopusPreprocessed[i])
scopusRaw.to_csv('scopus/clean.csv', index=False)

Daraframe Clean

In [7]:
# read csv
pubmedClean = pd.read_csv('pubmed/clean.csv')
scopusClean = pd.read_csv('scopus/clean.csv')

# get feature
pubmedFeatures = pubmedClean.loc[:, 'feature']
scopusFeatures = scopusClean.loc[:, 'feature']
scopusClean

473

Feature Forming

In [None]:
from textblob import TextBlob

def featureForming(features, dataframe):
    # BOAW
    dataframe.rename(columns={'feature': 'BOAW'}, inplace=True)
    for i in tqdm(range(len(features))):
        # BON
        dataframe.loc[i, 'BON'] = ' '.join(TextBlob(features[i]).noun_phrases)
        # BONA
        dataframe.loc[i, 'BONA'] = ' '.join([word for (word, tag) in TextBlob(features[i]).tags if tag[:2]=='NN' or tag[:2]=='JJ'])

featureForming(pubmedFeatures, pubmedClean)
featureForming(scopusFeatures, scopusClean)
# print(scopusClean)

pubmedClean.to_csv('pubmed/formed.csv', index=False)
scopusClean.to_csv('scopus/formed.csv', index=False)

Dataframe Formed

In [9]:
# read csv
pubmedFormed = pd.read_csv('pubmed/formed.csv')
scopusFormed = pd.read_csv('scopus/formed.csv')

# get feature
# pubmed
pubmedClass = pubmedFormed.loc[:, 'class']
pubmed_boaw = pubmedFormed.loc[:, 'BOAW']
pubmed_bon = pubmedFormed.loc[:, 'BON']
pubmed_bona = pubmedFormed.loc[:, 'BONA']

# scopus
scopusClass = scopusFormed.loc[:, 'class']
scopus_boaw = scopusFormed.loc[:, 'BOAW']
scopus_bon = scopusFormed.loc[:, 'BON']
scopus_bona = scopusFormed.loc[:, 'BONA']
scopusClass

0             CONCRETE
1             CONCRETE
2             CONCRETE
3             CONCRETE
4             CONCRETE
             ...      
2795    TECTONICPLATES
2796    TECTONICPLATES
2797    TECTONICPLATES
2798    TECTONICPLATES
2799    TECTONICPLATES
Name: class, Length: 2800, dtype: object

Feature Selection

In [5]:
# SFS
def symbolic(features, classes):
    # get tf weights
    tfVec = CountVectorizer()
    tf = tfVec.fit_transform(features)

    # define weights into dataframe
    featureName = tfVec.get_feature_names_out()
    featureWeight = tf.todense()
    df = pd.DataFrame(featureWeight, columns=featureName)
    df.loc[:, 'class'] = classes
    # return df

    # count mean & standard deviation
    mean = df.groupby('class').mean().reset_index()
    std = df.groupby('class').std().reset_index()

    totalSm = []
    for col in tqdm(mean.columns):
        if col != 'class':
            # count interval per features
            interval = []
            for row in range(len(mean)):
                meanA = mean.loc[row, col]
                stdA = std.loc[row, col]
                interval.append([meanA - stdA, meanA + stdA])
            
            # count similarity per feature
            similarity = 0
            for itvA in interval:
                # count similarity per class
                smClass = []
                for itvB in interval:
                    if itvA != itvB:
                        # check if interval B equals 0
                        sm = 0
                        if (itvB[1] - itvB[0]) != 0:
                            sm = (min(itvA[1], itvB[1]) - max(itvA[0], itvB[0])) / (itvB[1] - itvB[0])
                        smClass.append(sm)

                # count total similarity
                similarity += np.sum(smClass)
            totalSm.append(similarity)

    # count average total similarity
    avgTotalSm = np.mean(totalSm)

    # select feature that totalSm > avgTotalSm
    selected = []
    for i in range(len(totalSm)):
        if totalSm[i] > avgTotalSm:
            selected.append(df.columns[i])
    # return len(selected)
    return selected

# sfx = symbolic(scopus_boaw, scopusClass)
# print(len(sfx))

In [6]:
# TF-RF
def relevance(features, classes):
    # get tf weights
    tfVec = CountVectorizer()
    tf = tfVec.fit_transform(features)

    # get classes weights
    clsVec = LabelBinarizer()
    cls = clsVec.fit_transform(classes)
    # return cls, cls.shape

    # if has only 1 class, add negative class
    if cls.shape[1] == 1:
        cls = np.append(1 - cls, cls, axis=1)

    # count class contain feature (1 if contain, 0 if not, then convert to float)
    # featureDoc = ((cls.T * tf) > 0).astype(np.float64)
    featureDoc = (cls.T * tf).astype(np.float64)
    # return featureDoc.shape

    for classC in range(cls.shape[1]):
        # featureDoc.sum(axis=0) - featureDoc[classC] is the sum of all rows except classC
        featureDoc[classC] /= np.maximum(1., featureDoc.sum(axis=0) - featureDoc[classC])

    # count rf
    rf = np.mean(np.log2(2 + featureDoc, out=featureDoc), axis=0)

    # count tf-rf & transform to sparse matrix
    totalFeature = rf.shape[0]
    tfRf = tf * spdiags(rf, 0, totalFeature, totalFeature)

    # define weights into dataframe
    featureName = tfVec.get_feature_names_out()
    featureWeight = tfRf.todense()
    df = pd.DataFrame(featureWeight, columns=featureName)
    df.loc[:, 'class'] = classes
    # return df

    # count weight by class
    mean = df.groupby('class').mean()
    result = (mean/mean.max()).sum()
    # return result

    # select minimum weighted features
    selected = []
    for i in tqdm(range(len(result))):
        if result[i] > np.mean(result):
            selected.append(df.columns[i])
    # return len(selected)
    return selected

# rfx = relevance(pubmed_bon, pubmedClass)
# print(len(rfx))

In [7]:
# Implementation
# Symbolic Relevance
def symbolic_relevance(features, classes):
    # get sfs
    sfs = symbolic(features, classes)
    # get tf-rf
    tfrf = relevance(features, classes)

    # intersect features
    selected = list(np.intersect1d(sfs, tfrf))
    # print(len(selected))

    # update features
    token = tokenization(features)
    result = []
    for row in tqdm(token):
        words = [word for word in row if word in selected]
        result.append(words)
    return result

# srx = symbolic_relevance(scopus_bona, scopusClass)
# print(srx)

# SFS
def symbolic_feature_selection(features, classes):
    # get sfs
    selected = symbolic(features, classes)
    # print(len(selected))

    # update features
    token = tokenization(features)
    result = []
    for row in tqdm(token):
        words = [word for word in row if word in selected]
        result.append(words)
    return result

# sfsx = sfs(scopus_bona, scopusClass)
# print(sfsx)

# TF-RF
def term_frequency_relevance_frequency(features, classes):
    # get tfrf
    selected = relevance(features, classes)
    # print(len(selected))

    # update features
    token = tokenization(features)
    result = []
    for row in tqdm(token):
        words = [word for word in row if word in selected]
        result.append(words)
    return result

# tfrfx = tfrf(scopus_bona, scopusClass)
# print(tfrfx)

In [10]:
# Symbolic Relevance
# pubmed
pubmed_boaw_sr = symbolic_relevance(pubmed_boaw, pubmedClass)
pubmed_bon_sr = symbolic_relevance(pubmed_bon, pubmedClass)
pubmed_bona_sr = symbolic_relevance(pubmed_bona, pubmedClass)

# scopus
scopus_boaw_sr = symbolic_relevance(scopus_boaw, scopusClass)
scopus_bon_sr = symbolic_relevance(scopus_bon, scopusClass)
scopus_bona_sr = symbolic_relevance(scopus_bona, scopusClass)

# SFS
# pubmed
pubmed_boaw_sfs = symbolic_feature_selection(pubmed_boaw, pubmedClass)
pubmed_bon_sfs = symbolic_feature_selection(pubmed_bon, pubmedClass)
pubmed_bona_sfs = symbolic_feature_selection(pubmed_bona, pubmedClass)

# scopus
scopus_boaw_sfs = symbolic_feature_selection(scopus_boaw, scopusClass)
scopus_bon_sfs = symbolic_feature_selection(scopus_bon, scopusClass)
scopus_bona_sfs = symbolic_feature_selection(scopus_bona, scopusClass)

# TF-RF
# pubmed
pubmed_boaw_tfrf = term_frequency_relevance_frequency(pubmed_boaw, pubmedClass)
pubmed_bon_tfrf = term_frequency_relevance_frequency(pubmed_bon, pubmedClass)
pubmed_bona_tfrf = term_frequency_relevance_frequency(pubmed_bona, pubmedClass)

# scopus
scopus_boaw_tfrf = term_frequency_relevance_frequency(scopus_boaw, scopusClass)
scopus_bon_tfrf = term_frequency_relevance_frequency(scopus_bon, scopusClass)
scopus_bona_tfrf = term_frequency_relevance_frequency(scopus_bona, scopusClass)

100%|██████████| 21873/21873 [00:04<00:00, 5264.37it/s]
100%|██████████| 21872/21872 [00:02<00:00, 9076.93it/s]
100%|██████████| 4000/4000 [00:20<00:00, 197.02it/s]
100%|██████████| 17596/17596 [00:03<00:00, 4831.41it/s]
100%|██████████| 17595/17595 [00:01<00:00, 9965.60it/s] 
100%|██████████| 4000/4000 [00:13<00:00, 301.94it/s]
100%|██████████| 18358/18358 [00:03<00:00, 4991.61it/s]
100%|██████████| 18357/18357 [00:01<00:00, 9806.97it/s]
100%|██████████| 4000/4000 [00:14<00:00, 277.44it/s]
100%|██████████| 32980/32980 [00:09<00:00, 3464.88it/s]
100%|██████████| 32979/32979 [00:04<00:00, 7592.20it/s]
100%|██████████| 2800/2800 [00:22<00:00, 122.26it/s]
100%|██████████| 26341/26341 [00:07<00:00, 3533.22it/s]
100%|██████████| 26340/26340 [00:03<00:00, 8118.46it/s]
100%|██████████| 2800/2800 [00:11<00:00, 240.72it/s]
100%|██████████| 27044/27044 [00:08<00:00, 3364.42it/s]
100%|██████████| 27043/27043 [00:03<00:00, 8019.75it/s]
100%|██████████| 2800/2800 [00:13<00:00, 210.65it/s]
100%|████

In [11]:
# save csv
# pubmed
pubmedFormed.drop(['BOAW', 'BON', 'BONA'], axis=1, inplace=True)
for i in tqdm(range(len(pubmedFormed))):
    # BOAW_SR
    pubmedFormed.loc[i, 'BOAW_SR'] = ' '.join(pubmed_boaw_sr[i])
    # BON_SR
    pubmedFormed.loc[i, 'BON_SR'] = ' '.join(pubmed_bon_sr[i])
    # BONA_SR
    pubmedFormed.loc[i, 'BONA_SR'] = ' '.join(pubmed_bona_sr[i])
    
    # BOAW_SFS
    pubmedFormed.loc[i, 'BOAW_SFS'] = ' '.join(pubmed_boaw_sfs[i])
    # BON_SFS
    pubmedFormed.loc[i, 'BON_SFS'] = ' '.join(pubmed_bon_sfs[i])
    # BONA_SFS
    pubmedFormed.loc[i, 'BONA_SFS'] = ' '.join(pubmed_bona_sfs[i])

    # BOAW_TFRF
    pubmedFormed.loc[i, 'BOAW_TFRF'] = ' '.join(pubmed_boaw_tfrf[i])
    # BON_TFRF
    pubmedFormed.loc[i, 'BON_TFRF'] = ' '.join(pubmed_bon_tfrf[i])
    # BONA_TFRF
    pubmedFormed.loc[i, 'BONA_TFRF'] = ' '.join(pubmed_bona_tfrf[i])

pubmedFormed.to_csv('pubmed/selected.csv', index=False)

# scopus
scopusFormed.drop(['BOAW', 'BON', 'BONA'], axis=1, inplace=True)
for i in tqdm(range(len(scopusFormed))):
    # BOAW_SR
    scopusFormed.loc[i, 'BOAW_SR'] = ' '.join(scopus_boaw_sr[i])
    # BON_SR
    scopusFormed.loc[i, 'BON_SR'] = ' '.join(scopus_bon_sr[i])
    # BONA_SR
    scopusFormed.loc[i, 'BONA_SR'] = ' '.join(scopus_bona_sr[i])
    
    # BOAW_SFS
    scopusFormed.loc[i, 'BOAW_SFS'] = ' '.join(scopus_boaw_sfs[i])
    # BON_SFS
    scopusFormed.loc[i, 'BON_SFS'] = ' '.join(scopus_bon_sfs[i])
    # BONA_SFS
    scopusFormed.loc[i, 'BONA_SFS'] = ' '.join(scopus_bona_sfs[i])

    # BOAW_TFRF
    scopusFormed.loc[i, 'BOAW_TFRF'] = ' '.join(scopus_boaw_tfrf[i])
    # BON_TFRF
    scopusFormed.loc[i, 'BON_TFRF'] = ' '.join(scopus_bon_tfrf[i])
    # BONA_TFRF
    scopusFormed.loc[i, 'BONA_TFRF'] = ' '.join(scopus_bona_tfrf[i])

scopusFormed.to_csv('scopus/selected.csv', index=False)

100%|██████████| 4000/4000 [00:02<00:00, 1518.06it/s]
100%|██████████| 2800/2800 [00:01<00:00, 1727.12it/s]


In [12]:
# read csv
pubmedSelected = pd.read_csv('pubmed/selected.csv')
scopusSelected = pd.read_csv('scopus/selected.csv')

# get feature
# pubmed
pubmedClass = pubmedSelected.loc[:, 'class']
pubmed_boaw_sr = pubmedSelected.loc[:, 'BOAW_SR']
pubmed_bon_sr = pubmedSelected.loc[:, 'BON_SR']
pubmed_bona_sr = pubmedSelected.loc[:, 'BONA_SR']
pubmed_boaw_sfs = pubmedSelected.loc[:, 'BOAW_SFS']
pubmed_bon_sfs = pubmedSelected.loc[:, 'BON_SFS']
pubmed_bona_sfs = pubmedSelected.loc[:, 'BONA_SFS']
pubmed_boaw_tfrf = pubmedSelected.loc[:, 'BOAW_TFRF']
pubmed_bon_tfrf = pubmedSelected.loc[:, 'BON_TFRF']
pubmed_bona_tfrf = pubmedSelected.loc[:, 'BONA_TFRF']

# scopus
scopusClass = scopusSelected.loc[:, 'class']
scopus_boaw_sr = scopusSelected.loc[:, 'BOAW_SR']
scopus_bon_sr = scopusSelected.loc[:, 'BON_SR']
scopus_bona_sr = scopusSelected.loc[:, 'BONA_SR']
scopus_boaw_sfs = scopusSelected.loc[:, 'BOAW_SFS']
scopus_bon_sfs = scopusSelected.loc[:, 'BON_SFS']
scopus_bona_sfs = scopusSelected.loc[:, 'BONA_SFS']
scopus_boaw_tfrf = scopusSelected.loc[:, 'BOAW_TFRF']
scopus_bon_tfrf = scopusSelected.loc[:, 'BON_TFRF']
scopus_bona_tfrf = scopusSelected.loc[:, 'BONA_TFRF']
pubmedSelected

Unnamed: 0,class,document,BOAW_SR,BON_SR,BONA_SR,BOAW_SFS,BON_SFS,BONA_SFS,BOAW_TFRF,BON_TFRF,BONA_TFRF
0,ALZ,ALZ1,reduced amounts tissue patients diagnosis comp...,amounts temporal type tissue patients diagnosi...,reduced amounts temporal type tissue patients ...,reduced amounts cortex brain tissue patients d...,amounts temporal cortex type brain tissue pati...,reduced amounts temporal cortex type brain tis...,reduced amounts tissue patients diagnosis comp...,amounts temporal type tissue patients diagnosi...,reduced amounts temporal type tissue patients ...
1,ALZ,ALZ10,criteria diagnosis patients presenting memory ...,criteria type diagnosis patients memory charac...,criteria diagnosis patients memory deficits ch...,criteria diagnosis patients presenting memory ...,criteria type diagnosis patients memory charac...,criteria diagnosis patients memory deficits ch...,criteria diagnosis patients presenting memory ...,criteria type diagnosis patients memory defici...,criteria diagnosis patients memory deficits ch...
2,ALZ,ALZ100,synthesis cglucose tissue synthesis co cglucos...,synthesis cglucose tissue synthesis co cglucos...,cglucose tissue co cglucose tissue electron ra...,synthesis cglucose tissue synthesis co cglucos...,synthesis cglucose tissue synthesis co cglucos...,cglucose tissue co cglucose tissue electron ra...,synthesis dioxide cglucose tissue synthesis co...,synthesis dioxide production cglucose tissue s...,dioxide production cglucose tissue co producti...
3,ALZ,ALZ1000,pattern observations thirteen patients dat abi...,pattern type observations patients type dat co...,pattern type thirteen patients type dat abilit...,pattern observations thirteen patients dat abi...,pattern type observations patients type dat co...,pattern type thirteen patients type dat abilit...,pattern observations thirteen patients dat abi...,pattern reading type observations patients typ...,pattern type thirteen patients type dat abilit...
4,ALZ,ALZ101,blood rate bodies amino acids blood rate bodie...,blood rate bodies amino acids blood rate bodie...,blood rate bodies acids blood rate bodies acid...,cerebral blood rate bodies amino acids cerebra...,cerebral blood rate bodies amino acids cerebra...,cerebral blood rate bodies acids cerebral bloo...,blood rate bodies amino acids blood rate bodie...,blood rate bodies amino acids blood rate bodie...,blood rate bodies acids blood rate bodies acid...
...,...,...,...,...,...,...,...,...,...,...,...
3995,HIV,HIV3996,major genes influence outcome alleles explain ...,major complex genes outcome null alleles assoc...,major genes influence outcome null alleles ass...,major genes influence outcome alleles explain ...,major complex genes outcome null alleles assoc...,major genes influence outcome null alleles ass...,major genes influence outcome alleles explain ...,major complex genes outcome null alleles diver...,major complex genes influence outcome null all...
3996,HIV,HIV3997,cohort patients patients aged clinical examina...,cohort patients course patients clinical exami...,cohort patients course patients years clinical...,cohort patients patients aged clinical examina...,cohort patients course patients clinical exami...,cohort patients course patients years clinical...,cohort patients patients aged years clinical e...,cohort patients course patients clinical exami...,cohort patients course patients years clinical...
3997,HIV,HIV3998,main classifications purpose paper trace descr...,main purpose paper trace describe principal cl...,main classifications purpose paper trace descr...,main classifications purpose paper trace descr...,main purpose paper trace describe principal cl...,main classifications purpose paper trace descr...,main classifications purpose paper trace descr...,main purpose paper trace describe principal cl...,main classifications purpose paper trace descr...
3998,HIV,HIV3999,inhibit cell antigen pathway recombinant cell ...,virus inhibit cell antigen pathway recombinant...,virus cell antigen pathway recombinant cell an...,virus inhibit cell antigen pathway recombinant...,virus inhibit cell antigen pathway recombinant...,virus cell antigen pathway recombinant cell an...,inhibit cell antigen pathway interference reco...,immunodeficiency virus inhibit cell antigen pa...,virus cell antigen pathway recombinant cell an...


Term Weighting

In [13]:
# TF
def tf(features):
    # get tf weights & transform to sparse matrix
    tfVec = CountVectorizer()
    tf = tfVec.fit_transform(features.fillna(' '))
    return tf

# tfx = tf(bona)
# print(tfx)

# TF-IDF
def tf_idf(features):
    # get tf-idf weights & transform to sparse matrix
    tfIdfVec = TfidfVectorizer()
    tfIdf = tfIdfVec.fit_transform(features.fillna(' '))
    return tfIdf

# idfx = tf_idf(scopusBonSr)
# print(idfx)

# TF-IDF-ICF
def tf_idf_icf(features, classes):
    # count tf-idf
    tfIdf = tf_idf(features)

    # get classes weights
    clsVec = LabelBinarizer()
    cls = clsVec.fit_transform(classes)

    # count total class
    totalClass = cls.shape[1]

    # count class contain feature (1 if contain, then convert to float, and sum per class)
    classFeature = ((cls.T * tfIdf) > 0).astype(np.float64).sum(axis=0)

    # count icf
    icf = []
    for featureId in range(tfIdf.shape[1]):
        icf.append(1 + math.log(totalClass / classFeature[featureId]))
    icf = np.array(icf)

    # get tf-idf-icf weights & transform to sparse matrix
    totalFeature = icf.shape[0]
    tfIdfIcf = tfIdf * spdiags(icf, 0, totalFeature, totalFeature)
    return tfIdfIcf

# icfx = tf_idf_icf(pubmedBonSr, pubmedClass)
# print(icfx)

In [8]:
# term weighting

# pubmed
# BOAW > SR > TF
pubmed_boaw_sr_tf = tf(pubmed_boaw_sr)
# BOAW > SR > TF-IDF
pubmed_boaw_sr_tfIdf = tf_idf(pubmed_boaw_sr)
# BOAW > SR > TF-IDF-ICF
pubmed_boaw_sr_tfIdfIcf = tf_idf_icf(pubmed_boaw_sr, pubmedClass)
# BOAW > TF-IDF
pubmed_boaw_tfIdf = tf_idf(pubmed_boaw)
# BOAW > TF-IDF-ICF
pubmed_boaw_tfIdfIcf = tf_idf_icf(pubmed_boaw, pubmedClass)

# BON > SR > TF
pubmed_bon_sr_tf = tf(pubmed_bon_sr)
# BON > SR > TF-IDF
pubmed_bon_sr_tfIdf = tf_idf(pubmed_bon_sr)
# BON > SR > TF-IDF-ICF
pubmed_bon_sr_tfIdfIcf = tf_idf_icf(pubmed_bon_sr, pubmedClass)
# BON > TF-IDF
pubmed_bon_tfIdf = tf_idf(pubmed_bon)
# BON > TF-IDF-ICF
pubmed_bon_tfIdfIcf = tf_idf_icf(pubmed_bon, pubmedClass)

# BONA > SR > TF
pubmed_bona_sr_tf = tf(pubmed_bona_sr)
# BONA > SR > TF-IDF
pubmed_bona_sr_tfIdf = tf_idf(pubmed_bona_sr)
# BONA > SR > TF-IDF-ICF
pubmed_bona_sr_tfIdfIcf = tf_idf_icf(pubmed_bona_sr, pubmedClass)
# BONA > TF-IDF
pubmed_bona_tfIdf = tf_idf(pubmed_bona)
# BONA > TF-IDF-ICF
pubmed_bona_tfIdfIcf = tf_idf_icf(pubmed_bona, pubmedClass)


# scopus
# BOAW > SR > TF
scopus_boaw_sr_tf = tf(scopus_boaw_sr)
# BOAW > SR > TF-IDF
scopus_boaw_sr_tfIdf = tf_idf(scopus_boaw_sr)
# BOAW > SR > TF-IDF-ICF
scopus_boaw_sr_tfIdfIcf = tf_idf_icf(scopus_boaw_sr, scopusClass)
# BOAW > TF-IDF
scopus_boaw_tfIdf = tf_idf(scopus_boaw)
# BOAW > TF-IDF-ICF
scopus_boaw_tfIdfIcf = tf_idf_icf(scopus_boaw, scopusClass)

# BON > SR > TF
scopus_bon_sr_tf = tf(scopus_bon_sr)
# BON > SR > TF-IDF
scopus_bon_sr_tfIdf = tf_idf(scopus_bon_sr)
# BON > SR > TF-IDF-ICF
scopus_bon_sr_tfIdfIcf = tf_idf_icf(scopus_bon_sr, scopusClass)
# BON > TF-IDF
scopus_bon_tfIdf = tf_idf(scopus_bon)
# BON > TF-IDF-ICF
scopus_bon_tfIdfIcf = tf_idf_icf(scopus_bon, scopusClass)

# BONA > SR > TF
scopus_bona_sr_tf = tf(scopus_bona_sr)
# BONA > SR > TF-IDF
scopus_bona_sr_tfIdf = tf_idf(scopus_bona_sr)
# BONA > SR > TF-IDF-ICF
scopus_bona_sr_tfIdfIcf = tf_idf_icf(scopus_bona_sr, scopusClass)
# BONA > TF-IDF
scopus_bona_tfIdf = tf_idf(scopus_bona)
# BONA > TF-IDF-ICF
scopus_bona_tfIdfIcf = tf_idf_icf(scopus_bona, scopusClass)
# pubmed_bona_sr_tfIdfIcf

Clustering

In [9]:
# K-Means++
def kmeans_plus(data):
    label = []
    for k in tqdm(range(2, 11)):
        kmp = KMeans(n_clusters=k, init = 'k-means++')
        kmp.fit(data)
        label.append(kmp.labels_)
    return label

# kmpx = kmeansPlus(pubmed_bona_sr_tfIdfIcf)
# print(kmpx)

# # Spherical K-Means
# def spher_kmeans(data):
#     label = []
#     for k in tqdm(range(2, 11)):
#         skm = SphericalKMeans(n_clusters=k)
#         skm.fit(data)
#         label = skm.labels_

# skmx = spherKmeans(3, bona_sr_tfidf)
# print(skmx)

In [10]:
# get K-Means++ clusters
# # pubmed
# # BOAW > SR > TF > K-Means++
# pubmed_boaw_sr_tf_kmeansPlus = kmeans_plus(pubmed_boaw_sr_tf)
# # BOAW > SR > TF-IDF > K-Means++
# pubmed_boaw_sr_tfIdf_kmeansPlus = kmeans_plus(pubmed_boaw_sr_tfIdf)
# # BOAW > SR > TF-IDF-ICF > K-Means++
# pubmed_boaw_sr_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_boaw_sr_tfIdfIcf)
# # BOAW > TF-IDF > K-Means++
# pubmed_boaw_tfIdf_kmeansPlus = kmeans_plus(pubmed_boaw_tfIdf)
# # BOAW > TF-IDF-ICF > K-Means++
# pubmed_boaw_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_boaw_tfIdfIcf)

# BON > SR > TF > K-Means++
pubmed_bon_sr_tf_kmeansPlus = kmeans_plus(pubmed_bon_sr_tf)
# BON > SR > TF-IDF > K-Means++
pubmed_bon_sr_tfIdf_kmeansPlus = kmeans_plus(pubmed_bon_sr_tfIdf)
# BON > SR > TF-IDF-ICF > K-Means++
pubmed_bon_sr_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bon_sr_tfIdfIcf)
# BON > TF-IDF > K-Means++
pubmed_bon_tfIdf_kmeansPlus = kmeans_plus(pubmed_bon_tfIdf)
# BON > TF-IDF-ICF > K-Means++
pubmed_bon_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bon_tfIdfIcf)

# # BONA > SR > TF > K-Means++
# pubmed_bona_sr_tf_kmeansPlus = kmeans_plus(pubmed_bona_sr_tf)
# # BONA > SR > TF-IDF > K-Means++
# pubmed_bona_sr_tfIdf_kmeansPlus = kmeans_plus(pubmed_bona_sr_tfIdf)
# # BONA > SR > TF-IDF-ICF > K-Means++
# pubmed_bona_sr_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bona_sr_tfIdfIcf)
# # BONA > TF-IDF > K-Means++
# pubmed_bona_tfIdf_kmeansPlus = kmeans_plus(pubmed_bona_tfIdf)
# # BONA > TF-IDF-ICF > K-Means++
# pubmed_bona_tfIdfIcf_kmeansPlus = kmeans_plus(pubmed_bona_tfIdfIcf)


# # scopus
# # BOAW > SR > TF > K-Means++
# scopus_boaw_sr_tf_kmeansPlus = kmeans_plus(scopus_boaw_sr_tf)
# # BOAW > SR > TF-IDF > K-Means++
# scopus_boaw_sr_tfIdf_kmeansPlus = kmeans_plus(scopus_boaw_sr_tfIdf)
# # BOAW > SR > TF-IDF-ICF > K-Means++
# scopus_boaw_sr_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_boaw_sr_tfIdfIcf)
# # BOAW > TF-IDF > K-Means++
# scopus_boaw_tfIdf_kmeansPlus = kmeans_plus(scopus_boaw_tfIdf)
# # BOAW > TF-IDF-ICF > K-Means++
# scopus_boaw_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_boaw_tfIdfIcf)

# # BON > SR > TF > K-Means++
# scopus_bon_sr_tf_kmeansPlus = kmeans_plus(scopus_bon_sr_tf)
# # BON > SR > TF-IDF > K-Means++
# scopus_bon_sr_tfIdf_kmeansPlus = kmeans_plus(scopus_bon_sr_tfIdf)
# # BON > SR > TF-IDF-ICF > K-Means++
# scopus_bon_sr_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bon_sr_tfIdfIcf)
# # BON > TF-IDF > K-Means++
# scopus_bon_tfIdf_kmeansPlus = kmeans_plus(scopus_bon_tfIdf)
# # BON > TF-IDF-ICF > K-Means++
# scopus_bon_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bon_tfIdfIcf)

# # BONA > SR > TF > K-Means++
# scopus_bona_sr_tf_kmeansPlus = kmeans_plus(scopus_bona_sr_tf)
# # BONA > SR > TF-IDF > K-Means++
# scopus_bona_sr_tfIdf_kmeansPlus = kmeans_plus(scopus_bona_sr_tfIdf)
# # BONA > SR > TF-IDF-ICF > K-Means++
# scopus_bona_sr_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bona_sr_tfIdfIcf)
# # BONA > TF-IDF > K-Means++
# scopus_bona_tfIdf_kmeansPlus = kmeans_plus(scopus_bona_tfIdf)
# # BONA > TF-IDF-ICF > K-Means++
# scopus_bona_tfIdfIcf_kmeansPlus = kmeans_plus(scopus_bona_tfIdfIcf)
# # pubmed_bona_sr_tfIdfIcf_kmeansPlus

100%|██████████| 9/9 [00:07<00:00,  1.21it/s]
100%|██████████| 9/9 [00:09<00:00,  1.05s/it]
100%|██████████| 9/9 [00:08<00:00,  1.12it/s]
100%|██████████| 9/9 [00:23<00:00,  2.63s/it]
100%|██████████| 9/9 [00:17<00:00,  1.90s/it]


In [None]:
# get Spherical K-Means clusters
# pubmed

# scopus

In [None]:
# save json
# pubmed
# pubmedJson = {
#     'boaw_sr_tf_kmeansPlus': [],
    # 'boaw_sr_tfIdf_kmeansPlus': pubmed_boaw_sr_tfIdf_kmeansPlus,
    # 'boaw_sr_tfIdfIcf_kmeansPlus': pubmed_boaw_sr_tfIdfIcf_kmeansPlus,
    # 'boaw_tfIdf_kmeansPlus': pubmed_boaw_tfIdf_kmeansPlus,
    # 'boaw_tfIdfIcf_kmeansPlus': pubmed_boaw_tfIdfIcf_kmeansPlus,
    # 'bon_sr_tf_kmeansPlus': pubmed_bon_sr_tf_kmeansPlus,
    # 'bon_sr_tfIdf_kmeansPlus': pubmed_bon_sr_tfIdf_kmeansPlus,
    # 'bon_sr_tfIdfIcf_kmeansPlus': pubmed_bon_sr_tfIdfIcf_kmeansPlus,
    # 'bon_tfIdf_kmeansPlus': pubmed_bon_tfIdf_kmeansPlus,
    # 'bon_tfIdfIcf_kmeansPlus': pubmed_bon_tfIdfIcf_kmeansPlus,
    # 'bona_sr_tf_kmeansPlus': pubmed_bona_sr_tf_kmeansPlus,
    # 'bona_sr_tfIdf_kmeansPlus': pubmed_bona_sr_tfIdf_kmeansPlus,
    # 'bona_sr_tfIdfIcf_kmeansPlus': pubmed_bona_sr_tfIdfIcf_kmeansPlus,
    # 'bona_tfIdf_kmeansPlus': pubmed_bona_tfIdf_kmeansPlus,
    # 'bona_tfIdfIcf_kmeansPlus': pubmed_bona_tfIdfIcf_kmeansPlus
    # spher_kmeans
# }
# pubmedJson['boaw_sr_tf_kmeansPlus'].append(pubmed_boaw_sr_tf_kmeansPlus)
# with open('pubmed/clustered.json', 'w', encoding='utf-8') as f:
#     json.dump(pubmedJson, f, indent=4)
# f.close() 

# scopus
# scopus_boaw_sr_tf_kmeansPlus,
# scopus_boaw_sr_tfIdf_kmeansPlus,
# scopus_boaw_sr_tfIdfIcf_kmeansPlus,
# scopus_boaw_tfIdf_kmeansPlus,
# scopus_boaw_tfIdfIcf_kmeansPlus,
# scopus_bon_sr_tf_kmeansPlus,
# scopus_bon_sr_tfIdf_kmeansPlus,
# scopus_bon_sr_tfIdfIcf_kmeansPlus
# scopus_bon_tfIdf_kmeansPlus,
# scopus_bon_tfIdfIcf_kmeansPlus,
# scopus_bona_sr_tf_kmeansPlus,
# scopus_bona_sr_tfIdf_kmeansPlus,
# scopus_bona_sr_tfIdfIcf_kmeansPlus,
# scopus_bona_tfIdf_kmeansPlus,
# scopus_bona_tfIdfIcf_kmeansPlus,

In [None]:
# read json
# pubmed

# scopus


Performance Analysis

In [11]:
# encode class as label number
def encodeClass(classes):
    # encode class
    clsVec = LabelEncoder()
    cls = clsVec.fit_transform(classes)
    # add 1 to all list (to compare with label)
    return cls + 1
pubmedLabel = encodeClass(pubmedClass)
scopusLabel = encodeClass(scopusClass)

# Silhouette Score
def silhouette(documentWeight, clusterLabel):
    score = []
    for k in tqdm(range(len(clusterLabel))):
        silhouetteScore = silhouette_score(documentWeight, clusterLabel[k])
        score.append(silhouetteScore)
    return score

# sx = silhouette(pubmed_bona_sr_tfIdfIcf, kmpx)
# print(sx)

# Purity
def purity(documentLabel, clusterLabel):
    score = []
    for k in tqdm(range(len(clusterLabel))):
        contingencyMatrix = contingency_matrix(documentLabel, clusterLabel[k])
        purityScore = np.sum(np.amax(contingencyMatrix, axis=0)) / np.sum(contingencyMatrix)
        score.append(purityScore)
    return score

# px = purity(pubmedLabel, kmpx)
# print(px)

# AMI
def ami(documentLabel, clusterLabel):
    score = []
    for k in tqdm(range(len(clusterLabel))):
        amiScore = adjusted_mutual_info_score(documentLabel, clusterLabel[k])
        score.append(amiScore)
    return score

# ax = ami(pubmedLabel, kmpx)
# print(ax)

In [12]:
sr_tfIdfIcf = silhouette(pubmed_bon_sr_tfIdfIcf, pubmed_bon_sr_tfIdfIcf_kmeansPlus)

100%|██████████| 9/9 [00:04<00:00,  1.85it/s]


In [13]:
tfIdfIcf = silhouette(pubmed_bon_tfIdfIcf, pubmed_bon_tfIdfIcf_kmeansPlus)

100%|██████████| 9/9 [00:05<00:00,  1.59it/s]


In [16]:
max(sr_tfIdfIcf)

0.0025779042220132027

In [17]:
max(tfIdfIcf)

0.01554582499538646