In [4]:
# init
import pandas as pd
import numpy as np
from tqdm import tqdm
# import nltk
# nltk.download()

Import Data

In [None]:
import glob
import os

pubmed = []

# import pubmed
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt"))

for docPath in docList:
    # get doc file name
    docName = os.path.basename(docPath).split('.')[0]
    
    with open(docPath) as doc:
        # insert [class, docs, feature]
        pubmed.append([docName[:3], docName, doc.read().replace('\n', ' ')])

# print(pubmed)

# make dataframe
dataframe = pd.DataFrame(data=pubmed, columns=['class', 'document', 'feature']) 

# export pubmed raw
dataframe.to_csv('pubmed/raw.csv', index=False)

Dataframe raw

In [None]:
# read csv
dataRaw = pd.read_csv('pubmed/raw.csv')
# get feature
features = dataRaw.loc[:, 'feature']
dataRaw

Preprocesing

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# cleaning
def cleaning(features):
    result = []
    for feature in features:
        regex = re.sub(r'[^a-zA-Z\s]', '', feature)
        result.append(regex)
    return result

# case folding
def caseFolding(features):
    result = []
    for feature in features:
        lower = feature.lower()
        result.append(lower)
    return result

# tokenization
def tokenization(features):
    result = []
    for feature in features:
        token = word_tokenize(feature)
        result.append(token)
    return result

# stopwords removal
def stopWords(features):
    result = []
    stopWords = set(stopwords.words('english'))
    for token in features:
        cleanedFeature = [feature for feature in token if feature not in stopWords]
        result.append(cleanedFeature)
    return result

In [None]:
# preprocessing
def preprocessing(features):
    clean = cleaning(features)
    case = caseFolding(clean)
    token = tokenization(case)
    preprocessed = stopWords(token)
    
preprocessed = preprocessing(features)
# print(preprocessed)

# export pubmed clean
for i in range(len(preprocessed)):
    dataRaw.loc[i, 'feature'] = ' '.join(preprocessed[i])
dataRaw.to_csv('pubmed/clean.csv', index=False)

In [None]:
# read csv
dataClean = pd.read_csv('pubmed/clean.csv')
# get feature
features = dataClean.loc[:, 'feature']
dataClean

Feature Forming

In [None]:
from textblob import TextBlob

# BOAW
dataClean.rename(columns={'feature': 'BOAW'}, inplace=True)

for i in tqdm(range(len(features))):
    # BON
    dataClean.loc[i,'BON'] = ' '.join(TextBlob(features[i]).noun_phrases)
    # BONA
    dataClean.loc[i,'BONA'] = ' '.join([word for (word, tag) in TextBlob(features[i]).tags if tag[:2]=='NN' or tag[:2]=='JJ'])

# print(dataClean)

dataClean.to_csv('pubmed/formed.csv', index=False)

In [3]:
# read csv
dataFormed = pd.read_csv('pubmed/formed.csv')
# get features
classes = dataFormed.loc[:, 'class']
boaw = dataFormed.loc[:, 'BOAW']
bon = dataFormed.loc[:, 'BON']
bona = dataFormed.loc[:, 'BONA']
dataFormed

Unnamed: 0,class,document,BOAW,BON,BONA
0,ALZ,ALZ1,reduced amounts immunoreactive somatostatin te...,amounts immunoreactive somatostatin temporal c...,reduced amounts immunoreactive somatostatin te...
1,ALZ,ALZ10,diagnostic criteria primary neuronal degenerat...,diagnostic criteria primary neuronal degenerat...,diagnostic criteria primary neuronal degenerat...
2,ALZ,ALZ100,cacetylcholine synthesis ccarbon dioxide produ...,cacetylcholine synthesis ccarbon dioxide produ...,cacetylcholine synthesis ccarbon dioxide produ...
3,ALZ,ALZ1000,pattern reading deterioration dementia alzheim...,pattern reading deterioration dementia alzheim...,pattern deterioration dementia alzheimer type ...
4,ALZ,ALZ101,cerebral blood flow metabolic rate oxygen gluc...,cerebral blood flow metabolic rate oxygen gluc...,cerebral blood flow metabolic rate oxygen gluc...
...,...,...,...,...,...
3995,HIV,HIV3996,major histocompatibility complex genes influen...,major histocompatibility complex genes outcome...,major histocompatibility complex genes influen...
3996,HIV,HIV3997,hiv infection cohort haemophilic patients cour...,hiv infection cohort haemophilic patients cour...,hiv infection cohort haemophilic patients cour...
3997,HIV,HIV3998,evolution definition aids main classifications...,evolution definition aids main classifications...,evolution definition aids main classifications...
3998,HIV,HIV3999,human immunodeficiency virus glycoproteins gp ...,human immunodeficiency virus glycoproteins gp ...,human immunodeficiency virus glycoproteins gp ...


Term Weighting

In [10]:
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF
def tf(features):
    # get tf weights
    tfVec = CountVectorizer()
    result = tfVec.fit_transform(features)

    # define weights into dataframe
    featureName = tfVec.get_feature_names_out()
    featureWeight = result.todense().tolist()
    df = pd.DataFrame(featureWeight, columns=featureName)
    return df

    # # counting weights into dictionary
    # tf = dict.fromkeys(featureName, 0)
    # for word in tqdm(featureName):
    #     arr = np.array(df.loc[:, word])
    #     val = np.sum(arr)
    #     tf[word] = val
    # return tf

# tfx = tf(bona)
# print(tfx)

# TF-IDF
def tf_idf(features):
    # get tf-idf weights
    tfIdfVec = TfidfVectorizer()
    result = tfIdfVec.fit_transform(features)

    # define weights into dataframe
    featureName = tfIdfVec.get_feature_names_out()
    featureWeight = result.todense().tolist()
    df = pd.DataFrame(featureWeight, columns=featureName)
    return df
    
    # # counting weights into dictionary
    # tfIdf = dict.fromkeys(featureName, 0)
    # for word in tqdm(featureName):
    #     arr = np.array(df.loc[:, word])
    #     val = np.sum(arr)
    #     tfIdf[word] = val
    # return tfIdf

# idfx = tf_idf(bona)
# print(idfx)

# TF-IDF-ICF
def icf(word, features):
    classTotal = []
    classTerm = []

    # count class 
    for i in classes:
        if i not in classTotal:
            classTotal.append(i)

    # count class term
    for i in range(len(features)):
        if word in features[i]:
            if classes[i] not in classTerm:
                classTerm.append(classes[i])
  
    # count icf
    icf = math.log(len(classTotal) / float(len(classTerm)))
    return icf

def tf_idf_icf(features):
    # count tf-idf
    df = tf_idf(features)

    for col in tqdm(df.columns):
        icfResult = icf(col, features)
        for row in df.iterrows():
            df.loc[row[0], col] = df.loc[row[0], col] * icfResult
    return df

    # # counting weights into dictionary
    # tfIdfIcf = dict.fromkeys(list(tfIdf.keys()), 0)
    # for word, val in tqdm(tfIdf.items()):
    #     tfIdfIcf[word] = val * icf(word, features)
    # return tfIdfIcf

# icfx = tf_idf_icf(bona)
# print(icfx)

Feature Selection

In [101]:
# from textvec.vectorizers import TfrfVectorizer

# TF-RF
# tfRfVec = TfrfVectorizer()
# tfRfVec.fit()

def rf(word, features, wordClass):
    positive = 0
    negative = 0

    for i in range(len(features)):
        if word in features[i]:
            # count docs contain term in class C
            if classes[i] == wordClass:
                positive += 1
            # count docs contain term not in class C
            else :
                negative += 1
    
    # count rf
    rf = math.log2(2 + (positive / np.maximum(1, negative)))
    return rf

def tf_rf(features):
    # count tf
    df = tf(features)

    for col in tqdm(df.columns):
        for row in df.iterrows():
            rfResult = rf(col, features, classes[row[0]])
            df.loc[row[0], col] = df.loc[row[0], col] * rfResult
    return df

    # tfRf = dict.fromkeys(list(tf.keys()), 0)
    # for word, val in tqdm(tf.items()):
    #     tfRf[word] = val * rf(word, features)
    # return tfRf

classes = ['A', 'A', 'A', 'A', 'B', 'C']
data = ['system system information management finance', 'system system information finance', 'system information logistic', 'medical information system', 'system computer education in management school', 'system computer and management']

rfx = tf_rf(data)
rfx.loc[:, 'class'] = classes
print(rfx)

100%|██████████| 11/11 [00:00<00:00, 242.22it/s]

        and  computer  education  finance        in  information  logistic  \
0  0.000000  0.000000   0.000000        2  0.000000     2.584963  0.000000   
1  0.000000  0.000000   0.000000        2  0.000000     2.584963  0.000000   
2  0.000000  0.000000   0.000000        0  0.000000     2.584963  1.584963   
3  0.000000  0.000000   0.000000        0  0.000000     2.584963  0.000000   
4  0.000000  1.584963   1.584963        0  1.169925     0.000000  0.000000   
5  1.584963  1.584963   0.000000        0  0.000000     0.000000  0.000000   

   management   medical    school    system class  
0    1.321928  0.000000  0.000000  4.000000     A  
1    0.000000  0.000000  0.000000  4.000000     A  
2    0.000000  0.000000  0.000000  2.000000     A  
3    0.000000  1.584963  0.000000  2.000000     A  
4    1.321928  0.000000  1.584963  1.137504     B  
5    1.321928  0.000000  0.000000  1.137504     C  





In [102]:
mean = rfx.groupby('class').mean()
mean

Unnamed: 0_level_0,and,computer,education,finance,in,information,logistic,management,medical,school,system
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A,0.0,0.0,0.0,1.0,0.0,2.584963,0.396241,0.330482,0.396241,0.0,3.0
B,0.0,1.584963,1.584963,0.0,1.169925,0.0,0.0,1.321928,0.0,1.584963,1.137504
C,1.584963,1.584963,0.0,0.0,0.0,0.0,0.0,1.321928,0.0,0.0,1.137504


In [103]:
diff = mean.max()
diff

and            1.584963
computer       1.584963
education      1.584963
finance        1.000000
in             1.169925
information    2.584963
logistic       0.396241
management     1.321928
medical        0.396241
school         1.584963
system         3.000000
dtype: float64

In [104]:
(mean/diff).sum()

and            1.000000
computer       2.000000
education      1.000000
finance        1.000000
in             1.000000
information    1.000000
logistic       1.000000
management     2.250000
medical        1.000000
school         1.000000
system         1.758336
dtype: float64

In [10]:
rfx.to_csv('rfrf.csv', index=False)

Clustering

In [None]:
from sklearn.cluster import KMeans
from spherecluster import SphericalKMeans
from matplotlib import pyplot as plt
%matplotlib inline

# K-Means++
# kmp = KMeans(n_clusters=n)

# Spherical K-Means
# skm = SphericalKMeans(n_clusters=n)
# skm.fit(X)

# skm.cluster_centers_
# skm.labels_
# skm.inertia_

Performance Analysis

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_mutual_info_score

# Silhouette Score
# silhouette_score(labels_true, labels_pred)

# Purity
# contingency_matrix(labels_true, labels_pred)

# AMI
# adjusted_mutual_info_score(labels_true, labels_pred)