In [1]:
# init
import pandas as pd
import numpy as np
from tqdm import tqdm
# import nltk
# nltk.download()

ModuleNotFoundError: No module named 'pandas'

Import Data

In [24]:
import glob
import os

pubmed = []

# import pubmed
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt"))

for docPath in docList:
    # get doc file name
    docName = os.path.basename(docPath).split('.')[0]
    
    with open(docPath) as doc:
        # insert [class, docs, feature]
        pubmed.append([docName[:3], docName, doc.read().replace('\n', ' ')])

# print(pubmed)

# make dataframe
dataframe = pd.DataFrame(data=pubmed, columns=['class', 'document', 'features']) 

# export pubmed raw
dataframe.to_csv('pubmed_raw.csv')

Dataframe raw

In [12]:
# read csv
dataRaw = pd.read_csv('pubmed_raw.csv')
# get feature
features = dataRaw.loc[:, 'feature']
dataRaw

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,Reduced amounts of immunoreactive somatostatin...
1,ALZ,ALZ10,Diagnostic criteria for primary neuronal degen...
2,ALZ,ALZ100,14Cacetylcholine synthesis and 14Ccarbon dioxi...
3,ALZ,ALZ1000,The pattern of reading deterioration in dement...
4,ALZ,ALZ101,Cerebral blood flow and metabolic rate of oxyg...
...,...,...,...
3995,HIV,HIV3996,Major histocompatibility complex genes influen...
3996,HIV,HIV3997,HIV 1 infection in a cohort of haemophilic pat...
3997,HIV,HIV3998,Evolution of the definition of AIDS. The main ...
3998,HIV,HIV3999,Human immunodeficiency virus 1 glycoproteins g...


Preprocesing

In [3]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# cleaning
def cleaning(features):
    result = []
    for feature in features:
        regex = re.sub(r'[^a-zA-Z\s]', '', feature)
        result.append(regex)
    return result

# case folding
def caseFolding(features):
    result = []
    for feature in features:
        lower = feature.lower()
        result.append(lower)
    return result

# tokenization
def tokenization(features):
    result = []
    for feature in features:
        token = word_tokenize(feature)
        result.append(token)
    return result

# stopwords removal
def stopWords(features):
    result = []
    stopWords = set(stopwords.words('english'))
    for token in features:
        cleanedFeature = [feature for feature in token if feature not in stopWords]
        result.append(cleanedFeature)
    return result

In [14]:
# cleaning
clean = cleaning(features)
# print(clean)

case = caseFolding(clean)
# print(casefolding)

token = tokenization(case)
# print(tokenization)

preprocessed = stopWords(token)
# print(preprocessedFeature)

# export pubmed clean
for i in range(len(preprocessed)):
    dataRaw.loc[i, 'feature'] = ' '.join(preprocessed[i])
dataRaw.to_csv('pubmed_clean.csv', index=False)

In [15]:
# read csv
dataClean = pd.read_csv('pubmed_clean.csv')
# get feature
features = dataClean.loc[:, 'feature']
dataClean

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,reduced amounts immunoreactive somatostatin te...
1,ALZ,ALZ10,diagnostic criteria primary neuronal degenerat...
2,ALZ,ALZ100,cacetylcholine synthesis ccarbon dioxide produ...
3,ALZ,ALZ1000,pattern reading deterioration dementia alzheim...
4,ALZ,ALZ101,cerebral blood flow metabolic rate oxygen gluc...
...,...,...,...
3995,HIV,HIV3996,major histocompatibility complex genes influen...
3996,HIV,HIV3997,hiv infection cohort haemophilic patients cour...
3997,HIV,HIV3998,evolution definition aids main classifications...
3998,HIV,HIV3999,human immunodeficiency virus glycoproteins gp ...


Feature Forming

In [27]:
from textblob import TextBlob

# BOAW
dataClean.rename(columns={'feature': 'BOAW'}, inplace=True)

for i in tqdm(range(len(features))):
    # BON
    dataClean.loc[i,'BON'] = ' '.join(TextBlob(features[i]).noun_phrases)
    # BONA
    dataClean.loc[i,'BONA'] = ' '.join([word for (word, tag) in TextBlob(features[i]).tags if tag[:2]=='NN' or tag[:2]=='JJ'])

# print(dataClean)

dataClean.to_csv('pubmed_formed.csv', index=False)

100%|██████████| 4000/4000 [00:42<00:00, 94.40it/s] 


In [7]:
# read csv
dataFormed = pd.read_csv('pubmed_formed.csv')
# get features
classes = dataFormed.loc[:, 'class']
boaw = dataFormed.loc[:, 'BOAW']
bon = dataFormed.loc[:, 'BON']
bona = dataFormed.loc[:, 'BONA']
dataFormed

Unnamed: 0,class,document,BOAW,BON,BONA
0,ALZ,ALZ1,reduced amounts immunoreactive somatostatin te...,amounts immunoreactive somatostatin temporal c...,reduced amounts immunoreactive somatostatin te...
1,ALZ,ALZ10,diagnostic criteria primary neuronal degenerat...,diagnostic criteria primary neuronal degenerat...,diagnostic criteria primary neuronal degenerat...
2,ALZ,ALZ100,cacetylcholine synthesis ccarbon dioxide produ...,cacetylcholine synthesis ccarbon dioxide produ...,cacetylcholine synthesis ccarbon dioxide produ...
3,ALZ,ALZ1000,pattern reading deterioration dementia alzheim...,pattern reading deterioration dementia alzheim...,pattern deterioration dementia alzheimer type ...
4,ALZ,ALZ101,cerebral blood flow metabolic rate oxygen gluc...,cerebral blood flow metabolic rate oxygen gluc...,cerebral blood flow metabolic rate oxygen gluc...
...,...,...,...,...,...
3995,HIV,HIV3996,major histocompatibility complex genes influen...,major histocompatibility complex genes outcome...,major histocompatibility complex genes influen...
3996,HIV,HIV3997,hiv infection cohort haemophilic patients cour...,hiv infection cohort haemophilic patients cour...,hiv infection cohort haemophilic patients cour...
3997,HIV,HIV3998,evolution definition aids main classifications...,evolution definition aids main classifications...,evolution definition aids main classifications...
3998,HIV,HIV3999,human immunodeficiency virus glycoproteins gp ...,human immunodeficiency virus glycoproteins gp ...,human immunodeficiency virus glycoproteins gp ...


Term Weighting

In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# tokenization
bona = tokenization(bona)

# TF
def tf(features):
    tf = []
    for feature in features:
        tfVec = CountVectorizer()
        result = tfVec.fit_transform(feature).todense()
        tf.append(result)
    return tf

tfx = tf(bona)
print(tfx)

# TF-IDF
def tf_idf(features):
    tfIdf = []
    for feature in features:
        tfIdfVec = TfidfVectorizer()
        result = tfIdfVec.fit_transform(feature).todense()
        tfIdf.append(result)
    return tfIdf

# idfx = tf_idf(features)
# print(idfx)

# TF-IDF-ICF
def icf(features, classes):
  icf = []
  C = []

  # count class
  for i in classes:
    if i not in C:
      C.append(i)

  # count term
  for feature in features:
    for i in C:
      if 
    
  
  for word, val in idfDict.items():
      idfDict[word] = math.log(N / float(val))
  
  return icf

def tf_idf_icf(features):
    tfIdfIcf = []
    tfIdf = tf_idf(features)

    for word, val in tfIdf.items():
        tfIdfIcf[word] = val * icf[word]
        break
    return tfIdfIcf

icfx = tf_idf_icf(features)
print(icfx)

ModuleNotFoundError: No module named 'sklearn'

Feature Selection

Clustering

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline