In [15]:
# init
import glob
import os
import csv
import string
import re
import numpy as np
import pandas as pd
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
%matplotlib inline

Import Data

In [55]:
header = ['class', 'document', 'feature']
pubmedClass = []
pubmedDoc = [] 
pubmedFeature = []

# import pubmed
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt"))

for docPath in docList:
    docName = os.path.basename(docPath).split('.')[0]
    pubmedDoc.append(docName)
    pubmedClass.append(docName[:3])
    
    with open(docPath) as doc:
        pubmedFeature.append(doc.read().replace('\n', ' '))

# print(pubmed)

# export csv
def exportCsv(classes, docs, features, dataset, type):
    with open(f'{dataset}_{type}.csv', 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        for i in range(0, len(features)):
                writer.writerow([classes[i], docs[i], ' '.join(features[i])])

In [46]:
# export pubmed raw
exportCsv(pubmedClass, pubmedDoc, pubmedFeature, 'pubmed', 'raw')

Dataframe raw

In [65]:
# read csv
dataRaw = pd.read_csv('pubmed_raw.csv')
dataRaw.head()

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,Reduced amounts of immunoreactive somatostatin...
1,ALZ,ALZ10,Diagnostic criteria for primary neuronal degen...
2,ALZ,ALZ100,14Cacetylcholine synthesis and 14Ccarbon dioxi...
3,ALZ,ALZ1000,The pattern of reading deterioration in dement...
4,ALZ,ALZ101,Cerebral blood flow and metabolic rate of oxyg...


Preprocesing

In [66]:

dataRaw['feature'] = dataRaw['feature'].apply(lambda x: re.sub(r'[^a-zA-Z\s]','',x))
dataRaw

features = dataRaw.loc[:, 'feature']

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,Reduced amounts of immunoreactive somatostatin...
1,ALZ,ALZ10,Diagnostic criteria for primary neuronal degen...
2,ALZ,ALZ100,Cacetylcholine synthesis and Ccarbon dioxide p...
3,ALZ,ALZ1000,The pattern of reading deterioration in dement...
4,ALZ,ALZ101,Cerebral blood flow and metabolic rate of oxyg...
...,...,...,...
3995,HIV,HIV3996,Major histocompatibility complex genes influen...
3996,HIV,HIV3997,HIV infection in a cohort of haemophilic pati...
3997,HIV,HIV3998,Evolution of the definition of AIDS The main c...
3998,HIV,HIV3999,Human immunodeficiency virus glycoproteins gp...


In [73]:
# punctuation removal
punctuation = []
for feature in features:
    regex = re.sub(r'[^a-zA-Z\s]', '', feature)
    punctuation.append(regex)

# print(punctuation)

# case folding
casefolding = []
for feature in punctuation:
    lower = feature.lower()
    casefolding.append(lower)

# print(casefolding)

# tokenization
tokenization = []
for feature in casefolding:
    token = word_tokenize(feature)
    tokenization.append(token)

# print(tokenization)

# stopwords removal
preprocessedFeature = []
stopWords = set(stopwords.words('english'))

for token in tokenization:
    cleanedFeature = []
    for feature in token:
        if feature not in stopWords:
            cleanedFeature.append(feature)
    preprocessedFeature.append(cleanedFeature)

# print(preprocessedFeature)

# export pubmed clean
exportCsv(pubmedClass, pubmedDoc, preprocessedFeature, 'pubmed', 'clean')

In [74]:
# read csv
dataClean = pd.read_csv('pubmed_clean.csv')
dataClean

features = dataClean.loc[:, 'feature']

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,reduced amounts immunoreactive somatostatin te...
1,ALZ,ALZ10,diagnostic criteria primary neuronal degenerat...
2,ALZ,ALZ100,cacetylcholine synthesis ccarbon dioxide produ...
3,ALZ,ALZ1000,pattern reading deterioration dementia alzheim...
4,ALZ,ALZ101,cerebral blood flow metabolic rate oxygen gluc...
...,...,...,...
3995,HIV,HIV3996,major histocompatibility complex genes influen...
3996,HIV,HIV3997,hiv infection cohort haemophilic patients cour...
3997,HIV,HIV3998,evolution definition aids main classifications...
3998,HIV,HIV3999,human immunodeficiency virus glycoproteins gp ...


Feature Forming

In [76]:
# text blob
for i in tqdm(range(len(features))):
    # BON
    dataClean.loc[i,'BON'] = ' '.join(TextBlob(features[i]).noun_phrases)
    # BONA
    dataClean.loc[i,'BONA'] = ' '.join([word for (word, tag) in TextBlob(features[i]).tags if tag[:2]=='JJ'])

# print(dataClean)

# export pubmed formed
exportCsv(pubmedClass, pubmedDoc, preprocessedFeature, 'pubmed', 'formed')

100%|██████████| 4000/4000 [00:43<00:00, 91.79it/s] 


In [80]:
# read csv
dataFormed = pd.read_csv('pubmed_formed.csv')
dataFormed

features = dataFormed.loc[:, 'feature']

Term Weighting

In [94]:
# tokenization
tokenization = []
for feature in features:
    token = word_tokenize(feature)
    tokenization.append(token)

# TF
def tf(features):
    tf = []
    for feature in features:
        vectorizer = CountVectorizer()
        result = vectorizer.fit_transform(feature).todense()
        tf.append(result)
    return tf

# tfx = tf(tokenization)
# print(tfx)

# TF-IDF
def tf_idf(features):
    tfIdf = []
    for feature in features:
        vectorizer = TfidfVectorizer()
        result = vectorizer.fit_transform(feature).todense()
        tfIdf.append(result)
    return tfIdf

# idfx = tf_idf(tokenization)
# print(idfx)

# TF-IDF-ICF
def tf_idf_icf(features):
    tfIdfIcf = []
    tfIdf = tf_idf(features)

    for feature in features:
        
    return tfIdfIcf

icfx = tf_idf_icf(tokenization)
print(icfx)

[matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), mat