In [2]:
# init
import glob
import os
import csv
import string
import numpy as np
import pandas as pd
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
%matplotlib inline

Import Data

In [3]:
header = ['class', 'document', 'feature']
pubmedClass = []
pubmedDoc = [] 
pubmedFeature = []

# import pubmed
docList = glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt"))

for docPath in docList:
    docName = os.path.basename(docPath).split('.')[0]
    pubmedDoc.append(docName)
    pubmedClass.append(docName[:3])
    with open(docPath) as doc:
        pubmedFeature.append(doc.read().replace('\n', ' '))

# print(pubmed)

# export csv
def exportCsv(classes, docs, features, fileName):
    with open(f'{fileName}.csv', 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for i in range(0, len(features)):
            writer.writerow([classes[i], docs[i], features[i]])

In [None]:
# export pubmed raw
exportCsv(pubmedClass, pubmedDoc, pubmedFeature, 'pubmed_raw')

Dataframe raw

In [7]:
# read csv
dataRaw = pd.read_csv('pubmed_raw.csv')
dataRaw.head()

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,Reduced amounts of immunoreactive somatostatin...
1,ALZ,ALZ10,Diagnostic criteria for primary neuronal degen...
2,ALZ,ALZ100,14Cacetylcholine synthesis and 14Ccarbon dioxi...
3,ALZ,ALZ1000,The pattern of reading deterioration in dement...
4,ALZ,ALZ101,Cerebral blood flow and metabolic rate of oxyg...


Preprocesing

In [8]:
features = dataRaw.loc[:, 'feature']

# punctuation removal
punctuation = []
for feature in features:
    translate = feature.translate(str.maketrans('', '', string.punctuation))
    punctuation.append(translate)

# print(punctuation)

# case folding
casefolding = []
for feature in punctuation:
    lower = feature.lower()
    casefolding.append(lower)

# print(casefolding)

# tokenization
tokenization = []
for feature in casefolding:
    token = word_tokenize(feature)
    tokenization.append(token)

# print(tokenization)

# stopwords removal
preprocessedFeature = []
stopWords = set(stopwords.words('english'))

for token in tokenization:
    cleanedFeature = []
    for feature in token:
        if feature not in stopWords:
            cleanedFeature.append(feature)
    preprocessedFeature.append(cleanedFeature)

# print(preprocessedFeature)

# export pubmed clean
exportCsv(pubmedClass, pubmedDoc, preprocessedFeature, 'pubmed_clean')

In [9]:
# read csv
dataClean = pd.read_csv('pubmed_clean.csv')
dataClean.head()

Unnamed: 0,class,document,feature
0,ALZ,ALZ1,"['reduced', 'amounts', 'immunoreactive', 'soma..."
1,ALZ,ALZ10,"['diagnostic', 'criteria', 'primary', 'neurona..."
2,ALZ,ALZ100,"['14cacetylcholine', 'synthesis', '14ccarbon',..."
3,ALZ,ALZ1000,"['pattern', 'reading', 'deterioration', 'demen..."
4,ALZ,ALZ101,"['cerebral', 'blood', 'flow', 'metabolic', 'ra..."


Feature Forming

In [14]:
features = dataClean.loc[:, 'feature']

# POS tag
tagged = nltk.pos_tag(features)

# BOAW
def boaw(features):
    vectorizer = CountVectorizer()
    boaw = vectorizer.fit_transform(features).todense()
    return boaw

boaw(tagged)

# BON
def bon(features):
    list = []
    for (text, tag) in features:
        if tag == 'NOUN':
            list.append(text)

    vectorizer = CountVectorizer()
    bon = vectorizer.fit_transform(list).todense()
    return bon

# BONA
def bona(features):
    list = []
    for (text, tag) in features:
        if tag == 'NOUN' or tag == 'ADJ':
            list.append(text)

    vectorizer = CountVectorizer()
    bona = vectorizer.fit_transform(list).todense()
    return bona

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\ZephZ/nltk_data'
    - 'd:\\Downloads\\Anaconda\\nltk_data'
    - 'd:\\Downloads\\Anaconda\\share\\nltk_data'
    - 'd:\\Downloads\\Anaconda\\lib\\nltk_data'
    - 'C:\\Users\\ZephZ\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


Term Weighting

In [None]:
# TF
def tf(features):
    vectorizer = CountVectorizer()
    tf = vectorizer.fit_transform(features).todense()
    return tf

# TF-IDF
def tf_idf(features):
    vectorizer = TfidfVectorizer()
    tfIdf = vectorizer.fit_transform(features)
    return tfIdf

# TF-IDF-ICF
def tf_idf_icf(features):
    tfIdf = tf_idf(features)
    # tfIdfIcf = tfIdf * icf
    # return tfIdfIcf
