In [None]:
# init
import glob
import os
import csv
import string
import numpy as np
import pandas as pd
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
%matplotlib inline

Import Data

In [8]:
# Column names written as the first row of every exported CSV
# (read by exportCsv() at call time).
header = ['class', 'document', 'feature']
pubmedClass = []
pubmedDoc = []
pubmedFeature = []

# import pubmed
# glob returns paths in arbitrary, OS-dependent order; sort them so the row
# order of the exported CSVs (and everything derived from it) is reproducible.
docList = sorted(glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt")))

for docPath in docList:
    # The file name up to its first '.' is the document id; the first three
    # characters of that id are used as the class label.
    docName = os.path.basename(docPath).split('.')[0]
    pubmedDoc.append(docName)
    pubmedClass.append(docName[:3])
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used here while the export is written as UTF8 - confirm the corpus encoding.
    with open(docPath) as doc:
        # Collapse the document onto one line: newlines become spaces.
        pubmedFeature.append(doc.read().replace('\n', ' '))

# export csv
def exportCsv(classes, docs, features, fileName):
    """Write the three parallel sequences to ``<fileName>.csv``.

    The first row is the module-level ``header`` list. It is followed by one
    row ``[classes[i], docs[i], features[i]]`` for every index ``i`` of
    ``features``. The file is opened in write mode, so an existing file with
    the same name is overwritten.
    """
    target = f'{fileName}.csv'
    with open(target, 'w', encoding='UTF8', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        # Generator keeps the row-by-row emission order of an explicit loop.
        sheet.writerows(
            [classes[idx], docs[idx], features[idx]]
            for idx in range(len(features))
        )

In [None]:
# Persist the untouched corpus as pubmed_raw.csv (one row per document)
exportCsv(
    classes=pubmedClass,
    docs=pubmedDoc,
    features=pubmedFeature,
    fileName='pubmed_raw',
)

Dataframe raw

In [None]:
# Load the raw export back into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='pubmed_raw.csv')
data.head(n=5)

Preprocessing

In [9]:
# Work on the text column only. read_csv turns empty cells into NaN (a float),
# which would crash the string methods below, so coerce those back to ''.
features = data.loc[:, 'feature'].fillna('').astype(str)

# punctuation removal
# Build the translation table once instead of once per document.
punctuationTable = str.maketrans('', '', string.punctuation)
punctuation = [feature.translate(punctuationTable) for feature in features]

# case folding
casefolding = [feature.lower() for feature in punctuation]

# tokenization
tokenization = [word_tokenize(feature) for feature in casefolding]

# stopwords removal
stopWords = set(stopwords.words('english'))
# Keep the tokens grouped per document so a cleaned text can be rebuilt for
# every row of the export.
cleanTokens = [
    [token for token in tokens if token not in stopWords]
    for tokens in tokenization
]

# Flat list of every surviving token across all documents, in document order
# (the shape nltk.pos_tag expects in the Feature Forming cell).
preprocessed = [token for tokens in cleanTokens for token in tokens]

# One cleaned, space-joined string per document, aligned with pubmedClass and
# pubmedDoc.
cleanFeature = [' '.join(tokens) for tokens in cleanTokens]

# export pubmed clean
# Export the cleaned text; previously the raw pubmedFeature was written here,
# making pubmed_clean.csv identical to pubmed_raw.csv.
exportCsv(pubmedClass, pubmedDoc, cleanFeature, 'pubmed_clean')

Feature Forming

In [None]:
# POS tag
# Request the Universal tagset so tags come back as 'NOUN', 'ADJ', ... - the
# values the bon()/bona() filters compare against. The default tagset is Penn
# Treebank ('NN', 'JJ', ...), which those comparisons never match.
# Needs the NLTK 'universal_tagset' resource: nltk.download('universal_tagset').
tagged = nltk.pos_tag(preprocessed, tagset='universal')

# BOAW
def boaw(features):
    """Return the dense term-count matrix over all words in ``features``.

    ``features`` is passed unchanged to a default ``CountVectorizer``; the
    fitted sparse counts are converted with ``.todense()`` before returning.
    """
    counts = CountVectorizer().fit_transform(features)
    return counts.todense()


# BON
def bon(features):
    """Build a bag-of-nouns count matrix from POS-tagged tokens.

    Parameters
    ----------
    features : iterable of (str, str)
        ``(token, tag)`` pairs, e.g. the output of ``nltk.pos_tag``.

    Returns
    -------
    The dense matrix from ``CountVectorizer().fit_transform(...).todense()``.
    Each kept noun is handed to the vectorizer as its own one-word document,
    so the matrix has one row per kept token.

    Raises
    ------
    ValueError
        From ``CountVectorizer`` when no noun is found (empty vocabulary).
    """
    # Accept the Universal tag ('NOUN') as well as the Penn Treebank noun tags
    # (NN, NNS, NNP, NNPS) that nltk.pos_tag emits by default; comparing with
    # 'NOUN' alone matched nothing on default-tagged input.
    nouns = [
        text for (text, tag) in features
        if tag == 'NOUN' or tag.startswith('NN')
    ]

    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(nouns).todense()

# BONA
def bona(features):
    """Build a bag-of-nouns-and-adjectives count matrix from POS-tagged tokens.

    Parameters
    ----------
    features : iterable of (str, str)
        ``(token, tag)`` pairs, e.g. the output of ``nltk.pos_tag``.

    Returns
    -------
    The dense matrix from ``CountVectorizer().fit_transform(...).todense()``.
    Each kept token is handed to the vectorizer as its own one-word document,
    so the matrix has one row per kept token.

    Raises
    ------
    ValueError
        From ``CountVectorizer`` when nothing is kept (empty vocabulary).
    """
    # Accept the Universal tags ('NOUN', 'ADJ') as well as the Penn Treebank
    # equivalents (NN* for nouns, JJ* for adjectives) that nltk.pos_tag emits
    # by default; the Universal names alone matched nothing on default output.
    kept = [
        text for (text, tag) in features
        if tag in ('NOUN', 'ADJ') or tag.startswith(('NN', 'JJ'))
    ]

    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(kept).todense()

Term Weighting

In [None]:
# TF
def tf(features):
    """Return raw term frequencies for ``features`` as a dense matrix.

    A default ``CountVectorizer`` is fitted on ``features`` and its sparse
    count matrix is converted with ``.todense()``.
    """
    termCounts = CountVectorizer().fit_transform(features)
    return termCounts.todense()

# TF-IDF
def tf_idf(features):
    """Return the TF-IDF weights of ``features``.

    The result is whatever a default ``TfidfVectorizer().fit_transform``
    returns; unlike ``tf`` it is not converted with ``.todense()``.
    """
    return TfidfVectorizer().fit_transform(features)

# TF-IDF-ICF
def tf_idf_icf(features, classes=None):
    """Weight ``features`` by TF-IDF multiplied by inverse class frequency.

    The ICF factor of term ``t`` is ``1 + ln(C / cf(t))`` where ``C`` is the
    number of distinct labels in ``classes`` and ``cf(t)`` is the number of
    those classes with at least one document containing ``t``.
    NOTE(review): several ICF definitions exist in the literature - confirm
    this is the intended one.

    Parameters
    ----------
    features : iterable of str
        Documents, forwarded to ``tf_idf``.
    classes : sequence, optional
        One class label per document, aligned with ``features``. When omitted
        every document counts as the same class, so each ICF factor is 1 and
        the plain TF-IDF matrix is returned.

    Returns
    -------
    Sparse matrix with the same shape as ``tf_idf(features)``.

    Raises
    ------
    ValueError
        If ``classes`` does not hold exactly one label per document.
    """
    tfIdf = tf_idf(features)
    if classes is None:
        return tfIdf

    labels = np.asarray(classes)
    if labels.ndim != 1 or labels.shape[0] != tfIdf.shape[0]:
        raise ValueError('classes must contain exactly one label per document')

    uniqueLabels = np.unique(labels)
    # classFreq[t] = number of classes in which term t occurs at least once.
    classFreq = np.zeros(tfIdf.shape[1])
    for label in uniqueLabels:
        rows = np.flatnonzero(labels == label)
        # TF-IDF weights are non-negative, so a positive column sum means the
        # term is present in at least one document of this class.
        classFreq += np.asarray(tfIdf[rows].sum(axis=0)).ravel() > 0

    # Guard against a zero count so the division below is always defined.
    icf = 1.0 + np.log(len(uniqueLabels) / np.maximum(classFreq, 1.0))

    # Scale every term column by its ICF factor (row vector broadcasts over rows).
    tfIdfIcf = tfIdf.multiply(icf.reshape(1, -1)).tocsr()
    return tfIdfIcf
