In [1]:
# init
import glob
import os
import csv
import string
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt
%matplotlib inline

Import Data

In [None]:
# Build the PubMed corpus table: one row per .txt file under Datasets/pubmed/,
# holding the class label, the document name and the document text, then
# export it to pubmed.csv.
header = ['class', 'document', 'feature']
pubmed = []

# import pubmed
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# row order of pubmed.csv stable across runs and machines.
docList = sorted(glob.glob(os.path.join(os.getcwd(), "Datasets/pubmed/", "*.txt")))

for docPath in docList:
    # document name = file name up to its first dot
    docName = os.path.basename(docPath).split('.')[0]
    # the first three characters of the document name are used as the class label
    className = docName[:3]
    # Read with an explicit encoding so the result does not depend on the
    # platform locale; the CSV below is written as UTF-8 as well.
    # NOTE(review): assumes the source files are UTF-8 (or ASCII) — confirm.
    with open(docPath, encoding='UTF8') as doc:
        # replace line breaks with spaces so the text is stored as one line
        pubmed.append([className, docName, doc.read().replace('\n', ' ')])

# export csv pubmed
with open('pubmed.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # header
    writer.writerow(header)
    # row
    writer.writerows(pubmed)

Dataframe

In [None]:
# read csv
# keep_default_na=False stops pandas from converting empty fields (and strings
# such as "NA" or "null") into float NaN, so an empty document stays an empty
# string and the str methods applied to 'feature' later do not fail on it.
data = pd.read_csv('pubmed.csv', keep_default_na=False)
data.head()

Preprocessing

In [None]:
# Text of every document, taken from the 'feature' column.
features = data.loc[:, 'feature']

# punctuation removal: delete every character listed in string.punctuation
punctuation = [
    text.translate(str.maketrans('', '', string.punctuation))
    for text in features
]

# case folding: lower-case each document
casefolding = [text.lower() for text in punctuation]

# tokenization: split each document into a list of word tokens
tokenization = [word_tokenize(text) for text in casefolding]

# stopwords removal: drop English stop words; the surviving tokens of all
# documents are collected, in order, into one flat list
stopWords = set(stopwords.words('english'))
preprocessed = [
    word
    for doc_tokens in tokenization
    for word in doc_tokens
    if word not in stopWords
]

print(preprocessed)

Feature Forming

In [None]:
# BOAW
# NOTE(review): `preprocessed` is a flat list of tokens, so CountVectorizer
# treats every token as its own document (one matrix row per token) — confirm
# this is intended rather than one row per source document.
vectorizer = CountVectorizer()
boaw = vectorizer.fit_transform(preprocessed).todense()
print(boaw)
# The vocabulary lives on the fitted vectorizer, not on the dense matrix.
# get_feature_names_out() replaced get_feature_names() in scikit-learn 1.0
# (the old name was removed in 1.2), so fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    boaw_features = vectorizer.get_feature_names_out()
else:
    boaw_features = vectorizer.get_feature_names()
print(boaw_features)

# POS tag
# Without a `tagset` argument nltk.pos_tag returns Penn Treebank tags
# ('NN', 'NNS', 'JJ', ...), never the universal names 'NOUN' / 'ADJ', so the
# filters below match on the Penn Treebank tag prefixes instead.
tagged = nltk.pos_tag(preprocessed)

# BON: the words (not the tags) whose tag is a noun tag (NN, NNS, NNP, NNPS)
bon = [text for (text, tag) in tagged if tag.startswith('NN')]

# BONA: the words tagged as noun or adjective (JJ, JJR, JJS)
bona = [text for (text, tag) in tagged if tag.startswith(('NN', 'JJ'))]