# 1. Preprocessing

In this first step, the files containing the collection documents are preprocessed. Only interest tags are kept.

In [1]:
import os
import re
import json
import utils

import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize

from collections import defaultdict

In [2]:
# listing the files
path = '../cfc/collection/'
files = os.listdir(path)

In [3]:
# interest tags
tags = (' AU ', ' TI ', ' SO ', ' MJ ', ' MN ', ' AB ', ' EX ', ' RF ')

In [4]:
# dict to store documents
processed = defaultdict(str)

# processing the documents
for file in files:
    curr_file = open(f'{path}{file}', 'r').readlines()
    
    # joining the text and saving into the dict
    processed = utils.split_join(file=curr_file, data=processed, tag='PN', exclude=('RN', 'AN'))

# keeping only interest tags
for doc in processed.keys():
    
    # spliting tags
    processed[doc] = processed[doc].split(' RF ')[0]
    if ' CT ' in processed[doc]:
        processed[doc] = processed[doc].split(' CT ')[0]
    if ' AU ' not in processed[doc]:
        processed[doc] = processed[doc].split(' TI ')[1]
    else:
        processed[doc] = processed[doc].split(' AU ')[1]

    # removing tag name
    for tag in tags:
        processed[doc] = processed[doc].replace(tag, '')

    # removing multiple whitespaces
    processed[doc] = re.sub(' +', ' ', processed[doc])

In [5]:
# checking the last doc
print(processed[doc])

Bohles-H. Heid-H. Stehr-K. Fekl-W.Deficiencies of essential fatty acids and vitamin E in cystic fibrosis.Z-Ernahrungswiss. 1979 Jul. 18(2). P 81-7.CYSTIC-FIBROSIS: pp. FATTY-ACIDS-ESSENTIAL: df. VITAMIN-E-DEFICIENCY: pp.ADOLESCENCE. BODY-HEIGHT. CHILD. CHILD-PRESCHOOL. CHOLESTEROL-ESTERS: bl. CYSTIC-FIBROSIS: co. FATTY-ACIDS: bl. FEMALE. HUMAN. INFANT. MALE. PHOSPHOLIPIDS: bl. TRIGLYCERIDES: bl. VITAMIN-E-DEFICIENCY: co. VITAMIN-E: bl.In 25 children (13 male; 12 female) with cystic fibrosis aged 6 months to 16 years and 24 matched controls total serum vitamin E levels and fatty acid patterns of serum cholesterol esters, phospholipids and triglycerides are demonstrated. Compared to controls (1.02 +/- 0.24 mg/dl) the total serum vitamin E levels are significantly decreased in patients with cystic fibrosis (0.30 +/- 0.26 mg/dl) (p less than 0.01). There is no significant difference comparing the fatty acid patterns of the serum ester fractions of both groups. Differences can be seen best 

In [6]:
# saving the number of words in each doc
def checking():
    word_counter = []
    for doc in processed.keys():
        doc_size = len(word_tokenize(processed[doc]))
        word_counter.append(doc_size)
    print(f'maximum number of words: {max(word_counter)}')
    return word_counter
    
# number of documents with word count greater than 512
sum(np.array(checking()) > 512)

maximum number of words: 657


6

In [7]:
# saving into json file   
with open("../outputs/preprocessed.json", "w") as outfile:
    json.dump(processed, outfile)

## 1.1 Stopwords

Next we will remove all punctuation.

In [8]:
# english stopwords
en_stopwords = set(stopwords.words('english'))

In [9]:
# removing stopwords
for doc in processed.keys():
    new_str = ''
    for word in word_tokenize(processed[doc]):
        if word not in en_stopwords:
            new_str += f'{word} '
    processed[doc] = new_str[:-1]

In [10]:
# number of documents with word count greater than 512
sum(np.array(checking()) > 512)

maximum number of words: 444


0

In [11]:
# saving into json file   
with open("../outputs/preprocessed_wo_stopwords.json", "w") as outfile:
    json.dump(processed, outfile)