In [5]:
# Reading PDF files and basic NLP
# import PyPDF2
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import io

import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fuzzy string match
from fuzzywuzzy import fuzz

# To generate word clouds
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
% matplotlib inline

# Read unicode data from the extra_stop_words file
import unicodedata
import re
import os

# Frequency counting and collections
import numpy as np
import collections

# To navigate through all files in a directory
from os import listdir
from os.path import isfile, join

## 1. Define functions for the text extraction, preprocessing and analysis

In [6]:
# Adapted from https://gist.github.com/boniattirodrigo/67429ada53b7337d2e79
def remove_special_characters(word, only_numbers_letters_spaces=False):
    """Strip diacritics from `word`, optionally dropping most other symbols.

    Args:
        word: Text to normalize (a `str`).
        only_numbers_letters_spaces: When True, additionally delete every
            character that is not an ASCII letter, an ASCII digit, a space or
            a backslash.

    Returns:
        The normalized string.
    """
    # NFKD decomposition separates base characters from their combining marks
    # (e.g. accents), so dropping the combining marks leaves the plain letter.
    nfkd = unicodedata.normalize('NFKD', word)
    plain_word = u"".join(c for c in nfkd if not unicodedata.combining(c))

    if only_numbers_letters_spaces:
        # Raw string: same pattern as before, without the invalid '\]' escape.
        # The trailing '\\' inside the class keeps literal backslashes.
        return re.sub(r'[^a-zA-Z0-9 \\]', '', plain_word)
    return plain_word

In [7]:
# Adapted from https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
# Updated the keywords removal 
def extract_keywords(path_pdf,
                     pdf_file,
                     path_output_raw_text,
                     language,#='portuguese', 
                     path_extra_stop_words,
                     file_extra_stop_words
                    ): 
    """Extract the lower-cased keywords of one PDF, caching its raw text.

    The raw text is looked up first in `path_output_raw_text` (same file name
    with '.pdf' replaced by '.txt'). On a cache miss the PDF is converted with
    pdfminer; if that yields no usable text, textract/tesseract OCR is used.
    The freshly extracted text is then written to the cache as UTF-8.

    Args:
        path_pdf: Directory containing the PDF.
        pdf_file: PDF file name inside `path_pdf`.
        path_output_raw_text: Directory used as raw-text cache.
        language: NLTK stop-word language; 'portuguese' selects the 'por'
            OCR language, anything else selects 'eng'.
        path_extra_stop_words: Directory of the extra stop-word file.
        file_extra_stop_words: File with one extra stop word per line (UTF-8).

    Returns:
        A tuple `(keywords, method)`: `keywords` is a numpy array of
        lower-cased alphabetic tokens (length >= 2) that are not stop words;
        `method` is 'pdfminer.six', 'textract_OCR', or '' when the text came
        from the cache.
    """
    # The method used to transform the pdf to text ('pdfminer.six' or
    # 'textract_OCR'); it stays '' when the raw text is served from the cache.
    method = ''
    raw_text_file = os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt'))

    # Check if the raw text is already available in path_output_raw_text.
    # The cache is written as UTF-8 below, so it must be read as UTF-8 too.
    try:
        with open(raw_text_file, 'r', encoding='utf8') as cached_raw:
            text = cached_raw.read()
    except IOError:
        # Cache miss (or unreadable cache file): extract from the PDF instead.
        text = None

    if text is None:
        pdf_full_path = os.path.join(path_pdf, pdf_file)
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        try:
            with open(pdf_full_path, 'rb') as pdf_handle:
                for page in PDFPage.get_pages(pdf_handle,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        finally:
            # Close the handles even when a page fails to parse.
            converter.close()
            fake_file_handle.close()

        # NOTE(review): pdfminer appears to emit page-break whitespace even for
        # image-only PDFs, so test for non-whitespace content rather than plain
        # truthiness; otherwise the OCR fallback would never run.
        if text.strip():
            method = 'pdfminer.six'
        else:
            # Scanned/image-based PDF: fall back to OCR through textract.
            method = 'textract_OCR'
            ocr_language = 'por' if language == 'portuguese' else 'eng'
            text = textract.process(pdf_full_path, method='tesseract',
                                    language=ocr_language, encoding='utf8')
            # textract hands back encoded bytes; the rest of the pipeline
            # (text-mode cache file, unicodedata) needs str.
            if isinstance(text, bytes):
                text = text.decode('utf8')

        with open(raw_text_file, 'w', encoding='utf8') as file_raw_text:
            file_raw_text.write(text)

    text = remove_special_characters(text)
    # word_tokenize() breaks the text into individual word/punctuation tokens.
    tokens = word_tokenize(text)
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    with open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8') as extra_file:
        extra_stop_words = extra_file.read().split('\n')

    # One set gives constant-time membership tests for every token.
    # NOTE(review): the text has its accents stripped above but the stop words
    # do not, so accented stop words may not match - confirm the extra
    # stop-word file covers the unaccented forms.
    excluded_words = set(stopwords.words(language)) | set(punctuations) | set(extra_stop_words)

    # Keep only alphabetic tokens of length >= 2 that are not excluded.
    keywords = [word.lower() for word in tokens
                if word.isalpha()
                and len(word) >= 2
                and word.lower() not in excluded_words]

    return np.asarray(keywords), method

In [8]:
def word_cloud_from_keywords_frequency(keywords_frequency, file_name, path='./wordclouds/', show_image=False):
    """Render a word cloud from word frequencies and save it as an image.

    Args:
        keywords_frequency: Mapping of word -> frequency/count, passed to
            `WordCloud.generate_from_frequencies`.
        file_name: Name of the image file to write.
        path: Directory in which the image is written.
        show_image: When True, also display the figure after saving it.
    """
    wordcloud = WordCloud(width = 512, height = 512, background_color='white')
    fig = plt.figure(figsize=(20,16),facecolor = 'white', edgecolor='blue')
    try:
        plt.imshow(wordcloud.generate_from_frequencies(keywords_frequency), interpolation='bilinear')
        plt.axis("off")
        plt.tight_layout(pad=0)

        # Save BEFORE showing: once a figure has been shown it may already be
        # flushed, and a later savefig can write an empty image.
        fig.savefig(os.path.join(path, file_name))
        if show_image:
            plt.show()
    finally:
        # Always release the figure, even if generation or saving failed.
        plt.close(fig)

In [9]:
def calculate_statistics(keywords):
    """Compute per-word counts and relative frequencies.

    Args:
        keywords: Iterable of hashable words.

    Returns:
        A dict ordered from most to least common word, mapping each word to
        `{'count': <occurrences>, 'text_frequency': <occurrences / total>}`.
        An empty input yields an empty dict.
    """
    ranked = collections.Counter(keywords).most_common()
    grand_total = float(sum(occurrences for _, occurrences in ranked))
    return {
        term: {'count': occurrences, 'text_frequency': occurrences / grand_total}
        for term, occurrences in ranked
    }

In [10]:
def write_report(path, output_file, keyword_stats):
    """Write the per-word statistics as a UTF-8 CSV file.

    Args:
        path: Directory in which the report is written.
        output_file: Name of the CSV file.
        keyword_stats: Mapping word -> {'count': int, 'text_frequency': float},
            as produced by `calculate_statistics`.
    """
    # `with` guarantees the file is flushed and closed even if a row fails
    # to format.
    with open(os.path.join(path, output_file), 'w', encoding='utf8') as output:
        output.write(u'word,count,text_frequency\n')
        for word, word_stats in keyword_stats.items():
            output.write(u'%s,%d,%.5f\n' % (word, word_stats['count'], word_stats['text_frequency']))

## 2. Count occurrences of any word

* All PDFs in a given path are processed.
* This function generates 'raw reports', counting the occurrence and frequency of every word of the document. 
* A word is defined as a sequence of characters delimited by empty spaces, i.e. ' '. 
* **TODO:** This version of the code still doesn't handle pdfs that are in PT and EN.

In [7]:
def process_PDFs(path='./data/', # dir containing the subdirs per language
                     raw_text_path='./raw_texts/EN/',
                     output_path='./reports/EN/', 
                     wordclouds_path='./wordclouds/EN/',
                     language='english',
                     path_extra_stop_words='./',
                     file_extra_stop_words='extra_stop_words_EN.txt',
                     wordclouds=False, 
                     start_index=0):
    """Generate a word-frequency report (and optionally a word cloud) per PDF.

    Args:
        path: Directory containing the PDF files to process.
        raw_text_path: Directory used by `extract_keywords` as raw-text cache.
        output_path: Directory receiving one '<name>.csv' report per PDF.
        wordclouds_path: Directory receiving one '<name>.png' per PDF.
        language: Language forwarded to `extract_keywords`.
        path_extra_stop_words: Directory of the extra stop-word file.
        file_extra_stop_words: Name of the extra stop-word file.
        wordclouds: When True, also render a word cloud for each PDF.
        start_index: Index (in the sorted file list) of the first PDF to
            process; lets an interrupted run be resumed.
    """
    num_pdfminersix = 0
    num_textract_OCR = 0
    num_cached = 0

    # listdir() returns names in arbitrary order; sort so that start_index
    # always refers to the same file. endswith() avoids picking up names that
    # merely contain '.pdf' (e.g. 'x.pdf.bak').
    PDFs = sorted(f for f in listdir(path)
                  if isfile(join(path, f)) and f.endswith('.pdf'))

    for counter, pdf in enumerate(PDFs[start_index:], start=start_index):
        print('Processing PDFs ({0}) {1} of {2} ({3})'.format(language, counter+1, len(PDFs), pdf))
        keywords, method = extract_keywords(path, pdf, 
                                    path_output_raw_text=raw_text_path, 
                                    language=language,
                                    path_extra_stop_words=path_extra_stop_words,
                                    file_extra_stop_words=file_extra_stop_words)
        if method == 'pdfminer.six':
            num_pdfminersix += 1
        elif method == 'textract_OCR':
            num_textract_OCR += 1
        else:
            # extract_keywords reports '' when the raw text came from the
            # cache; do not count those as OCR runs.
            num_cached += 1

        keywords_statistics = calculate_statistics(keywords)
        write_report(output_path, pdf.replace('.pdf', '.csv'), keywords_statistics)

        # A word cloud cannot be generated from zero words, so skip it then.
        if wordclouds and len(keywords) > 0:
            word_cloud_from_keywords_frequency(collections.Counter(keywords), 
                                               pdf.replace('.pdf', '.png'), 
                                               path=wordclouds_path,
                                               show_image=False)
    print('Stats for {0}: num_pdfminersix = {1}, num_textract_OCR = {2}, num_cached = {3}'
          .format(raw_text_path, num_pdfminersix, num_textract_OCR, num_cached))

### 2.1 Executing the PDF processing
This code may take a while to finish depending on the amount of PDF files. 

In [8]:
%%time
# # ENglish PDFs processing
# process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_EN/', 
#              raw_text_path='./raw_texts/EN/',
#              output_path='./reports/EN/', 
#              wordclouds_path='./wordclouds/EN/',
#              language='english',
#              wordclouds=True,
#             start_index = 0)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11 µs


In [9]:
%%time
# # PorTuguese PDFs processing
# for i in range(1,5): # Hardcoded 5 because I know there are only 4 directories... 
#     print('Processing pdfs_PT{0}'.format(i))
#     process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_PT{0}/'.format(i), 
#              raw_text_path='./raw_texts/PT/',
#              output_path='./reports/PT/', 
#              wordclouds_path='./wordclouds/PT/',
#              language='portuguese',
#              file_extra_stop_words='extra_stop_words_PT.txt',
#              wordclouds=True,
#              start_index=0)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 10 µs


## 3. Count occurrences of predefined phrases/words

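Count occurrences of predefined words/phrases in the raw texts. The matching strategy is selected through the ```word_match_count``` argument of `filter_special_keywords`: `count_normal` for exact matches or `count_fuzzy` for 'fuzzy string match'. 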

In [11]:
# def count_total_words(text, delim=' '): 
#     return len(remove_special_characters(text).split(delim))

In [12]:
def count_normal(text, word, confidence = 0, phrases = False, debug = False, output = ''):
    """Count exact occurrences of `word` in `text`.

    Args:
        text: Text to search.
        word: Word or phrase to look for.
        confidence: Unused; present so the signature matches `count_fuzzy`.
        phrases: When True, count non-overlapping substring occurrences;
            otherwise count whitespace-delimited tokens equal to `word`.
        debug: Unused; present so the signature matches `count_fuzzy`.
        output: Unused; present so the signature matches `count_fuzzy`.

    Returns:
        The number of occurrences (0 for an empty `word`).
    """
    # Same guard as count_fuzzy: without it, str.count('') would report
    # len(text) + 1 "matches" for a blank search term.
    if len(word) == 0:
        return 0
    if phrases:
        return text.count(word)
    # split() (any whitespace) rather than split(' '), so words next to line
    # breaks or tabs are counted too - consistent with count_fuzzy.
    return text.split().count(word)

def count_fuzzy(text, word, confidence = 95, phrases=False, debug = False, output = ''):
    """Count approximate occurrences of `word` in `text` using `fuzz.ratio`.

    A candidate counts as a match when its ratio against `word` is strictly
    greater than `confidence`.

    Args:
        text: Text to search.
        word: Word or phrase to look for; an empty value returns 0.
        confidence: Score threshold (exclusive).
        phrases: When True, slide a window of `len(word)` characters over
            `text` one character at a time, jumping past a window once it
            matches so it is not counted twice. When False, compare each
            whitespace-delimited token of `text` against `word`.
        debug: When True, write the search term and every match (with its
            score) to `output` as one comma-separated line.
        output: Object with a `write` method; only used when `debug` is True.

    Returns:
        The number of matches found.
    """
    if len(word) < 1:
        return 0

    def log_entry(candidate, score):
        # Append one "<candidate> (<score>)," cell to the debug line.
        if debug:
            output.write('{0} ({1})'.format(candidate, score) + ',')

    hits = 0
    if phrases:
        window_size = len(word)
        log_entry(word, confidence)
        start = 0
        # Stop as soon as a full-size window no longer fits in the text.
        while start + window_size <= len(text):
            window = text[start:start + window_size]
            score = fuzz.ratio(window, word)
            if score > confidence:
                hits += 1
                log_entry(window, score)
                # Skip the matched window to avoid double counting.
                start += window_size
            else:
                start += 1
    else:
        tokens = text.split()
        log_entry(word, confidence)
        for token in tokens:
            score = fuzz.ratio(token, word)
            if score > confidence:
                log_entry(token, score)
                hits += 1

    if debug:
        output.write('\n')
    return hits

In [14]:
def filter_special_keywords(filter_list_path='./', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='output_EN.csv',
                            language='english',
                            path_extra_stop_words='./',
                            file_extra_stop_words='extra_stop_words_EN.txt',
                            word_match_count=count_normal,
                            output_encoding='utf8',
                            confidence=95,
                            phrases=False,
                            output_debug_path='./debug_fuzzy/EN/',
                            debug=False,
                            start_index=0,
                            last_index=-1,
                            preprocessing=False):
    """Count predefined words/phrases in every raw text and write a CSV matrix.

    The CSV has one column per processed text file. Its first data row holds
    the total number of words of each file (after stop-word removal); each
    following row holds the counts of one filter word.

    Args:
        filter_list_path: Directory of the filter-word list.
        filter_list_file: UTF-8 file with one word/phrase per line.
        raw_text_path: Directory containing the '.txt' raw texts.
        output_path: Directory receiving the CSV report.
        output_file: Name of the CSV report.
        language: NLTK stop-word language.
        path_extra_stop_words: Directory of the extra stop-word file.
        file_extra_stop_words: Name of the extra stop-word file.
        word_match_count: Callable invoked as
            `(text, word, confidence, phrases, debug, output)` returning the
            number of matches (e.g. `count_normal` or `count_fuzzy`).
        output_encoding: Encoding of the CSV report.
        confidence: Threshold forwarded to `word_match_count`.
        phrases: Forwarded to `word_match_count`; when False and
            `preprocessing` is True, stop words are removed before matching.
        output_debug_path: Directory receiving per-file debug CSVs.
        debug: When True, a debug file is created per text and forwarded to
            `word_match_count`.
        start_index: Index (in the sorted file list) of the first text.
        last_index: Exclusive end index; -1 means "through the last file".
        preprocessing: See `phrases`.
    """
    print('Starting filter_special_keywords')
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    with open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8') as stop_file:
        extra_stop_words = stop_file.read().split('\n')
    # One set gives constant-time membership tests for every token.
    excluded_words = set(stopwords.words(language)) | set(punctuations) | set(extra_stop_words)

    if not phrases:
        print('[not phrases] Stopwords will be removed prior to processing the texts.')

    filter_words = []
    try:
        with open(os.path.join(filter_list_path, filter_list_file), 'r', encoding='utf8') as filter_file:
            # Skip blank lines (typically the trailing newline): an empty
            # search term would only produce a meaningless report row.
            filter_words = [line.rstrip('\r') for line in filter_file.read().split('\n')
                            if line.strip()]
        print('successfully read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    except IOError:
        print('failed to read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))

    # Assuming there are only the raw texts in the directory. Sorted so that
    # start_index/last_index always address the same files.
    text_files = sorted(f for f in listdir(raw_text_path)
                        if isfile(join(raw_text_path, f)) and f.endswith('.txt'))

    if last_index == -1:
        last_index = len(text_files)

    stats_per_file = {}
    # Files that were actually read; only these get a column in the report.
    reported_files = []

    for offset, text_file in enumerate(text_files[start_index:last_index]):
        print('Processing Text ({0}) {1} of {2} ({3})'.format(language, start_index+offset+1, 
                                                              last_index, text_file))
        try:
            # The raw texts are written as UTF-8, so read them back as UTF-8.
            with open(os.path.join(raw_text_path, text_file), 'r', encoding='utf8') as raw_file:
                text = raw_file.read()
        except IOError:
            print('Failed to open file at {0} named {1}'.format(raw_text_path, text_file))
            # Skip the file instead of counting on missing or stale text.
            continue

        # Holds the counters for the search words {'word': #occurrences}.
        stats = {}

        # Strip diacritics (does NOT remove other special characters) and
        # lower-case the text.
        search_text = remove_special_characters(text).lower()

        # Remove stop words, extra stop words and punctuation. The empty
        # string is part of `punctuations`, so empty tokens are dropped too.
        # NOTE(review): tokens are split on single spaces only, so words
        # separated by newlines count as one token - confirm this is intended.
        # TODO: merge this and the stop-word filter of extract_keywords.
        search_text_nostopwords = [token for token in search_text.split(' ')
                                   if token not in excluded_words]

        # If NOT phrases, it is assumed safe to remove stop words from the
        # searched text (a search word that is itself a stop word would then
        # never match).
        if preprocessing and not phrases:
            search_text = ' '.join(search_text_nostopwords)

        # The total is ALWAYS counted after removing stop words.
        stats['@TOTAL_WORDS'] = len(search_text_nostopwords)

        # Only create the debug file when debugging; otherwise pass the same
        # placeholder the counting functions use as their default.
        debug_output = ''
        if debug:
            debug_file = 'DEBUG_{0}_'.format(confidence) + text_file.replace('.txt', '.csv')
            debug_output = open(os.path.join(output_debug_path, debug_file), 'w', encoding='utf8')
        try:
            for filter_word in filter_words:
                # Normalize the filter word the same way as the text.
                search_word = remove_special_characters(filter_word).lower()
                stats[filter_word] = word_match_count(search_text, search_word, confidence,
                                                      phrases, debug, debug_output)
        finally:
            if debug:
                debug_output.close()

        stats_per_file[text_file] = stats
        reported_files.append(text_file)

    with open(os.path.join(output_path, output_file), 'w', encoding=output_encoding) as output:
        # Row 1: file names. Row 2: label followed by the per-file totals.
        output.write(u',' + u','.join(reported_files) + u'\n')
        output.write(u'words \\ total_words_per_file,')
        for text_file in reported_files:
            output.write(str(stats_per_file[text_file]['@TOTAL_WORDS']) + ',')
        output.write('\n')

        # One row per filter word with its count in every file.
        for filter_word in filter_words:
            output.write(filter_word + ',')
            for text_file in reported_files:
                output.write(str(stats_per_file[text_file][filter_word]) + ',')
            output.write('\n')
    print('Finishing filter_special_keywords')

# ~ + ~ + ~ + ~ + ~ + ~ + ~
# Filter the special words
# ~ + ~ + ~ + ~ + ~ + ~ + ~

## 3.1 EN 'exact match'


### <span style="color:blue">PREPROCESSING=OFF</span>
### <span style="color:blue">EXACT MATCH</span>

In [13]:
%%time
# filter_special_keywords(filter_list_path='./', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN/',
#                             output_path='./reports_special/EN/',
#                             output_file='relatorio_normal_frases_EN.csv',
#                             word_match_count=count_normal,
#                             confidence=-1,
#                             phrases=True,
#                             preprocessing=False
#                             )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


### 3.1.1 EN <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (90%)</span>

In [14]:
%%time
# Run parameters for the (currently commented-out) fuzzy-match call below.
CONFIDENCE = 90  # forwarded as `confidence`: count_fuzzy counts scores strictly above it
START_INDEX = 0  # forwarded as `start_index`: first raw-text file to process
LAST_INDEX = -1  # forwarded as `last_index`: -1 means process through the last file

# filter_special_keywords(filter_list_path='./', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN/',
#                             output_path='./reports_special/EN/',
#                             output_file='relatorio_aproximado({0})_frases_EN_{1}-{2}.csv'
#                                         .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
#                             word_match_count=count_fuzzy,
#                             confidence=CONFIDENCE,
#                             phrases=True,
#                             preprocessing=False,
                        
#                             output_debug_path='./debug_fuzzy/EN/',
#                             debug=True,
#                             start_index=START_INDEX,
#                             last_index=LAST_INDEX
#                             )

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


### 3.1.2 EN <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (95%)</span>

In [15]:
%%time
# CONFIDENCE = 95
# START_INDEX = 0
# LAST_INDEX = -1

# filter_special_keywords(filter_list_path='./', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN/',
#                             output_path='./reports_special/EN/',
#                             output_file='relatorio_aproximado({0})_frases_EN_{1}-{2}.csv'
#                                         .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
#                             word_match_count=count_fuzzy,
#                             confidence=CONFIDENCE,
#                             phrases=True,
#                             preprocessing=False,
                        
#                             output_debug_path='./debug_fuzzy/EN/',
#                             debug=True,
#                             start_index=START_INDEX,
#                             last_index=LAST_INDEX
#                             )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


## 3.2 PT 'exact match'


### <span style="color:blue">PREPROCESSING=OFF</span>
### <span style="color:blue">EXACT MATCH</span>

In [16]:
%%time
# filter_special_keywords(filter_list_path='./', 
#                             filter_list_file='lista_palavras_PT.txt', 
#                             raw_text_path='./raw_texts/PT/',
                        
#                             path_extra_stop_words='./',
#                             file_extra_stop_words='extra_stop_words_PT.txt',
                        
#                             output_path='./reports_special/PT/',
#                             output_file='relatorio_normal_frases_PT.csv',
#                             word_match_count=count_normal,
                        
#                             language='portuguese',
                        
#                             confidence=-1,
#                             phrases=True,
#                             preprocessing=False
#                             )

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs


### 3.2.1 PT <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (90%)</span>

In [17]:
%%time
# CONFIDENCE = 90
# START_INDEX = 0
# LAST_INDEX = -1

# filter_special_keywords(filter_list_path='./', 
#                             filter_list_file='lista_palavras_PT.txt', 
#                             raw_text_path='./raw_texts/PT/',
                        
#                             path_extra_stop_words='./',
#                             file_extra_stop_words='extra_stop_words_PT.txt',
                        
#                             output_path='./reports_special/PT/',
#                             output_file='relatorio_aproximado({0})_frases_PT_{1}-{2}.csv'
#                                         .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
#                             word_match_count=count_fuzzy,
#                             confidence=CONFIDENCE,
#                             phrases=True,
#                             preprocessing=False,
                        
#                             language='portuguese',
                        
#                             output_debug_path='./debug_fuzzy/PT/',
#                             debug=True,
#                             start_index=START_INDEX,
#                             last_index=LAST_INDEX
#                             )

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


### 3.2.2 PT <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (95%)</span>

In [18]:
%%time
# CONFIDENCE = 95
# START_INDEX = 0
# LAST_INDEX = -1

# filter_special_keywords(filter_list_path='./', 
#                             filter_list_file='lista_palavras_PT.txt', 
#                             raw_text_path='./raw_texts/PT/',
                        
#                             path_extra_stop_words='./',
#                             file_extra_stop_words='extra_stop_words_PT.txt',
                        
#                             output_path='./reports_special/PT/',
#                             output_file='relatorio_aproximado({0})_frases_PT_{1}-{2}.csv'
#                                         .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
#                             word_match_count=count_fuzzy,
#                             confidence=CONFIDENCE,
#                             phrases=True,
#                             preprocessing=False,
                        
#                             language='portuguese',
                        
#                             output_debug_path='./debug_fuzzy/PT/',
#                             debug=True,
#                             start_index=START_INDEX,
#                             last_index=LAST_INDEX
#                             )

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs
