In [1]:
# Reading PDF files and basic NLP
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fuzzy string match
from fuzzywuzzy import fuzz

# To generate word clouds
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
% matplotlib inline

# Read unicode data from the extra_stop_words file
import unicodedata
import re
import os

# Frequency counting and collections
import numpy as np
import collections

# To navigate through all files in a directory
from os import listdir
from os.path import isfile, join

## 1. Define functions for the text extraction, preprocessing and analysis

In [2]:
# Adapted from https://gist.github.com/boniattirodrigo/67429ada53b7337d2e79
def remove_special_characters(word):

    # Unicode normalize transforma um caracter em seu equivalente em latin.
    nfkd = unicodedata.normalize('NFKD', word)
    plain_word = u"".join([c for c in nfkd if not unicodedata.combining(c)])

    # return the word with only numbers, letters and spaces
    return re.sub('[^a-zA-Z0-9 \\\]', '', plain_word)

In [3]:
# Adapted from https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
# Updated the keywords removal 
def extract_keywords(path_pdf,
                     pdf_file,
                     path_output_raw_text,
                     stop_words_language,#='portuguese', 
                     path_extra_stop_words,#='/Users/hmg/Dropbox/veve e heitor/Projeto_tese',
                     file_extra_stop_words): #='extra_stop_words.txt'):
    
    # Check if the raw text is not already availabe in the path_output_raw_text
    text = ''
    try:
        candidate_raw = open(os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt')), 'r')
        text = candidate_raw.read()
#         print('successfully read the raw text! ')
    except IOError:
#         print('gotta scan the pdf...')
        # open allows you to read the file
        pdfFileObj = open(os.path.join(path_pdf, pdf_file), 'rb')
        # The pdfReader variable is a readable object that will be parsed
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # discerning the number of pages will allow us to parse through all the pages
        num_pages = pdfReader.numPages
        # Just to show info about creator and creation time. 
        # print(str(pdfReader.getDocumentInfo())
        count = 0

        # The while loop will read each page
        while count < num_pages:
            pageObj = pdfReader.getPage(count)
            count += 1
            text += pageObj.extractText()
        # This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned
        # files.
        if text != "":
            text = text
        # If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
        else: # fileurl
            text = textract.process(os.path.join(path_pdf, pdf_file), method='tesseract', language='eng', encoding='utf8')

        file_raw_text = open(os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt')), 'w', encoding='utf8')
        file_raw_text.write(text)
    
    text = remove_special_characters(text)
    # The word_tokenize() function will break our text phrases into #individual words
    tokens = word_tokenize(text)
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    stop_words = stopwords.words(stop_words_language)
    extra_stop_words = open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8').read().split('\n')
    
    # We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN
    # punctuations.
    keywords = [word.lower() for word in tokens 
                if 
                not word.lower() in stop_words and 
                not word.lower() in punctuations and 
                not word.lower() in extra_stop_words and 
                word.isalpha() and
                len(word) >= 2]
    
    return np.asarray(keywords)

In [4]:
def word_cloud_from_keywords_frequency(keywords_frequency, file_name, path='./wordclouds/', show_image=False):
    wordcloud = WordCloud(width = 512, height = 512, background_color='white')
    fig = plt.figure(figsize=(20,16),facecolor = 'white', edgecolor='blue')
    plt.imshow(wordcloud.generate_from_frequencies(keywords_frequency), interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    
    if show_image:
        plt.show()
    plt.savefig(path+file_name)
    plt.close(fig)

In [5]:
def calculate_statistics(keywords):
    total_words = 0
    keywords_dict = dict(collections.Counter(keywords).most_common())
    for k in keywords_dict:
        total_words += keywords_dict[k]

    stats = dict()
    for k in keywords_dict:
        stats[k] = {'count': keywords_dict[k], 'text_frequency': keywords_dict[k]/float(total_words)}    
#     stats['frequency'].most_common(10)
    
    return stats

In [6]:
def write_report(path, output_file, keyword_stats):
    output = open(path+output_file, 'w', encoding='utf8')
    header = u'word,count,text_frequency\n'
    output.write(header)
    for k in keyword_stats:
        line = u'%s,%d,%.5f\n' % (k, keyword_stats[k]['count'], keyword_stats[k]['text_frequency'])
        output.write(line)
    output.close()

## 2. Count occurrences of any word

* All PDFs in a given path are processed.
* This function generates 'raw reports', counting the occurrence and frequency of every word of the document. 
* A word is defined as a sequence of characters delimited by empty spaces, i.e. ' '. 
* **TODO:** This version of the code still doesn't handle pdfs that are in PT and EN.

In [7]:
def process_PDFs(path='/Users/hmg/Desktop/data/PDF_relatorios_sustentabilidade/', 
                     raw_text_path='./raw_texts/EN/',
                     output_path='./reports/EN/', 
                     wordclouds_path='./wordclouds/EN/',
                     stop_words_language='english',
                     path_extra_stop_words='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/',
                     file_extra_stop_words='extra_stop_words_EN.txt',
                     wordclouds=False, 
                     start_index=0):
    # We assume there are only pdfs in this directory
    PDFs = [f for f in listdir(path) if isfile(join(path, f))]
    PDFs = list(filter(lambda pdf: pdf.find('.pdf') != -1, PDFs))
        
    counter = start_index
    for pdf in PDFs[counter:]:
        
        print('Processing PDFs ({0}) {1} of {2} ({3})'.format(stop_words_language, counter+1, len(PDFs), pdf))
        keywords = extract_keywords(path, pdf, 
                                    path_output_raw_text=raw_text_path, 
                                    stop_words_language=stop_words_language,
                                    path_extra_stop_words=path_extra_stop_words,
                                    file_extra_stop_words=file_extra_stop_words)
        keywords_statistics = calculate_statistics(keywords)
        write_report(output_path, pdf.replace('.pdf', '.csv'), keywords_statistics)
        counter = counter + 1
        if wordclouds:
            word_cloud_from_keywords_frequency(collections.Counter(keywords), 
                                               pdf.replace('.pdf', '.png'), 
                                               path=wordclouds_path,
                                               show_image=False)

In [8]:
# ENglish PDFs processing
# process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_EN/', 
#              raw_text_path='./raw_texts/EN/',
#              output_path='./reports/EN/', 
#              wordclouds_path='./wordclouds/EN/',
#              stop_words_language='english',
#              wordclouds=True,
#             start_index = 0)

In [9]:
# PorTuguese PDFs processing
# for i in range(1,5): # Hardcoded 5 because I know there are only 4 directories... 
#     print('Processing pdfs_PT{0}'.format(i))
#     process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_PT{0}/'.format(i), 
#              raw_text_path='./raw_texts/PT/',
#              output_path='./reports/PT/', 
#              wordclouds_path='./wordclouds/PT/',
#              stop_words_language='portuguese',
#              file_extra_stop_words='extra_stop_words_PT.txt',
#              wordclouds=True,
#              start_index=0)

## 3. Count occurrences of predefined phrases/words

**IN-DEVELOPMENT:** Count occurrences of predefined text using 'fuzzy string match', added ```word_match_count``` lambda object. 

In [10]:
# def count_total_words(text, delim=' '): 
#     return len(remove_special_characters(text).split(delim))

In [92]:
def count_normal(text, word, confidence = 0, phrases = False, debug = False, output = ''):
    if phrases:
        return text.count(word)
    else:
        counter = 0
        for p in text.split(' '):
            if(p == word):
                counter = counter + 1
        return counter

def count_fuzzy(text, word, confidence = 95, phrases=False, debug = False, output = ''):
    counter = 0
    if len(word) == 0:
        return 0
    # Phrases
    if phrases:
        # Sliding window strategy: create a text_word from text with length word.len 
        #  and by moving 1 character at a time
        #     If match, then skip the next word.len, just to avoid double counting!
        i = 0
        ## DEBUG
#         print('count_fuzzy - word = {0} and, len(text) = {1}, confidence = {2}'.format(word, len(text), confidence))
        if debug:
            output.write('{0} ({1})'.format(word, confidence) + ',')
#             print('{0} ({1})'.format(word, confidence))
    
        while i < len(text):
            text_word_last_idx = i+len(word)
            match_confidence = 0
            if text_word_last_idx <= len(text):
                match_confidence = fuzz.ratio(text[i:text_word_last_idx], word)
#                 print('token_set_ratio({0},{1})={2}'.format(text[i:text_word_last_idx], word, match_confidence))
                if match_confidence > confidence:
                    counter += 1
                    if debug:
                        output.write('{0} ({1})'.format(text[i:text_word_last_idx], match_confidence) + ',')
#                         print('{0} =~ {1} ({2})'.format(text[i:text_word_last_idx], word, match_confidence))
                    i += len(word)
                else:
                    i += 1
            else:
                break
            ## DEBUG - only about 100 characters
#             if debug and (i % int(len(text)/10) == 0):
#                 print('{} of {} = {:.2f}%'.format(i, len(text), i/len(text)*100))
    else:
        text_vec = text.split()
        if debug:
            output.write('{0} ({1})'.format(word, confidence) + ',')
        for w in text_vec:
            match_confidence = fuzz.ratio(w, word)
            if match_confidence > confidence:
                if debug:
                    output.write('{0} ({1})'.format(w, match_confidence) + ',')
                counter += 1
    if debug:
        output.write('\n')
    return counter

In [103]:
def filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='output_EN.csv',
                            stop_words_language='english',
                            path_extra_stop_words='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/',
                            file_extra_stop_words='extra_stop_words_EN.txt',
                            word_match_count=count_normal,
                            confidence=95,
                            phrases=False,
                            output_debug_path='./debug_fuzzy/EN/',
                            debug=False):
    print('Starting filter_special_keywords')
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    stop_words = stopwords.words(stop_words_language)
    extra_stop_words = open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8').read().split('\n')
    
    if not phrases:
        print('[not phrases] Stopwords will be removed prior to processing the texts.')
        
    
    filter_words = ''
    try:
        filter_words = open(os.path.join(filter_list_path, filter_list_file), 'r', encoding='utf8').read().split('\n')
        print('successfully read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    except IOError:
        print('failed to read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    
    # assuming there are only the raw texts in the directory
    text_files = [f for f in listdir(raw_text_path) if isfile(join(raw_text_path, f))]
    text_files = list(filter(lambda text_file: text_file.find('.txt') != -1, text_files))
    
    stats_per_file = {}
    total_words_per_file = {}
    processed = 0
    for text_file in text_files:
        print('Processing Text ({0}) {1} of {2} ({3})'.format(stop_words_language, processed+1, 
                                                              len(text_files), text_file))
        debug_output = open(output_debug_path+'DEBUG_'+text_file.replace('.txt','.csv'), 'w', encoding='utf8') 
        try:
            text = open(os.path.join(raw_text_path, text_file), 'r').read()
        except IOError:
            print('Failed to open file at {0} named {1}'.format(raw_text_path, text_file))
                        
        # Transform the original text, remove special characters and set it to lower
        search_text = remove_special_characters(text).lower()
        
        # TODO: merge this and the previous stop_word filter in a function. 
        search_text_nostopwords = [word for word in search_text.split(' ') 
                                   if 
                                   not word in stop_words and
                                   not word in punctuations and 
                                   not word in extra_stop_words and
                                   len(word) >= 1]
        
        stats = {}
        stats['@TOTAL_WORDS'] = len(search_text_nostopwords)
        if not phrases:
            search_text = ' '.join(search_text_nostopwords)
        
        
        for filter_word in filter_words:
            # Transform the filter_word, remove special characters and set it to lower
            search_word = remove_special_characters(filter_word).lower()
            stats[filter_word] = word_match_count(search_text, search_word, confidence, phrases, debug, debug_output)
            # text.lower().count(filter_word.lower())
#             line = u'%s,%d,%.5f\n' % (filter_word, counter, -1)

        stats_per_file[text_file.replace('.txt','')] = stats
        
        processed = processed + 1
        
    output = open(output_path+output_file, 'w', encoding='utf8')   
    header = u',' + u','.join(text_files)+u'\n'
    header = header + u'words \ total_words_per_file,' + u','.join(total_words_per_file)
    output.write(header)
    
    for text_file in text_files:
        output.write(str(stats_per_file[text_file.replace('.txt','')]['@TOTAL_WORDS']) + ',')
    output.write('\n')
    
    for filter_word in filter_words:
        output.write(filter_word + ',')
        for text_file in text_files:
#             line = u'%s,%d,%.5f\n' % (filter_word, counter, -1)
            output.write(str(stats_per_file[text_file.replace('.txt','')][filter_word]) + ',')
        output.write('\n')
    output.close()
    print('Finishing filter_special_keywords')

### 3.1 Filter the special words EN

In [101]:
%%time
filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='relatorio_normal_frases_EN.csv',
                            word_match_count=count_normal,
                            confidence=-1,
                            phrases=True
                            )

Starting filter_special_keywords
successfully read the filter words at /Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/ named lista_palavras_EN.txt
Processing Text (english) 1 of 209 (CPFL_RE_10_2014_EN.txt)
Processing Text (english) 2 of 209 (ESTC_11_2015_EN.txt)
Processing Text (english) 3 of 209 (BUN_8_Non_2012_EN.txt)
Processing Text (english) 4 of 209 (EMBR_19_2012_EN.txt)
Processing Text (english) 5 of 209 (LREN_17_2015_EN.txt)
Processing Text (english) 6 of 209 (Syngenta_24_2015_EN.txt)
Processing Text (english) 7 of 209 (TCSA_21_2017_EN.txt)
Processing Text (english) 8 of 209 (ALI_14_Non_2015_EN.txt)
Processing Text (english) 9 of 209 (END_2_Non_2011_EN.txt)
Processing Text (english) 10 of 209 (FIBR_7_2014_EN.txt)
Processing Text (english) 11 of 209 (SANB_14_2011_EN.txt)
Processing Text (english) 12 of 209 (IMA_33_Non_2013_EN.txt)
Processing Text (english) 13 of 209 (CIEL_14_2016_EN.txt)
Processing Text (english) 14 of 209 (INP_24_Non_2013_EN.txt)
Processing Text (english) 1

In [99]:
%%time
## APROXIMADO 95%
confidence = 95
filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='relatorio_aproximado({0})_frases_EN.csv'.format(confidence),
                            word_match_count=count_fuzzy,
                            confidence=confidence,
                            phrases=True,
                            debug=True,
                            output_debug_path='./debug_fuzzy/EN/'
                            )

Starting filter_special_keywords
successfully read the filter words at /Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/ named lista_palavras_EN.txt
Processing Text (english) 1 of 209 (CPFL_RE_10_2014_EN.txt)
Processing Text (english) 2 of 209 (ESTC_11_2015_EN.txt)
Processing Text (english) 3 of 209 (BUN_8_Non_2012_EN.txt)
Processing Text (english) 4 of 209 (EMBR_19_2012_EN.txt)
Processing Text (english) 5 of 209 (LREN_17_2015_EN.txt)
Processing Text (english) 6 of 209 (Syngenta_24_2015_EN.txt)
Processing Text (english) 7 of 209 (TCSA_21_2017_EN.txt)
Processing Text (english) 8 of 209 (ALI_14_Non_2015_EN.txt)
Processing Text (english) 9 of 209 (END_2_Non_2011_EN.txt)
Processing Text (english) 10 of 209 (FIBR_7_2014_EN.txt)
Processing Text (english) 11 of 209 (SANB_14_2011_EN.txt)
Processing Text (english) 12 of 209 (IMA_33_Non_2013_EN.txt)
Processing Text (english) 13 of 209 (CIEL_14_2016_EN.txt)
Processing Text (english) 14 of 209 (INP_24_Non_2013_EN.txt)
Processing Text (english) 1

Processing Text (english) 138 of 209 (HSB_14_Non_2013_EN.txt)
Processing Text (english) 139 of 209 (Syngenta_24_2016_EN.txt)
Processing Text (english) 140 of 209 (FDC_11_Non_2017_EN.txt)
Processing Text (english) 141 of 209 (ITA_10_Non_2014_EN.txt)
Processing Text (english) 142 of 209 (BBDC_14_2012.txt)
Processing Text (english) 143 of 209 (NATU_15_2016_EN.txt)
Processing Text (english) 144 of 209 (LAME_25_2017_EN.txt)
Processing Text (english) 145 of 209 (TIET ELE_2_2015_EN.txt)
Processing Text (english) 146 of 209 (KLBN_7_2017_EN.txt)
Processing Text (english) 147 of 209 (B3SA_14_2015_EN.txt)
Processing Text (english) 148 of 209 (ELE_10_Non_2014_EN.txt)
Processing Text (english) 149 of 209 (INV_12_Non_2015_EN.txt)
Processing Text (english) 150 of 209 (BBAS_14_2015_EN.txt)
Processing Text (english) 151 of 209 (TET_6_Non_2018_EN.txt)
Processing Text (english) 152 of 209 (AMA_12_Non_2016_EN.txt)
Processing Text (english) 153 of 209 (VIM_1_Non_2015_EN.txt)
Processing Text (english) 154 o

## Sandbox

In [None]:
# print(len(PDFs))
# print(len(PDFs_PT))
# print(len(PDFs_EN))

In [None]:
# %%time
# ## APROXIMADO 85%
# confidence = 85
# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN_dev/',
#                             output_path='./reports_special/EN/',
#                             output_file='relatorio_aproximado({0})_frases_EN_dev.csv'.format(confidence),
#                             word_match_count=count_fuzzy,
#                             confidence=confidence,
#                             phrases=True,
#                             debug=True,
#                             output_debug_path='./debug_fuzzy/EN_dev/'
#                             )

In [None]:
text = u'administrativa da joao Elekeiroz. GRI 2.3  joao| 2.4As vendas para o mercado interno 88 corresponderam a 88% da Receita Líquida da Elekeiroz em 2011. As exportações para\nmais de 30 países responderam pelos restantes 12%. Os produtos \norgânicos foram vendidos nos mercados interno e externo, enquanto os inorgânicos tiveram como destino o mercado interno. MISSÃO GRI 4.8  GRI 2.2 | 2.7 | 2.8 Buscar a melhoria contínua dos produtos e serviços oferecidos aos seus clientes internos e \nexternos; destacar-se em '
print('count = {0}'.format(count_total_words(text)))

count_normal(text, 'joão', phrases=False)

In [16]:
# keywords = extract_keywords('/Users/hmg/Downloads/', '3M_RS2014.pdf', '/Users/hmg/Downloads/')
# keywords_frequency = collections.Counter(keywords)
# 
# keywords_frequency.most_common(10)

In [15]:
# keywords = extract_keywords('/Users/hmg/Desktop/data/PDF_relatorios_sustentabilidade/FIBR_7_2012_EN.pdf', 
#                             stop_words_language='english')
# keywords_frequency = collections.Counter(keywords)

# word_cloud_from_keywords_frequency(keywords_frequency, file_name='hey.png', plot=False)

# keywords_frequency.most_common(10)

In [None]:
# calculate_statistics(keywords)

In [46]:
text = 'test text with test words in the text test words'

testCount = count_normal(text, 'test', phrases = False)
phraseCount = count_normal(text, 'test words', phrases = True)
print('testCount = {0} and phraseCount = {1}'.format(testCount, phraseCount))

testCount = 3 and phraseCount = 2


In [None]:
# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Projeto_tese/', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN/',
#                             output_path='./reports_special/EN/')

# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Projeto_tese/', 
filter_special_keywords(filter_list_path='./', 
                            filter_list_file='lista_frases_v3_dev.txt', 
                            raw_text_path='./raw_texts/PT_dev/',
                            output_path='./reports_special/PT/',
                            output_file='output_PT_dev_fuzzy_phrases95.csv',
                            word_match_count=count_fuzzy,
                            confidence=95,
                            phrases=True
                            )

In [18]:
# Tests with fuzzy string match
# fuzz.ratio(str.lower(u'Casarão'), str.lower(u'Caserão'))
# fuzz.token_set_ratio(str.lower(u'Casarão'), str.lower(u'Casârão'))

# normal_count = count_normal("mama mia this is a mama and there is no mia in this mama", "mãma")
# fuzzy_count = count_fuzzy("mama mia this is a mama and there is no mia in this mama", "mãma", 80)

# print('normal = {0}, fuzzy = {1}'.format(normal_count, fuzzy_count))

86

In [92]:
fuzz.ratio(str.lower(u'vacao da biodiversidade'), str.lower(u''))
# fuzz.token_set_ratio(str.lower(u'EN-13'), str.lower(u'EN-'))

92